refactor: Extract PathPart to its own module

And be more explicit about the privacy of the inner String
pull/24376/head
Carol (Nichols || Goulding) 2021-01-14 10:41:22 -05:00
parent 004fb8f803
commit 979458034a
5 changed files with 131 additions and 112 deletions

View File

@ -1,7 +1,6 @@
//! This module contains code for abstracting object locations that work
//! across different backing implementations and platforms.
use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS};
use std::{mem, path::PathBuf};
/// Paths that came from or are to be used in cloud-based object storage
@ -14,6 +13,9 @@ pub mod file;
pub mod parsed;
use parsed::DirsAndFileName;
mod parts;
use parts::PathPart;
/// Universal interface for handling paths and locations for objects and
/// directories in the object store.
///
@ -214,103 +216,11 @@ impl PartialEq for PathRepresentation {
/// The delimiter to separate object namespaces, creating a directory structure.
pub const DELIMITER: &str = "/";
// percent_encode's API needs this as a byte
const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0];
/// The PathPart type exists to validate the directory/file names that form part
/// of a path.
///
/// A PathPart instance is guaranteed to contain no `/` characters as it can
/// only be constructed by going through the `try_from` impl.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default)]
pub struct PathPart(String);
/// Characters we want to encode.
const INVALID: &AsciiSet = &CONTROLS
// The delimiter we are reserving for internal hierarchy
.add(DELIMITER_BYTE)
// Characters AWS recommends avoiding for object keys
// https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html
.add(b'\\')
.add(b'{')
// TODO: Non-printable ASCII characters (128255 decimal characters)
.add(b'^')
.add(b'}')
.add(b'%')
.add(b'`')
.add(b']')
.add(b'"') // " <-- my editor is confused about double quotes within single quotes
.add(b'>')
.add(b'[')
.add(b'~')
.add(b'<')
.add(b'#')
.add(b'|')
// Characters Google Cloud Storage recommends avoiding for object names
// https://cloud.google.com/storage/docs/naming-objects
.add(b'\r')
.add(b'\n')
.add(b'*')
.add(b'?');
impl From<&str> for PathPart {
fn from(v: &str) -> Self {
match v {
// We don't want to encode `.` generally, but we do want to disallow parts of paths
// to be equal to `.` or `..` to prevent file system traversal shenanigans.
"." => Self(String::from("%2E")),
".." => Self(String::from("%2E%2E")),
other => Self(percent_encode(other.as_bytes(), INVALID).to_string()),
}
}
}
impl std::fmt::Display for PathPart {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
percent_decode_str(&self.0)
.decode_utf8()
.expect("Valid UTF-8 that came from String")
.fmt(f)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn path_part_delimiter_gets_encoded() {
let part: PathPart = "foo/bar".into();
assert_eq!(part, PathPart(String::from("foo%2Fbar")));
}
#[test]
fn path_part_gets_decoded_for_display() {
let part: PathPart = "foo/bar".into();
assert_eq!(part.to_string(), "foo/bar");
}
#[test]
fn path_part_given_already_encoded_string() {
let part: PathPart = "foo%2Fbar".into();
assert_eq!(part, PathPart(String::from("foo%252Fbar")));
assert_eq!(part.to_string(), "foo%2Fbar");
}
#[test]
fn path_part_cant_be_one_dot() {
let part: PathPart = ".".into();
assert_eq!(part, PathPart(String::from("%2E")));
assert_eq!(part.to_string(), ".");
}
#[test]
fn path_part_cant_be_two_dots() {
let part: PathPart = "..".into();
assert_eq!(part, PathPart(String::from("%2E%2E")));
assert_eq!(part.to_string(), "..");
}
// Invariants to maintain/document/test:
//
// - always ends in DELIMITER if it's a directory. If it's the end object, it
@ -507,10 +417,10 @@ mod tests {
let path_buf_parts: DirsAndFileName = path_buf.into();
assert!(cloud_parts.directories.is_empty());
assert_eq!(cloud_parts.file_name.unwrap().0, "blah.json");
assert_eq!(cloud_parts.file_name.unwrap().encoded(), "blah.json");
assert!(path_buf_parts.directories.is_empty());
assert_eq!(path_buf_parts.file_name.unwrap().0, "blah.json");
assert_eq!(path_buf_parts.file_name.unwrap().encoded(), "blah.json");
// empty
let cloud = PathRepresentation::RawCloud("".into());

View File

@ -1,4 +1,4 @@
use super::{ObjectStorePath, PathRepresentation, DELIMITER};
use super::{ObjectStorePath, PathPart, PathRepresentation, DELIMITER};
use itertools::Itertools;
@ -20,14 +20,14 @@ impl CloudConverter {
let mut path = dirs_and_file_name
.directories
.iter()
.map(|p| &p.0)
.map(PathPart::encoded)
.join(DELIMITER);
if !path.is_empty() {
path.push_str(DELIMITER);
}
if let Some(file_name) = &dirs_and_file_name.file_name {
path.push_str(&file_name.0);
path.push_str(file_name.encoded());
}
path
}

View File

@ -1,4 +1,4 @@
use super::{ObjectStorePath, PathRepresentation};
use super::{ObjectStorePath, PathPart, PathRepresentation};
use std::path::PathBuf;
@ -21,10 +21,10 @@ impl FileConverter {
let mut path: PathBuf = dirs_and_file_name
.directories
.iter()
.map(|p| &p.0)
.map(PathPart::encoded)
.collect();
if let Some(file_name) = &dirs_and_file_name.file_name {
path.push(&file_name.0);
path.push(file_name.encoded());
}
path
}

View File

@ -17,7 +17,9 @@ impl DirsAndFileName {
use itertools::Diff;
match diff {
None => match (self.file_name.as_ref(), prefix.file_name.as_ref()) {
(Some(self_file), Some(prefix_file)) => self_file.0.starts_with(&prefix_file.0),
(Some(self_file), Some(prefix_file)) => {
self_file.encoded().starts_with(prefix_file.encoded())
}
(Some(_self_file), None) => true,
(None, Some(_prefix_file)) => false,
(None, None) => true,
@ -27,7 +29,7 @@ impl DirsAndFileName {
.next()
.expect("must have at least one mismatch to be in this case");
match prefix.file_name.as_ref() {
Some(prefix_file) => next_dir.0.starts_with(&prefix_file.0),
Some(prefix_file) => next_dir.encoded().starts_with(prefix_file.encoded()),
None => true,
}
}
@ -42,8 +44,8 @@ impl DirsAndFileName {
&& remaining_self
.next()
.expect("must be at least one value")
.0
.starts_with(&first_prefix.0)
.encoded()
.starts_with(first_prefix.encoded())
}
_ => false,
}
@ -103,12 +105,12 @@ impl From<PathRepresentation> for DirsAndFileName {
fn from(path_rep: PathRepresentation) -> Self {
match path_rep {
PathRepresentation::RawCloud(path) => {
let mut parts: Vec<_> = path
let mut parts: Vec<PathPart> = path
.split_terminator(DELIMITER)
.map(|s| PathPart(s.to_string()))
.collect();
let maybe_file_name = match parts.pop() {
Some(file) if file.0.contains('.') => Some(file),
Some(file) if file.encoded().contains('.') => Some(file),
Some(dir) => {
parts.push(dir);
None
@ -121,17 +123,17 @@ impl From<PathRepresentation> for DirsAndFileName {
}
}
PathRepresentation::RawPathBuf(path) => {
let mut parts: Vec<_> = path
let mut parts: Vec<PathPart> = path
.iter()
.flat_map(|s| s.to_os_string().into_string().map(PathPart))
.collect();
let maybe_file_name = match parts.pop() {
Some(file)
if !file.0.starts_with('.')
&& (file.0.ends_with(".json")
|| file.0.ends_with(".parquet")
|| file.0.ends_with(".segment")) =>
if !file.encoded().starts_with('.')
&& (file.encoded().ends_with(".json")
|| file.encoded().ends_with(".parquet")
|| file.encoded().ends_with(".segment")) =>
{
Some(file)
}

View File

@ -0,0 +1,107 @@
use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS};
use super::DELIMITER;
// percent_encode's API needs this as a byte
const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0];
/// The PathPart type exists to validate the directory/file names that form part
/// of a path.
///
/// A PathPart instance is guaranteed to contain no `/` characters as it can
/// only be constructed by going through the `try_from` impl.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default)]
pub struct PathPart(pub(super) String);
/// Characters we want to encode.
const INVALID: &AsciiSet = &CONTROLS
// The delimiter we are reserving for internal hierarchy
.add(DELIMITER_BYTE)
// Characters AWS recommends avoiding for object keys
// https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html
.add(b'\\')
.add(b'{')
// TODO: Non-printable ASCII characters (128255 decimal characters)
.add(b'^')
.add(b'}')
.add(b'%')
.add(b'`')
.add(b']')
.add(b'"') // " <-- my editor is confused about double quotes within single quotes
.add(b'>')
.add(b'[')
.add(b'~')
.add(b'<')
.add(b'#')
.add(b'|')
// Characters Google Cloud Storage recommends avoiding for object names
// https://cloud.google.com/storage/docs/naming-objects
.add(b'\r')
.add(b'\n')
.add(b'*')
.add(b'?');
impl From<&str> for PathPart {
fn from(v: &str) -> Self {
match v {
// We don't want to encode `.` generally, but we do want to disallow parts of paths
// to be equal to `.` or `..` to prevent file system traversal shenanigans.
"." => Self(String::from("%2E")),
".." => Self(String::from("%2E%2E")),
other => Self(percent_encode(other.as_bytes(), INVALID).to_string()),
}
}
}
impl std::fmt::Display for PathPart {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
percent_decode_str(&self.0)
.decode_utf8()
.expect("Valid UTF-8 that came from String")
.fmt(f)
}
}
impl PathPart {
pub fn encoded(&self) -> &str {
&self.0
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn path_part_delimiter_gets_encoded() {
let part: PathPart = "foo/bar".into();
assert_eq!(part, PathPart(String::from("foo%2Fbar")));
}
#[test]
fn path_part_gets_decoded_for_display() {
let part: PathPart = "foo/bar".into();
assert_eq!(part.to_string(), "foo/bar");
}
#[test]
fn path_part_given_already_encoded_string() {
let part: PathPart = "foo%2Fbar".into();
assert_eq!(part, PathPart(String::from("foo%252Fbar")));
assert_eq!(part.to_string(), "foo%2Fbar");
}
#[test]
fn path_part_cant_be_one_dot() {
let part: PathPart = ".".into();
assert_eq!(part, PathPart(String::from("%2E")));
assert_eq!(part.to_string(), ".");
}
#[test]
fn path_part_cant_be_two_dots() {
let part: PathPart = "..".into();
assert_eq!(part, PathPart(String::from("%2E%2E")));
assert_eq!(part.to_string(), "..");
}
}