Merge pull request #7961 from influxdata/dom/partition-key-truncate

feat: bound partition key size
pull/24376/head
Dom 2023-06-13 14:08:45 +01:00 committed by GitHub
commit 7fef809b2a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 639 additions and 80 deletions

1
Cargo.lock generated
View File

@ -3650,6 +3650,7 @@ dependencies = [
"schema", "schema",
"snafu", "snafu",
"thiserror", "thiserror",
"unicode-segmentation",
"workspace-hack", "workspace-hack",
] ]

View File

@ -47,14 +47,68 @@
//! [`build_column_values()`] function can be used to obtain the set of //! [`build_column_values()`] function can be used to obtain the set of
//! [`TemplatePart::TagValue`] the key was constructed from. //! [`TemplatePart::TagValue`] the key was constructed from.
//! //!
//! ### Value Truncation
//!
//! Partition key parts are limited to, at most, 200 bytes in length
//! ([`PARTITION_KEY_MAX_PART_LEN`]). If any single partition key part exceeds
//! this length limit, it is truncated and the truncation marker `#`
//! ([`PARTITION_KEY_PART_TRUNCATED`]) is appended.
//!
//! When rebuilding column values using [`build_column_values()`], a truncated
//! key part yields [`ColumnValue::Prefix`], which can only be used for prefix
//! matching - equality matching against a string always returns false.
//!
//! Two considerations must be made when truncating the generated key:
//!
//! * The string may contain encoded sequences in the form %XX, and the string
//! should not be split within an encoded sequence, or decoding the string
//! will fail.
//!
//! * This may be a unicode string - what the user might consider a "character"
//! may in fact be multiple unicode code-points, each of which may span
//! multiple bytes.
//!
//! Slicing a unicode code-point in half may lead to an invalid UTF-8 string,
//! which will prevent it from being used in Rust (and likely many other
//! languages/systems). Because partition keys are represented as strings and
//! not bytes, splitting a code-point in half MUST be avoided.
//!
//! Further to this, a sequence of multiple code-points can represent a single
//! "character" - this is called a grapheme. For example, the representation of
//! the Tamil "ni" character "நி" is composed of two multi-byte code-points; the
//! Tamil letter "na" which renders as "ந" and the vowel sign "ி", each 3 bytes
//! long. If split after the first 3 bytes, the compound "ni" character will be
//! incorrectly rendered as the single "na"/"ந" character.
//!
//! Depending on what the consumer of the split string considers a character,
//! prefix/equality matching may produce differing results if a grapheme is
//! split. If the caller performs a byte-wise comparison, everything is fine -
//! if they perform a "character" comparison, then the equality may be lost
//! depending on what they consider a character.
//!
//! Therefore this implementation takes the conservative approach of never
//! splitting code-points (for UTF-8 correctness) nor graphemes for simplicity
//! and compatibility for the consumer. This may be relaxed in the future to
//! allow splitting graphemes, but by being conservative we give ourselves this
//! option - we can't easily do the reverse!
//!
//! ## Part Limit & Maximum Key Size
//!
//! The number of parts in a partition template is limited to 8
//! ([`MAXIMUM_NUMBER_OF_TEMPLATE_PARTS`]), validated at creation time.
//!
//! Together with the above value truncation, this bounds the maximum length of
//! a partition key to 1,607 bytes (1.57 KiB).
//!
//! ### Reserved Characters //! ### Reserved Characters
//! //!
//! Reserved characters that are percent encoded (in addition to non-printable //! Reserved characters that are percent encoded (in addition to non-ASCII
//! characters), and their meaning: //! characters), and their meaning:
//! //!
//! * `|` - partition key part delimiter ([`PARTITION_KEY_DELIMITER`]) //! * `|` - partition key part delimiter ([`PARTITION_KEY_DELIMITER`])
//! * `!` - NULL/missing partition key part ([`PARTITION_KEY_VALUE_NULL`]) //! * `!` - NULL/missing partition key part ([`PARTITION_KEY_VALUE_NULL`])
//! * `^` - empty string partition key part ([`PARTITION_KEY_VALUE_EMPTY`]) //! * `^` - empty string partition key part ([`PARTITION_KEY_VALUE_EMPTY`])
//! * `#` - key part truncation marker ([`PARTITION_KEY_PART_TRUNCATED`])
//! * `%` - required for unambiguous reversal of percent encoding //! * `%` - required for unambiguous reversal of percent encoding
//! //!
//! These characters are defined in [`ENCODED_PARTITION_KEY_CHARS`] and chosen //! These characters are defined in [`ENCODED_PARTITION_KEY_CHARS`] and chosen
@ -74,13 +128,14 @@
//! //!
//! The following partition keys are derived: //! The following partition keys are derived:
//! //!
//! * `time=2023-01-01, a=bananas, b=plátanos` -> `2023|bananas|plátanos //! * `time=2023-01-01, a=bananas, b=plátanos` -> `2023|bananas|plátanos`
//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos` //! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos`
//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos` //! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos`
//! * `time=2023-01-01` -> `2023|!|!` //! * `time=2023-01-01` -> `2023|!|!`
//! * `time=2023-01-01, a=cat|dog, b=!` -> `2023|cat%7Cdog|%21` //! * `time=2023-01-01, a=cat|dog, b=!` -> `2023|cat%7Cdog|%21`
//! * `time=2023-01-01, a=%50` -> `2023|%2550|!` //! * `time=2023-01-01, a=%50` -> `2023|%2550|!`
//! * `time=2023-01-01, a=` -> `2023|^|!` //! * `time=2023-01-01, a=` -> `2023|^|!`
//! * `time=2023-01-01, a=<long string>` -> `2023|<long string>#|!`
//! //!
//! When using the default partitioning template (YYYY-MM-DD) there is no //! When using the default partitioning template (YYYY-MM-DD) there is no
//! encoding necessary, as the derived partition key contains a single part, and //! encoding necessary, as the derived partition key contains a single part, and
@ -144,6 +199,17 @@ pub const PARTITION_KEY_VALUE_NULL: char = '!';
/// The `str` form of the [`PARTITION_KEY_VALUE_NULL`] character. /// The `str` form of the [`PARTITION_KEY_VALUE_NULL`] character.
pub const PARTITION_KEY_VALUE_NULL_STR: &str = "!"; pub const PARTITION_KEY_VALUE_NULL_STR: &str = "!";
/// The maximum permissible length of a partition key part, after encoding
/// reserved & non-ASCII characters.
pub const PARTITION_KEY_MAX_PART_LEN: usize = 200;
/// The truncation sentinel character, used to explicitly identify a partition
/// key as having been truncated.
///
/// Truncated partition key parts can only be used for prefix matching, and
/// yield a [`ColumnValue::Prefix`] from [`build_column_values()`].
pub const PARTITION_KEY_PART_TRUNCATED: char = '#';
/// The minimal set of characters that must be encoded during partition key /// The minimal set of characters that must be encoded during partition key
/// generation when they form part of a partition key part, in order to be /// generation when they form part of a partition key part, in order to be
/// unambiguously reversible. /// unambiguously reversible.
@ -153,6 +219,7 @@ pub const ENCODED_PARTITION_KEY_CHARS: AsciiSet = CONTROLS
.add(PARTITION_KEY_DELIMITER as u8) .add(PARTITION_KEY_DELIMITER as u8)
.add(PARTITION_KEY_VALUE_NULL as u8) .add(PARTITION_KEY_VALUE_NULL as u8)
.add(PARTITION_KEY_VALUE_EMPTY as u8) .add(PARTITION_KEY_VALUE_EMPTY as u8)
.add(PARTITION_KEY_PART_TRUNCATED as u8)
.add(b'%'); // Required for reversible unambiguous encoding .add(b'%'); // Required for reversible unambiguous encoding
/// Allocationless and protobufless access to the parts of a template needed to /// Allocationless and protobufless access to the parts of a template needed to
@ -218,6 +285,12 @@ impl TablePartitionTemplateOverride {
} }
} }
/// Returns the number of parts in this template.
#[allow(clippy::len_without_is_empty)] // Senseless - there must always be >0 parts.
pub fn len(&self) -> usize {
self.parts().count()
}
/// Iterate through the protobuf parts and lend out what the `mutable_batch` crate needs to /// Iterate through the protobuf parts and lend out what the `mutable_batch` crate needs to
/// build `PartitionKey`s. If this table doesn't have a custom template, use the application /// build `PartitionKey`s. If this table doesn't have a custom template, use the application
/// default of partitioning by day. /// default of partitioning by day.
@ -366,6 +439,54 @@ mod serialization {
} }
} }
/// The value of a column, reversed from a partition key.
///
/// See [`build_column_values()`].
#[derive(Debug, Clone, PartialEq)]
pub enum ColumnValue<'a> {
/// The inner value is the exact, unmodified input column value.
Identity(Cow<'a, str>),
/// The inner value is a variable length prefix of the input column value.
///
/// The string value is always guaranteed to be valid UTF-8.
///
/// Attempting to equality match this variant against a string will always
/// be false - use [`ColumnValue::is_prefix_match_of()`] to prefix match
/// instead.
Prefix(Cow<'a, str>),
}
impl<'a> ColumnValue<'a> {
/// Returns true if `other` is a byte-wise prefix match of `self`.
///
/// This method can be called for both [`ColumnValue::Identity`] and
/// [`ColumnValue::Prefix`].
pub fn is_prefix_match_of<T>(&self, other: T) -> bool
where
T: AsRef<[u8]>,
{
let this = match self {
ColumnValue::Identity(v) => v.as_bytes(),
ColumnValue::Prefix(v) => v.as_bytes(),
};
other.as_ref().starts_with(this)
}
}
impl<'a, T> PartialEq<T> for ColumnValue<'a>
where
T: AsRef<str>,
{
fn eq(&self, other: &T) -> bool {
match self {
ColumnValue::Identity(v) => other.as_ref().eq(v.as_ref()),
ColumnValue::Prefix(_) => false,
}
}
}
/// Reverse a `partition_key` generated from the given partition key `template`, /// Reverse a `partition_key` generated from the given partition key `template`,
/// reconstructing the set of tag values in the form of `(column name, column /// reconstructing the set of tag values in the form of `(column name, column
/// value)` tuples that the `partition_key` was generated from. /// value)` tuples that the `partition_key` was generated from.
@ -381,7 +502,7 @@ mod serialization {
pub fn build_column_values<'a>( pub fn build_column_values<'a>(
template: &'a TablePartitionTemplateOverride, template: &'a TablePartitionTemplateOverride,
partition_key: &'a str, partition_key: &'a str,
) -> impl Iterator<Item = (&'a str, Cow<'a, str>)> { ) -> impl Iterator<Item = (&'a str, ColumnValue<'a>)> {
// Exploded parts of the generated key on the "/" character. // Exploded parts of the generated key on the "/" character.
// //
// Any uses of the "/" character within the partition key's user-provided // Any uses of the "/" character within the partition key's user-provided
@ -424,12 +545,30 @@ pub fn build_column_values<'a>(
}) })
// Reverse the urlencoding of all value parts // Reverse the urlencoding of all value parts
.map(|(name, value)| { .map(|(name, value)| {
( let decoded = percent_decode_str(value)
name, .decode_utf8()
percent_decode_str(value) .expect("invalid partition key part encoding");
.decode_utf8()
.expect("invalid partition key part encoding"), // Inspect the final character in the string, pre-decoding, to
) // determine if it has been truncated.
if value
.as_bytes()
.last()
.map(|v| *v == PARTITION_KEY_PART_TRUNCATED as u8)
.unwrap_or_default()
{
// Remove the truncation marker.
let len = decoded.len() - 1;
// Only allocate if needed; re-borrow a subslice of `Cow::Borrowed` if not.
let column_cow = match decoded {
Cow::Borrowed(s) => Cow::Borrowed(&s[..len]),
Cow::Owned(s) => Cow::Owned(s[..len].to_string()),
};
return (name, ColumnValue::Prefix(column_cow));
}
(name, ColumnValue::Identity(decoded))
}) })
} }
@ -467,6 +606,21 @@ mod tests {
use sqlx::Encode; use sqlx::Encode;
use test_helpers::assert_error; use test_helpers::assert_error;
#[test]
fn test_max_partition_key_len() {
let max_len: usize =
// 8 parts, at most 200 bytes long.
(MAXIMUM_NUMBER_OF_TEMPLATE_PARTS * PARTITION_KEY_MAX_PART_LEN)
// 7 delimiting characters between parts.
+ (MAXIMUM_NUMBER_OF_TEMPLATE_PARTS - 1);
// If this changes, the module documentation should be changed too.
//
// This shouldn't change without consideration of primary key overlap as
// a result.
assert_eq!(max_len, 1_607, "update module docs please");
}
#[test] #[test]
fn empty_parts_is_invalid() { fn empty_parts_is_invalid() {
let err = serialization::Wrapper::try_from(proto::PartitionTemplate { parts: vec![] }); let err = serialization::Wrapper::try_from(proto::PartitionTemplate { parts: vec![] });
@ -555,6 +709,17 @@ mod tests {
assert_error!(err, ValidationError::InvalidStrftime(ref format) if format.is_empty()); assert_error!(err, ValidationError::InvalidStrftime(ref format) if format.is_empty());
} }
fn identity(s: &str) -> ColumnValue<'_> {
ColumnValue::Identity(s.into())
}
fn prefix<'a, T>(s: T) -> ColumnValue<'a>
where
T: Into<Cow<'a, str>>,
{
ColumnValue::Prefix(s.into())
}
/// Generate a test that asserts "partition_key" is reversible, yielding /// Generate a test that asserts "partition_key" is reversible, yielding
/// "want" assuming the partition "template" was used. /// "want" assuming the partition "template" was used.
macro_rules! test_build_column_values { macro_rules! test_build_column_values {
@ -570,17 +735,13 @@ mod tests {
let template = $template.into_iter().collect::<Vec<_>>(); let template = $template.into_iter().collect::<Vec<_>>();
let template = test_table_partition_override(template); let template = test_table_partition_override(template);
// normalise the values into a (str, string) for the comparison // normalise the values into a (str, ColumnValue) for the comparison
let want = $want let want = $want
.into_iter() .into_iter()
.map(|(k, v)| {
let v: &str = v;
(k, v.to_string())
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let got = build_column_values(&template, $partition_key) let input = String::from($partition_key);
.map(|(k, v)| (k, v.to_string())) let got = build_column_values(&template, input.as_str())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert_eq!(got, want); assert_eq!(got, want);
@ -597,7 +758,7 @@ mod tests {
TemplatePart::TagValue("b"), TemplatePart::TagValue("b"),
], ],
partition_key = "2023|bananas|plátanos", partition_key = "2023|bananas|plátanos",
want = [("a", "bananas"), ("b", "plátanos")] want = [("a", identity("bananas")), ("b", identity("plátanos"))]
); );
test_build_column_values!( test_build_column_values!(
@ -608,7 +769,7 @@ mod tests {
TemplatePart::TagValue("b"), TemplatePart::TagValue("b"),
], ],
partition_key = "2023|!|plátanos", partition_key = "2023|!|plátanos",
want = [("b", "plátanos")] want = [("b", identity("plátanos"))]
); );
test_build_column_values!( test_build_column_values!(
@ -630,7 +791,7 @@ mod tests {
TemplatePart::TagValue("b"), TemplatePart::TagValue("b"),
], ],
partition_key = "2023|cat%7Cdog|%21", partition_key = "2023|cat%7Cdog|%21",
want = [("a", "cat|dog"), ("b", "!")] want = [("a", identity("cat|dog")), ("b", identity("!"))]
); );
test_build_column_values!( test_build_column_values!(
@ -641,7 +802,40 @@ mod tests {
TemplatePart::TagValue("b"), TemplatePart::TagValue("b"),
], ],
partition_key = "2023|%2550|!", partition_key = "2023|%2550|!",
want = [("a", "%50")] want = [("a", identity("%50"))]
);
test_build_column_values!(
module_doc_example_7,
template = [
TemplatePart::TimeFormat("%Y"),
TemplatePart::TagValue("a"),
TemplatePart::TagValue("b"),
],
partition_key = "2023|BANANAS#|!",
want = [("a", prefix("BANANAS"))]
);
test_build_column_values!(
unicode_code_point_prefix,
template = [
TemplatePart::TimeFormat("%Y"),
TemplatePart::TagValue("a"),
TemplatePart::TagValue("b"),
],
partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!",
want = [("a", prefix("(ノಠ益ಠ)ノ彡┻━┻"))]
);
test_build_column_values!(
unicode_grapheme,
template = [
TemplatePart::TimeFormat("%Y"),
TemplatePart::TagValue("a"),
TemplatePart::TagValue("b"),
],
partition_key = "2023|%E0%AE%A8%E0%AE%BF#|!",
want = [("a", prefix("நி"))]
); );
test_build_column_values!( test_build_column_values!(
@ -651,8 +845,8 @@ mod tests {
TemplatePart::TagValue("a"), TemplatePart::TagValue("a"),
TemplatePart::TagValue("b"), TemplatePart::TagValue("b"),
], ],
partition_key = "2023|is%7Cnot%21ambiguous%2510|!", partition_key = "2023|is%7Cnot%21ambiguous%2510%23|!",
want = [("a", "is|not!ambiguous%10")] want = [("a", identity("is|not!ambiguous%10#"))]
); );
test_build_column_values!( test_build_column_values!(
@ -671,11 +865,30 @@ mod tests {
} }
#[test] #[test]
fn test_empty_partition_key_char_str_equality() { fn test_column_value_partial_eq() {
assert_eq!( assert_eq!(identity("bananas"), "bananas");
PARTITION_KEY_VALUE_EMPTY.to_string(),
PARTITION_KEY_VALUE_EMPTY_STR assert_ne!(identity("bananas"), "bananas2");
); assert_ne!(identity("bananas2"), "bananas");
assert_ne!(prefix("bananas"), "bananas");
assert_ne!(prefix("bananas"), "bananas2");
assert_ne!(prefix("bananas2"), "bananas");
}
#[test]
fn test_column_value_is_prefix_match() {
let b = "bananas".to_string();
assert!(identity("bananas").is_prefix_match_of(b));
assert!(identity("bananas").is_prefix_match_of("bananas"));
assert!(identity("bananas").is_prefix_match_of("bananas2"));
assert!(prefix("bananas").is_prefix_match_of("bananas"));
assert!(prefix("bananas").is_prefix_match_of("bananas2"));
assert!(!identity("bananas2").is_prefix_match_of("bananas"));
assert!(!prefix("bananas2").is_prefix_match_of("bananas"));
} }
/// This test asserts the default derived partitioning scheme with no /// This test asserts the default derived partitioning scheme with no
@ -695,6 +908,14 @@ mod tests {
assert_matches!(got.as_slice(), [TemplatePart::TimeFormat("%Y-%m-%d")]); assert_matches!(got.as_slice(), [TemplatePart::TimeFormat("%Y-%m-%d")]);
} }
#[test]
fn len_of_default_template_is_1() {
let ns = NamespacePartitionTemplateOverride::default();
let t = TablePartitionTemplateOverride::try_new(None, &ns).unwrap();
assert_eq!(t.len(), 1);
}
#[test] #[test]
fn no_custom_table_template_specified_gets_namespace_template() { fn no_custom_table_template_specified_gets_namespace_template() {
let namespace_template = let namespace_template =
@ -707,6 +928,7 @@ mod tests {
let table_template = let table_template =
TablePartitionTemplateOverride::try_new(None, &namespace_template).unwrap(); TablePartitionTemplateOverride::try_new(None, &namespace_template).unwrap();
assert_eq!(table_template.len(), 1);
assert_eq!(table_template.0, namespace_template.0); assert_eq!(table_template.0, namespace_template.0);
} }
@ -730,6 +952,7 @@ mod tests {
) )
.unwrap(); .unwrap();
assert_eq!(table_template.len(), 1);
assert_eq!(table_template.0.unwrap().inner(), &custom_table_template); assert_eq!(table_template.0.unwrap().inner(), &custom_table_template);
} }
@ -779,5 +1002,6 @@ mod tests {
); );
let table_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect(); let table_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect();
assert_eq!(table_json_str, expected_json_str); assert_eq!(table_json_str, expected_json_str);
assert_eq!(table.len(), 2);
} }
} }

View File

@ -19,6 +19,7 @@ itertools = "0.10"
workspace-hack = { version = "0.1", path = "../workspace-hack" } workspace-hack = { version = "0.1", path = "../workspace-hack" }
percent-encoding = "2.2.0" percent-encoding = "2.2.0"
thiserror = "1.0.40" thiserror = "1.0.40"
unicode-segmentation = "1.10.1"
[dev-dependencies] [dev-dependencies]
assert_matches = "1.5.0" assert_matches = "1.5.0"

View File

@ -11,11 +11,13 @@ use std::{borrow::Cow, ops::Range};
use data_types::partition_template::{ use data_types::partition_template::{
TablePartitionTemplateOverride, TemplatePart, ENCODED_PARTITION_KEY_CHARS, TablePartitionTemplateOverride, TemplatePart, ENCODED_PARTITION_KEY_CHARS,
PARTITION_KEY_DELIMITER, PARTITION_KEY_VALUE_EMPTY_STR, PARTITION_KEY_VALUE_NULL_STR, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, PARTITION_KEY_DELIMITER, PARTITION_KEY_MAX_PART_LEN,
PARTITION_KEY_PART_TRUNCATED, PARTITION_KEY_VALUE_EMPTY_STR, PARTITION_KEY_VALUE_NULL_STR,
}; };
use percent_encoding::utf8_percent_encode; use percent_encoding::utf8_percent_encode;
use schema::{InfluxColumnType, TIME_COLUMN_NAME}; use schema::{InfluxColumnType, TIME_COLUMN_NAME};
use thiserror::Error; use thiserror::Error;
use unicode_segmentation::UnicodeSegmentation;
use crate::{ use crate::{
column::{Column, ColumnData}, column::{Column, ColumnData},
@ -49,6 +51,14 @@ pub fn partition_batch<'a>(
batch: &'a MutableBatch, batch: &'a MutableBatch,
template: &'a TablePartitionTemplateOverride, template: &'a TablePartitionTemplateOverride,
) -> impl Iterator<Item = (Result<String, PartitionKeyError>, Range<usize>)> + 'a { ) -> impl Iterator<Item = (Result<String, PartitionKeyError>, Range<usize>)> + 'a {
let parts = template.len();
if parts > MAXIMUM_NUMBER_OF_TEMPLATE_PARTS {
panic!(
"partition template contains {} parts, which exceeds the maximum of {} parts",
parts, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS
);
}
range_encode(partition_keys(batch, template.parts())) range_encode(partition_keys(batch, template.parts()))
} }
@ -88,13 +98,9 @@ impl<'a> Template<'a> {
// potentially different key. // potentially different key.
*last_key = Some(this_key); *last_key = Some(this_key);
out.write_str(never_empty( out.write_str(
Cow::from(utf8_percent_encode( encode_key_part(dictionary.lookup_id(this_key).unwrap()).as_ref(),
dictionary.lookup_id(this_key).unwrap(), )?
&ENCODED_PARTITION_KEY_CHARS,
))
.as_ref(),
))?
} }
_ => return Err(PartitionKeyError::TagValueNotTag(col.influx_type())), _ => return Err(PartitionKeyError::TagValueNotTag(col.influx_type())),
}, },
@ -147,13 +153,49 @@ impl<'a> Template<'a> {
} }
} }
/// Return `s` if it is non-empty, else [`PARTITION_KEY_VALUE_EMPTY_STR`]. fn encode_key_part(s: &str) -> Cow<'_, str> {
#[inline(always)] // Encode reserved characters and non-ascii characters.
fn never_empty(s: &str) -> &str { let as_str: Cow<'_, str> = utf8_percent_encode(s, &ENCODED_PARTITION_KEY_CHARS).into();
if s.is_empty() {
return PARTITION_KEY_VALUE_EMPTY_STR; match as_str.len() {
0 => Cow::Borrowed(PARTITION_KEY_VALUE_EMPTY_STR),
1..=PARTITION_KEY_MAX_PART_LEN => as_str,
_ => {
// This string exceeds the maximum byte length limit and must be
// truncated.
//
// Truncation of unicode strings can be tricky - this implementation
// avoids splitting unicode code-points nor graphemes. See the
// partition_template module docs in data_types before altering
// this.
// Preallocate the string to hold the long partition key part.
let mut buf = String::with_capacity(PARTITION_KEY_MAX_PART_LEN);
// This is a slow path, re-encoding the original input string -
// fortunately this is an uncommon path.
//
// Walk the string, encoding each grapheme (which includes spaces)
// individually, tracking the total length of the encoded string.
// Once it hits 199 bytes, stop and append a #.
let mut bytes = 0;
s.graphemes(true)
.map(|v| Cow::from(utf8_percent_encode(v, &ENCODED_PARTITION_KEY_CHARS)))
.take_while(|v| {
bytes += v.len(); // Byte length of encoded grapheme
bytes < PARTITION_KEY_MAX_PART_LEN
})
.for_each(|v| buf.push_str(v.as_ref()));
// Append the truncation marker.
buf.push(PARTITION_KEY_PART_TRUNCATED);
assert!(buf.len() <= PARTITION_KEY_MAX_PART_LEN);
Cow::Owned(buf)
}
} }
s
} }
/// Returns an iterator of partition keys for the given table batch. /// Returns an iterator of partition keys for the given table batch.
@ -293,7 +335,9 @@ mod tests {
use crate::writer::Writer; use crate::writer::Writer;
use assert_matches::assert_matches; use assert_matches::assert_matches;
use data_types::partition_template::{build_column_values, test_table_partition_override}; use data_types::partition_template::{
build_column_values, test_table_partition_override, ColumnValue,
};
use proptest::{prelude::*, prop_compose, proptest, strategy::Strategy}; use proptest::{prelude::*, prop_compose, proptest, strategy::Strategy};
use rand::prelude::*; use rand::prelude::*;
@ -529,6 +573,20 @@ mod tests {
assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_))); assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_)));
} }
fn identity<'a, T>(s: T) -> ColumnValue<'a>
where
T: Into<Cow<'a, str>>,
{
ColumnValue::Identity(s.into())
}
fn prefix<'a, T>(s: T) -> ColumnValue<'a>
where
T: Into<Cow<'a, str>>,
{
ColumnValue::Prefix(s.into())
}
// Generate a test that asserts the derived partition key matches // Generate a test that asserts the derived partition key matches
// "want_key", when using the provided "template" parts and set of "tags". // "want_key", when using the provided "template" parts and set of "tags".
// //
@ -557,8 +615,9 @@ mod tests {
.unwrap(); .unwrap();
for (col, value) in $tags { for (col, value) in $tags {
let v = String::from(value);
writer writer
.write_tag(col, Some(&[0b00000001]), vec![value].into_iter()) .write_tag(col, Some(&[0b00000001]), vec![v.as_str()].into_iter())
.unwrap(); .unwrap();
} }
@ -569,24 +628,19 @@ mod tests {
// normalise the values. // normalise the values.
let keys = generate_denormalised_keys(&batch, template.parts()) let keys = generate_denormalised_keys(&batch, template.parts())
.unwrap(); .unwrap();
assert_eq!(keys, vec![$want_key.to_string()]); assert_eq!(keys, vec![$want_key.to_string()], "generated key differs");
// Reverse the encoding. // Reverse the encoding.
let reversed = build_column_values(&template, &keys[0]); let reversed = build_column_values(&template, &keys[0]);
// normalise the tags into a (str, string) for the comparison // Expect the tags to be (str, ColumnValue) for the
let want = $want_reversed_tags // comparison
let want: Vec<(&str, ColumnValue<'_>)> = $want_reversed_tags
.into_iter() .into_iter()
.map(|(k, v)| { .collect();
let v: &str = v;
(k, v.to_string())
})
.collect::<Vec<_>>();
let got = reversed let got = reversed.collect::<Vec<_>>();
.map(|(k, v)| (k, v.to_string())) assert_eq!(got, want, "reversed key differs");
.collect::<Vec<_>>();
assert_eq!(got, want);
} }
} }
}; };
@ -601,7 +655,7 @@ mod tests {
], ],
tags = [("a", "bananas"), ("b", "are_good")], tags = [("a", "bananas"), ("b", "are_good")],
want_key = "2023|bananas|are_good", want_key = "2023|bananas|are_good",
want_reversed_tags = [("a", "bananas"), ("b", "are_good")] want_reversed_tags = [("a", identity("bananas")), ("b", identity("are_good"))]
); );
test_partition_key!( test_partition_key!(
@ -613,7 +667,7 @@ mod tests {
], ],
tags = [("a", "bananas"), ("b", "plátanos")], tags = [("a", "bananas"), ("b", "plátanos")],
want_key = "2023|bananas|pl%C3%A1tanos", want_key = "2023|bananas|pl%C3%A1tanos",
want_reversed_tags = [("a", "bananas"), ("b", "plátanos")] want_reversed_tags = [("a", identity("bananas")), ("b", identity("plátanos"))]
); );
test_partition_key!( test_partition_key!(
@ -629,7 +683,7 @@ mod tests {
template = [TemplatePart::TagValue("a")], template = [TemplatePart::TagValue("a")],
tags = [("a", "")], tags = [("a", "")],
want_key = "^", want_key = "^",
want_reversed_tags = [("a", "")] want_reversed_tags = [("a", identity(""))]
); );
test_partition_key!( test_partition_key!(
@ -637,7 +691,7 @@ mod tests {
template = [TemplatePart::TagValue("a"), TemplatePart::TagValue("b")], template = [TemplatePart::TagValue("a"), TemplatePart::TagValue("b")],
tags = [("a", "bananas")], tags = [("a", "bananas")],
want_key = "bananas|!", want_key = "bananas|!",
want_reversed_tags = [("a", "bananas")] want_reversed_tags = [("a", identity("bananas"))]
); );
test_partition_key!( test_partition_key!(
@ -652,7 +706,256 @@ mod tests {
], ],
tags = [("a", "|"), ("b", "!"), ("d", "%7C%21%257C"), ("e", "^")], tags = [("a", "|"), ("b", "!"), ("d", "%7C%21%257C"), ("e", "^")],
want_key = "2023|%7C|%21|!|%257C%2521%25257C|%5E", want_key = "2023|%7C|%21|!|%257C%2521%25257C|%5E",
want_reversed_tags = [("a", "|"), ("b", "!"), ("d", "%7C%21%257C"), ("e", "^")] want_reversed_tags = [
("a", identity("|")),
("b", identity("!")),
("d", identity("%7C%21%257C")),
("e", identity("^"))
]
);
test_partition_key!(
truncated_char_reserved,
template = [TemplatePart::TagValue("a")],
tags = [("a", "#")],
want_key = "%23",
want_reversed_tags = [("a", identity("#"))]
);
// Keys < 200 bytes long should not be truncated.
test_partition_key!(
truncate_length_199,
template = [TemplatePart::TagValue("a")],
tags = [("a", "A".repeat(199))],
want_key = "A".repeat(199),
want_reversed_tags = [("a", identity("A".repeat(199)))]
);
// Keys of exactly 200 bytes long should not be truncated.
test_partition_key!(
truncate_length_200,
template = [TemplatePart::TagValue("a")],
tags = [("a", "A".repeat(200))],
want_key = "A".repeat(200),
want_reversed_tags = [("a", identity("A".repeat(200)))]
);
// Keys > 200 bytes long should be truncated to exactly 200 bytes,
// terminated by a # character.
test_partition_key!(
truncate_length_201,
template = [TemplatePart::TagValue("a")],
tags = [("a", "A".repeat(201))],
want_key = format!("{}#", "A".repeat(199)),
want_reversed_tags = [("a", prefix("A".repeat(199)))]
);
// A key ending in an encoded sequence that does not cross the cut-off point
// is preserved.
//
// This subtest generates a key of:
//
// `A..<repeats>%`
// ^ cutoff
//
// Which when encoded, becomes:
//
// `A..<repeats>%25`
// ^ cutoff
//
// So the entire encoded sequence should be preserved.
test_partition_key!(
truncate_encoding_sequence_ok,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}%", "A".repeat(197)))],
want_key = format!("{}%25", "A".repeat(197)), // Not truncated
want_reversed_tags = [("a", identity(format!("{}%", "A".repeat(197))))]
);
// A key ending in an encoded sequence should not be split.
//
// This subtest generates a key of:
//
// `A..<repeats>%`
// ^ cutoff
//
// Which when encoded, becomes:
//
// `A..<repeats>% 25` (space added for clarity)
// ^ cutoff
//
// Where naive slicing would result in truncating an encoding sequence and
// therefore the whole encoded sequence should be truncated.
test_partition_key!(
truncate_encoding_sequence_truncated_1,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}%", "A".repeat(198)))],
want_key = format!("{}#", "A".repeat(198)), // Truncated
want_reversed_tags = [("a", prefix("A".repeat(198)))]
);
// A key ending in an encoded sequence should not be split.
//
// This subtest generates a key of:
//
// `A..<repeats>%`
// ^ cutoff
//
// Which when encoded, becomes:
//
// `A..<repeats>%2 5` (space added for clarity)
// ^ cutoff
//
// Where naive slicing would result in truncating an encoding sequence and
// therefore the whole encoded sequence should be truncated.
test_partition_key!(
truncate_encoding_sequence_truncated_2,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}%", "A".repeat(199)))],
want_key = format!("{}#", "A".repeat(199)), // Truncated
want_reversed_tags = [("a", prefix("A".repeat(199)))]
);
// A key ending in a unicode code-point should never be split.
//
// This subtest generates a key of:
//
// `A..<repeats>🍌`
// ^ cutoff
//
// Which when encoded, becomes:
//
// `A..<repeats>%F0%9F%8D%8C`
// ^ cutoff
//
// Therefore the entire code-point should be removed from the truncated
// output.
//
// This test MUST NOT fail, or an invalid UTF-8 string is being generated
// which is unusable in languages (like Rust).
//
// Advances the cut-off to ensure the position within the code-point doesn't
// affect the output.
test_partition_key!(
truncate_within_code_point_1,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}🍌", "A".repeat(194)))],
want_key = format!("{}#", "A".repeat(194)),
want_reversed_tags = [("a", prefix("A".repeat(194)))]
);
test_partition_key!(
truncate_within_code_point_2,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}🍌", "A".repeat(195)))],
want_key = format!("{}#", "A".repeat(195)),
want_reversed_tags = [("a", prefix("A".repeat(195)))]
);
test_partition_key!(
truncate_within_code_point_3,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}🍌", "A".repeat(196)))],
want_key = format!("{}#", "A".repeat(196)),
want_reversed_tags = [("a", prefix("A".repeat(196)))]
);
// A key ending in a unicode grapheme should never be split.
//
// This subtest generates a key of:
//
// `A..<repeats>நிbananas`
// ^ cutoff
//
// Which when encoded, becomes:
//
// `A..<repeats>நிbananas` (within a grapheme)
// ^ cutoff
//
// Therefore the entire grapheme (நி) should be removed from the truncated
// output.
//
// This is a conservative implementation, and may be relaxed in the future.
//
// This first test asserts that a grapheme can be included, and then
// subsequent tests increment the cut-off point by 1 byte each time.
test_partition_key!(
truncate_within_grapheme_0,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(181)))],
want_key = format!("{}%E0%AE%A8%E0%AE%BF#", "A".repeat(181)),
want_reversed_tags = [("a", prefix(format!("{}நி", "A".repeat(181))))]
);
test_partition_key!(
truncate_within_grapheme_1,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(182)))],
want_key = format!("{}#", "A".repeat(182)),
want_reversed_tags = [("a", prefix("A".repeat(182)))]
);
test_partition_key!(
truncate_within_grapheme_2,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(183)))],
want_key = format!("{}#", "A".repeat(183)),
want_reversed_tags = [("a", prefix("A".repeat(183)))]
);
test_partition_key!(
truncate_within_grapheme_3,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(184)))],
want_key = format!("{}#", "A".repeat(184)),
want_reversed_tags = [("a", prefix("A".repeat(184)))]
);
test_partition_key!(
truncate_within_grapheme_4,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(185)))],
want_key = format!("{}#", "A".repeat(185)),
want_reversed_tags = [("a", prefix("A".repeat(185)))]
);
test_partition_key!(
truncate_within_grapheme_5,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(186)))],
want_key = format!("{}#", "A".repeat(186)),
want_reversed_tags = [("a", prefix("A".repeat(186)))]
);
test_partition_key!(
truncate_within_grapheme_6,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(187)))],
want_key = format!("{}#", "A".repeat(187)),
want_reversed_tags = [("a", prefix("A".repeat(187)))]
);
test_partition_key!(
truncate_within_grapheme_7,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(188)))],
want_key = format!("{}#", "A".repeat(188)),
want_reversed_tags = [("a", prefix("A".repeat(188)))]
);
test_partition_key!(
truncate_within_grapheme_8,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(189)))],
want_key = format!("{}#", "A".repeat(189)),
want_reversed_tags = [("a", prefix("A".repeat(189)))]
);
test_partition_key!(
truncate_within_grapheme_9,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நிbananas", "A".repeat(190)))],
want_key = format!("{}#", "A".repeat(190)),
want_reversed_tags = [("a", prefix("A".repeat(190)))]
);
// As above, but the grapheme is the last portion of the generated string
// (no trailing bananas).
test_partition_key!(
truncate_grapheme_identity,
template = [TemplatePart::TagValue("a")],
tags = [("a", format!("{}நி", "A".repeat(182)))],
want_key = format!("{}%E0%AE%A8%E0%AE%BF", "A".repeat(182)),
want_reversed_tags = [("a", identity(format!("{}நி", "A".repeat(182))))]
); );
/// A test using an invalid strftime format string. /// A test using an invalid strftime format string.
@ -679,6 +982,20 @@ mod tests {
assert_matches!(ret, Err(PartitionKeyError::InvalidStrftime)); assert_matches!(ret, Err(PartitionKeyError::InvalidStrftime));
} }
#[test]
#[should_panic(
expected = "partition template contains 9 parts, which exceeds the maximum of 8 parts"
)]
fn test_too_many_parts() {
let template = test_table_partition_override(
std::iter::repeat(TemplatePart::TagValue("bananas"))
.take(9)
.collect(),
);
let _ = partition_batch(&MutableBatch::new(), &template);
}
// These values are arbitrarily chosen when building an input to the // These values are arbitrarily chosen when building an input to the
// partitioner. // partitioner.
@ -704,11 +1021,11 @@ mod tests {
]; ];
prop_compose! { prop_compose! {
/// Yields a vector of up to 12 unique template parts, chosen from /// Yields a vector of up to [`MAXIMUM_NUMBER_OF_TEMPLATE_PARTS`] unique
/// [`TEST_TEMPLATE_PARTS`]. /// template parts, chosen from [`TEST_TEMPLATE_PARTS`].
fn arbitrary_template_parts()(set in proptest::collection::vec( fn arbitrary_template_parts()(set in proptest::collection::vec(
proptest::sample::select(TEST_TEMPLATE_PARTS), proptest::sample::select(TEST_TEMPLATE_PARTS),
(1, 12) // Set size range (1, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS) // Set size range
)) -> Vec<TemplatePart<'static>> { )) -> Vec<TemplatePart<'static>> {
let mut set = set; let mut set = set;
set.dedup_by(|a, b| format!("{a:?}") == format!("{b:?}")); set.dedup_by(|a, b| format!("{a:?}") == format!("{b:?}"));
@ -762,20 +1079,44 @@ mod tests {
assert_eq!(keys.len(), 1); assert_eq!(keys.len(), 1);
// Reverse the encoding. // Reverse the encoding.
let reversed = build_column_values(&template, &keys[0]).map(|(k, v)| (k, v.to_string())).collect::<Vec<_>>(); let reversed: Vec<(&str, ColumnValue<'_>)> = build_column_values(&template, &keys[0]).collect();
// Build the expected set of reversed tags by filtering out any // Build the expected set of reversed tags by filtering out any
// NULL tags (preserving empty string values). // NULL tags (preserving empty string values).
let want_reversed = template.parts().filter_map(|v| match v { let want_reversed: Vec<(&str, String)> = template.parts().filter_map(|v| match v {
TemplatePart::TagValue(col_name) if tag_values.contains_key(col_name) => { TemplatePart::TagValue(col_name) if tag_values.contains_key(col_name) => {
// This tag had a (potentially empty) value wrote and should // This tag had a (potentially empty) value wrote and should
// appear in the reversed output. // appear in the reversed output.
Some((col_name, tag_values.get(col_name).unwrap().to_string())) Some((col_name, tag_values.get(col_name).unwrap().to_string()))
} }
_ => None, _ => None,
}).collect::<Vec<_>>(); }).collect();
assert_eq!(reversed, want_reversed); assert_eq!(want_reversed.len(), reversed.len());
for (want, got) in want_reversed.iter().zip(reversed.iter()) {
assert_eq!(got.0, want.0, "column names differ");
match got.1 {
ColumnValue::Identity(_) => {
// An identity is both equal to, and a prefix of, the
// original value.
assert_eq!(got.1, want.1, "identity values differ");
assert!(
got.1.is_prefix_match_of(&want.1),
"prefix mismatch; {:?} is not a prefix of {:?}",
got.1,
want.1
);
},
ColumnValue::Prefix(_) => assert!(
got.1.is_prefix_match_of(&want.1),
"prefix mismatch; {:?} is not a prefix of {:?}",
got.1,
want.1
),
};
}
} }
/// A property test that asserts the partitioner tolerates (does not /// A property test that asserts the partitioner tolerates (does not

View File

@ -1,12 +1,10 @@
use std::{borrow::Cow, fmt::Write}; use std::fmt::Write;
use chrono::{format::StrftimeItems, TimeZone, Utc}; use chrono::{format::StrftimeItems, TimeZone, Utc};
use data_types::partition_template::ENCODED_PARTITION_KEY_CHARS;
use percent_encoding::utf8_percent_encode;
use crate::PartitionKeyError; use crate::PartitionKeyError;
use super::never_empty; use super::encode_key_part;
/// The number of nanoseconds in 1 day, definitely recited from memory. /// The number of nanoseconds in 1 day, definitely recited from memory.
const DAY_NANOSECONDS: i64 = 86_400_000_000_000; const DAY_NANOSECONDS: i64 = 86_400_000_000_000;
@ -210,10 +208,7 @@ impl<'a> StrftimeFormatter<'a> {
.map_err(|_| PartitionKeyError::InvalidStrftime)?; .map_err(|_| PartitionKeyError::InvalidStrftime)?;
// Encode any reserved characters in this new string. // Encode any reserved characters in this new string.
buf.1 = never_empty( buf.1 = encode_key_part(&buf.1).to_string();
Cow::from(utf8_percent_encode(&buf.1, &ENCODED_PARTITION_KEY_CHARS)).as_ref(),
)
.to_string();
// Render this new value to the caller's buffer // Render this new value to the caller's buffer
out.write_str(&buf.1)?; out.write_str(&buf.1)?;
@ -344,7 +339,7 @@ mod tests {
/// formatter, therefore this test asserts the following property: /// formatter, therefore this test asserts the following property:
/// ///
/// For any timestamp and formatter, the output of this type must /// For any timestamp and formatter, the output of this type must
/// match the output of chrono's formatter, after URL encoding. /// match the output of chrono's formatter, after key encoding.
/// ///
/// Validating this asserts correctness of the wrapper itself, assuming /// Validating this asserts correctness of the wrapper itself, assuming
/// chrono's formatter produces correct output. Note the encoding is /// chrono's formatter produces correct output. Note the encoding is
@ -366,10 +361,7 @@ mod tests {
Utc.timestamp_nanos(ts) Utc.timestamp_nanos(ts)
.format_with_items(items.clone()) .format_with_items(items.clone())
); );
let control = never_empty( let control = encode_key_part(&control);
Cow::from(utf8_percent_encode(&control, &ENCODED_PARTITION_KEY_CHARS)).as_ref(),
)
.to_string();
// Generate the test string. // Generate the test string.
let mut test = String::new(); let mut test = String::new();