# influxdb/mutable_batch/Cargo.toml

# Crate identity. `version`, `authors`, `edition` and `license` are inherited
# from the workspace manifest via `.workspace = true`.
[package]
name = "mutable_batch"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true
description = "A mutable arrow RecordBatch"

# Runtime dependencies, sorted alphabetically. Entries with `workspace = true`
# take their version from the workspace manifest; `path` entries point at
# sibling directories of this crate.
[dependencies]
arrow = { workspace = true, features = ["prettyprint"] }
arrow_util = { path = "../arrow_util" }
chrono = { version = "0.4", default-features = false }
data_types = { path = "../data_types" }
hashbrown = { workspace = true }
iox_time = { path = "../iox_time" }
itertools = "0.11"
# NOTE(review): presumably used to percent-encode reserved characters in the
# tag values of derived partition keys — confirm against the partitioning code.
percent-encoding = "2.2.0"
schema = { path = "../schema" }
snafu = "0.7"
thiserror = "1.0.44"
unicode-segmentation = "1.10.1"
workspace-hack = { version = "0.1", path = "../workspace-hack" }

# Dependencies used only when building tests, examples and benchmarks.
[dev-dependencies]
assert_matches = "1.5.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.14"
# Default features are switched off for proptest.
proptest = { version = "1.2.0", default-features = false }
rand = "0.8"