diff --git a/read_buffer/src/column/string.rs b/read_buffer/src/column/string.rs index 7f34f327cd..dfa13eaaae 100644 --- a/read_buffer/src/column/string.rs +++ b/read_buffer/src/column/string.rs @@ -1,11 +1,11 @@ -use std::collections::BTreeSet; +use std::collections::{BTreeMap, BTreeSet}; use arrow::{self, array::Array}; use either::Either; use super::cmp; use super::encoding::string::{dictionary, rle}; -use super::encoding::string::{Dictionary, Encoding, RLE}; +use super::encoding::string::{Dictionary, Encoding, NULL_ID, RLE}; use crate::column::{RowIDs, Statistics, Value, Values}; // Edd's totally made up magic constant. This determines whether we would use @@ -152,7 +152,7 @@ impl StringEncoding { } } - /// All values present at the provided logical row ids. + /// All values present at the provided logical row IDs. /// /// TODO(edd): perf - pooling of destination vectors. pub fn values(&self, row_ids: &[u32]) -> Values<'_> { @@ -162,6 +162,67 @@ impl StringEncoding { } } + /// Returns all values present at the provided logical row IDs as a + /// dictionary encoded `Values` format. + pub fn values_as_dictionary(&self, row_ids: &[u32]) -> Values<'_> { + // + // Example: + // + // Suppose you have column encoded like this: + // + // values: NULL, "alpha", "beta", "gamma" + // encoded: 1, 1, 2, 0, 3 (alpha, alpha, beta, NULL, gamma) + // + // And only the rows: {0, 1, 3, 4} are required. + // + // The column encoding will return the following encoded values + // + // encoded: 1, 1, 0, 3 (alpha, alpha, NULL, gamma) + // + // Because the dictionary has likely changed, the encoded values need + // to be transformed into a new domain so that they are: + // + // keys: [1, 1, 0, 2] + // values: [None, Some("alpha"), Some("gamma")] + let mut keys = self.encoded_values(row_ids, vec![]); + + // build a mapping from encoded value to new ordinal position. + let mut ordinal_mapping = BTreeMap::new(); + for key in &keys { + // no hashbrown entry API for ordered set. `contains_key` is most + // performant way to build this mapping. + if ordinal_mapping.contains_key(key) { + continue; + } + ordinal_mapping.insert(*key, 0); + } + + // create new ordinal offsets - the encoded values for the + // dictionary will be correctly ordered, but they need to be shifted + // into a new domain [0, keys.len()). + for (i, offset) in ordinal_mapping.values_mut().enumerate() { + *offset = i as u32; + } + + // Rewrite all the encoded values into the new domain. + for id in keys.iter_mut() { + *id = *ordinal_mapping.get(id).unwrap(); + } + + let values = match &self { + Self::RleDictionary(c) => ordinal_mapping + .keys() + .map(|id| c.decode_id(*id)) + .collect::>(), + Self::Dictionary(c) => ordinal_mapping + .keys() + .map(|id| c.decode_id(*id)) + .collect::>(), + }; + + Values::Dictionary(keys, values) + } + /// All values in the column. /// /// TODO(edd): perf - pooling of destination vectors. @@ -172,6 +233,48 @@ impl StringEncoding { } } + /// Returns all values as a dictionary encoded `Values` format. + pub fn all_values_as_dictionary(&self) -> Values<'_> { + let mut keys = self.all_encoded_values(vec![]); + + let values = if self.contains_null() { + // The column's ordered set of values including None because that is a + // reserved encoded key (`0`). + let mut values = vec![None]; + match &self { + Self::RleDictionary(c) => { + values.extend(c.dictionary().into_iter().map(|s| Some(s.as_str()))); + } + Self::Dictionary(c) => { + values.extend(c.dictionary().into_iter().map(|s| Some(s.as_str()))); + } + }; + values + } else { + // since column doesn't contain null we need to shift all the encoded + // values down + assert_eq!(NULL_ID, 0); + for key in keys.iter_mut() { + *key -= 1; + } + + match &self { + Self::RleDictionary(c) => c + .dictionary() + .into_iter() + .map(|s| Some(s.as_str())) + .collect::>(), + Self::Dictionary(c) => c + .dictionary() + .into_iter() + .map(|s| Some(s.as_str())) + .collect::>(), + } + }; + + Values::Dictionary(keys, values) + } + /// Returns the logical value for the specified encoded representation. pub fn decode_id(&self, encoded_id: u32) -> Value<'_> { match &self { @@ -487,3 +590,154 @@ impl From<&[&str]> for StringEncoding { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + // tests both `values_as_dictionary` and `all_values_as_dictionary` + fn values_as_dictionary() { + let set = vec!["apple", "beta", "orange", "pear"]; + let data = vec![ + Some("apple"), + Some("apple"), + Some("pear"), + None, + None, + Some("orange"), + Some("beta"), + ]; + + let mut rle = RLE::with_dictionary( + set.iter() + .cloned() + .map(String::from) + .collect::>(), + ); + for v in data.iter().map(|x| x.map(String::from)) { + rle.push_additional(v, 1); + } + + let mut dict = Dictionary::with_dictionary( + set.into_iter() + .map(String::from) + .collect::>(), + ); + for v in data.iter().map(|x| x.map(String::from)) { + dict.push_additional(v, 1); + } + + let encodings = vec![ + StringEncoding::RleDictionary(rle), + StringEncoding::Dictionary(dict), + ]; + + for enc in encodings { + _values_as_dictionary(&enc); + _all_values_as_dictionary(&enc); + } + + // example without NULL values + let data = vec![ + Some("apple"), + Some("apple"), + Some("beta"), + Some("orange"), + Some("pear"), + ]; + + let encodings = vec![ + StringEncoding::RleDictionary(RLE::from(data.clone())), + StringEncoding::Dictionary(Dictionary::from(data)), + ]; + + for enc in encodings { + let exp_keys = vec![0, 0, 1, 2, 3]; + let exp_values = vec![Some("apple"), Some("beta"), Some("orange"), Some("pear")]; + + let values = enc.all_values_as_dictionary(); + if let Values::Dictionary(got_keys, got_values) = values { + assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc); + assert_eq!( + got_values, exp_values, + "values comparison for {} failed", + enc + ); + } else { + panic!("invalid Values format returned, got {:?}", values); + } + } + } + + fn _values_as_dictionary(enc: &StringEncoding) { + // column is: [apple, apple, pear, NULL, NULL, orange, beta] + + let cases = vec![ + ( + &[0, 3, 4][..], // apple NULL, NULL + (vec![1, 0, 0], vec![None, Some("apple")]), + ), + ( + &[6], // beta + (vec![0], vec![Some("beta")]), + ), + ( + &[0, 3, 5][..], // apple NULL, orange + (vec![1, 0, 2], vec![None, Some("apple"), Some("orange")]), + ), + ( + &[0, 1, 2, 3, 4, 5, 6], // apple, apple, pear, NULL, NULL, orange, beta + ( + vec![1, 1, 4, 0, 0, 3, 2], + vec![ + None, + Some("apple"), + Some("beta"), + Some("orange"), + Some("pear"), + ], + ), + ), + ]; + + for (row_ids, (exp_keys, exp_values)) in cases { + let values = enc.values_as_dictionary(row_ids); + if let Values::Dictionary(got_keys, got_values) = values { + assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc); + assert_eq!( + got_values, exp_values, + "values comparison for {} failed", + enc + ); + } else { + panic!("invalid Values format returned, got {:?}", values); + } + } + } + + fn _all_values_as_dictionary(enc: &StringEncoding) { + // column is: [apple, apple, pear, NULL, NULL, orange, beta] + + let exp_keys = vec![1, 1, 4, 0, 0, 3, 2]; + let exp_values = vec![ + None, + Some("apple"), + Some("beta"), + Some("orange"), + Some("pear"), + ]; + + let values = enc.all_values_as_dictionary(); + if let Values::Dictionary(got_keys, got_values) = values { + assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc); + assert_eq!( + got_values, exp_values, + "values comparison for {} failed", + enc + ); + } else { + panic!("invalid Values format returned, got {:?}", values); + } + } +}