feat: teach string encoding to production Dictionary values

pull/24376/head
Edd Robinson 2021-05-19 12:13:49 +01:00
parent 634ceb886b
commit 3de6f3f8bd
1 changed files with 257 additions and 3 deletions

View File

@ -1,11 +1,11 @@
use std::collections::BTreeSet;
use std::collections::{BTreeMap, BTreeSet};
use arrow::{self, array::Array};
use either::Either;
use super::cmp;
use super::encoding::string::{dictionary, rle};
use super::encoding::string::{Dictionary, Encoding, RLE};
use super::encoding::string::{Dictionary, Encoding, NULL_ID, RLE};
use crate::column::{RowIDs, Statistics, Value, Values};
// Edd's totally made up magic constant. This determines whether we would use
@ -152,7 +152,7 @@ impl StringEncoding {
}
}
/// All values present at the provided logical row ids.
/// All values present at the provided logical row IDs.
///
/// TODO(edd): perf - pooling of destination vectors.
pub fn values(&self, row_ids: &[u32]) -> Values<'_> {
@ -162,6 +162,67 @@ impl StringEncoding {
}
}
/// Returns all values present at the provided logical row IDs as a
/// dictionary encoded `Values` format.
pub fn values_as_dictionary(&self, row_ids: &[u32]) -> Values<'_> {
//
// Example:
//
// Suppose you have column encoded like this:
//
// values: NULL, "alpha", "beta", "gamma"
// encoded: 1, 1, 2, 0, 3 (alpha, alpha, beta, NULL, gamma)
//
// And only the rows: {0, 1, 3, 4} are required.
//
// The column encoding will return the following encoded values
//
// encoded: 1, 1, 0, 3 (alpha, alpha, NULL, gamma)
//
// Because the dictionary has likely changed, the encoded values need
// to be transformed into a new domain so that they are:
//
// keys: [1, 1, 0, 2]
// values: [None, Some("alpha"), Some("gamma")]
let mut keys = self.encoded_values(row_ids, vec![]);
// build a mapping from encoded value to new ordinal position.
let mut ordinal_mapping = BTreeMap::new();
for key in &keys {
// no hashbrown entry API for ordered set. `contains_key` is most
// performant way to build this mapping.
if ordinal_mapping.contains_key(key) {
continue;
}
ordinal_mapping.insert(*key, 0);
}
// create new ordinal offsets - the encoded values for the
// dictionary will be correctly ordered, but they need to be shifted
// into a new domain [0, keys.len()).
for (i, offset) in ordinal_mapping.values_mut().enumerate() {
*offset = i as u32;
}
// Rewrite all the encoded values into the new domain.
for id in keys.iter_mut() {
*id = *ordinal_mapping.get(id).unwrap();
}
let values = match &self {
Self::RleDictionary(c) => ordinal_mapping
.keys()
.map(|id| c.decode_id(*id))
.collect::<Vec<_>>(),
Self::Dictionary(c) => ordinal_mapping
.keys()
.map(|id| c.decode_id(*id))
.collect::<Vec<_>>(),
};
Values::Dictionary(keys, values)
}
/// All values in the column.
///
/// TODO(edd): perf - pooling of destination vectors.
@ -172,6 +233,48 @@ impl StringEncoding {
}
}
/// Returns all values as a dictionary encoded `Values` format.
pub fn all_values_as_dictionary(&self) -> Values<'_> {
let mut keys = self.all_encoded_values(vec![]);
let values = if self.contains_null() {
// The column's ordered set of values including None because that is a
// reserved encoded key (`0`).
let mut values = vec![None];
match &self {
Self::RleDictionary(c) => {
values.extend(c.dictionary().into_iter().map(|s| Some(s.as_str())));
}
Self::Dictionary(c) => {
values.extend(c.dictionary().into_iter().map(|s| Some(s.as_str())));
}
};
values
} else {
// since column doesn't contain null we need to shift all the encoded
// values down
assert_eq!(NULL_ID, 0);
for key in keys.iter_mut() {
*key -= 1;
}
match &self {
Self::RleDictionary(c) => c
.dictionary()
.into_iter()
.map(|s| Some(s.as_str()))
.collect::<Vec<_>>(),
Self::Dictionary(c) => c
.dictionary()
.into_iter()
.map(|s| Some(s.as_str()))
.collect::<Vec<_>>(),
}
};
Values::Dictionary(keys, values)
}
/// Returns the logical value for the specified encoded representation.
pub fn decode_id(&self, encoded_id: u32) -> Value<'_> {
match &self {
@ -487,3 +590,154 @@ impl From<&[&str]> for StringEncoding {
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
// tests both `values_as_dictionary` and `all_values_as_dictionary`
fn values_as_dictionary() {
let set = vec!["apple", "beta", "orange", "pear"];
let data = vec![
Some("apple"),
Some("apple"),
Some("pear"),
None,
None,
Some("orange"),
Some("beta"),
];
let mut rle = RLE::with_dictionary(
set.iter()
.cloned()
.map(String::from)
.collect::<BTreeSet<String>>(),
);
for v in data.iter().map(|x| x.map(String::from)) {
rle.push_additional(v, 1);
}
let mut dict = Dictionary::with_dictionary(
set.into_iter()
.map(String::from)
.collect::<BTreeSet<String>>(),
);
for v in data.iter().map(|x| x.map(String::from)) {
dict.push_additional(v, 1);
}
let encodings = vec![
StringEncoding::RleDictionary(rle),
StringEncoding::Dictionary(dict),
];
for enc in encodings {
_values_as_dictionary(&enc);
_all_values_as_dictionary(&enc);
}
// example without NULL values
let data = vec![
Some("apple"),
Some("apple"),
Some("beta"),
Some("orange"),
Some("pear"),
];
let encodings = vec![
StringEncoding::RleDictionary(RLE::from(data.clone())),
StringEncoding::Dictionary(Dictionary::from(data)),
];
for enc in encodings {
let exp_keys = vec![0, 0, 1, 2, 3];
let exp_values = vec![Some("apple"), Some("beta"), Some("orange"), Some("pear")];
let values = enc.all_values_as_dictionary();
if let Values::Dictionary(got_keys, got_values) = values {
assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc);
assert_eq!(
got_values, exp_values,
"values comparison for {} failed",
enc
);
} else {
panic!("invalid Values format returned, got {:?}", values);
}
}
}
fn _values_as_dictionary(enc: &StringEncoding) {
// column is: [apple, apple, pear, NULL, NULL, orange, beta]
let cases = vec![
(
&[0, 3, 4][..], // apple NULL, NULL
(vec![1, 0, 0], vec![None, Some("apple")]),
),
(
&[6], // beta
(vec![0], vec![Some("beta")]),
),
(
&[0, 3, 5][..], // apple NULL, orange
(vec![1, 0, 2], vec![None, Some("apple"), Some("orange")]),
),
(
&[0, 1, 2, 3, 4, 5, 6], // apple, apple, pear, NULL, NULL, orange, beta
(
vec![1, 1, 4, 0, 0, 3, 2],
vec![
None,
Some("apple"),
Some("beta"),
Some("orange"),
Some("pear"),
],
),
),
];
for (row_ids, (exp_keys, exp_values)) in cases {
let values = enc.values_as_dictionary(row_ids);
if let Values::Dictionary(got_keys, got_values) = values {
assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc);
assert_eq!(
got_values, exp_values,
"values comparison for {} failed",
enc
);
} else {
panic!("invalid Values format returned, got {:?}", values);
}
}
}
fn _all_values_as_dictionary(enc: &StringEncoding) {
// column is: [apple, apple, pear, NULL, NULL, orange, beta]
let exp_keys = vec![1, 1, 4, 0, 0, 3, 2];
let exp_values = vec![
None,
Some("apple"),
Some("beta"),
Some("orange"),
Some("pear"),
];
let values = enc.all_values_as_dictionary();
if let Values::Dictionary(got_keys, got_values) = values {
assert_eq!(got_keys, exp_keys, "key comparison for {} failed", enc);
assert_eq!(
got_values, exp_values,
"values comparison for {} failed",
enc
);
} else {
panic!("invalid Values format returned, got {:?}", values);
}
}
}