From 1252d1b2f46bb52274627ad2575d3737fdd80a26 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Nov 2020 11:51:29 +0000 Subject: [PATCH] feat: wire up Plain dictionary encoder --- segment_store/src/column.rs | 74 +++++++++++++++++--- segment_store/src/column/dictionary.rs | 27 +++---- segment_store/src/column/dictionary/plain.rs | 6 +- segment_store/src/column/dictionary/rle.rs | 4 +- 4 files changed, 85 insertions(+), 26 deletions(-) diff --git a/segment_store/src/column.rs b/segment_store/src/column.rs index 6a8f34e6fa..529622cb32 100644 --- a/segment_store/src/column.rs +++ b/segment_store/src/column.rs @@ -14,6 +14,15 @@ use arrow_deps::arrow::array::{ }; use arrow_deps::{arrow, arrow::array::Array}; +// Edd's totally made up magic constant. This determines whether we would use +// a run-length encoded dictionary encoding or just a plain dictionary encoding. +// I have ideas about how to build heuristics to do this in a much better way +// than an arbitrary constant but for now it's this... +// +// FWIW it's not the cardinality of the column that should drive the decision +// it's how many run-lengths would be produced in an RLE column and whether that +// compression is worth the memory and compute costs to work on it. +pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 1_000_000; /// The possible logical types that column values can have. All values in a /// column have the same physical type. pub enum Column { @@ -630,6 +639,7 @@ impl MetaData { } pub enum StringEncoding { RLEDictionary(dictionary::RLE), + Dictionary(dictionary::Plain), // TODO - simple array encoding, e.g., via Arrow String array. 
} @@ -640,6 +650,7 @@ impl StringEncoding { pub fn contains_null(&self) -> bool { match &self { Self::RLEDictionary(c) => c.contains_null(), + Self::Dictionary(c) => c.contains_null(), } } @@ -650,6 +661,10 @@ impl StringEncoding { Some(v) => Value::String(v), None => Value::Null, }, + Self::Dictionary(c) => match c.value(row_id) { + Some(v) => Value::String(v), + None => Value::Null, + }, } } @@ -659,6 +674,7 @@ impl StringEncoding { pub fn values(&self, row_ids: &[u32]) -> Values { match &self { Self::RLEDictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))), + Self::Dictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))), } } @@ -668,6 +684,7 @@ impl StringEncoding { pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> { match &self { Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())), + Self::Dictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())), } } @@ -675,6 +692,7 @@ impl StringEncoding { pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs { match &self { Self::RLEDictionary(c) => c.row_ids_filter(value, op, dst), + Self::Dictionary(c) => c.row_ids_filter(value, op, dst), } } @@ -683,7 +701,11 @@ impl StringEncoding { /// ids. pub fn min(&self, row_ids: &[u32]) -> Value<'_> { match &self { - StringEncoding::RLEDictionary(c) => match c.min(row_ids) { + Self::RLEDictionary(c) => match c.min(row_ids) { + Some(min) => Value::String(min), + None => Value::Null, + }, + Self::Dictionary(c) => match c.min(row_ids) { Some(min) => Value::String(min), None => Value::Null, }, @@ -695,7 +717,11 @@ impl StringEncoding { /// ids. 
pub fn max(&self, row_ids: &[u32]) -> Value<'_> { match &self { - StringEncoding::RLEDictionary(c) => match c.max(row_ids) { + Self::RLEDictionary(c) => match c.max(row_ids) { + Some(max) => Value::String(max), + None => Value::Null, + }, + Self::Dictionary(c) => match c.max(row_ids) { Some(max) => Value::String(max), None => Value::Null, }, @@ -705,7 +731,8 @@ impl StringEncoding { /// The number of non-null values at the provided row ids. pub fn count(&self, row_ids: &[u32]) -> u32 { match &self { - StringEncoding::RLEDictionary(c) => c.count(row_ids), + Self::RLEDictionary(c) => c.count(row_ids), + Self::Dictionary(c) => c.count(row_ids), } } @@ -726,7 +753,12 @@ impl StringEncoding { } } - let mut data = dictionary::RLE::with_dictionary(dictionary); + let mut data: dictionary::Encoding = + if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT { + dictionary::Encoding::Plain(dictionary::Plain::with_dictionary(dictionary)) + } else { + dictionary::Encoding::RLE(dictionary::RLE::with_dictionary(dictionary)) + }; let mut prev = if !arr.is_null(0) { Some(arr.value(0)) @@ -777,7 +809,12 @@ impl StringEncoding { range, }; - Self::RLEDictionary(data) + // TODO(edd): consider just storing under the `StringEncoding` a + // `Dictionary` variant that would be a `dictionary::Encoding`. + match data { + dictionary::Encoding::RLE(enc) => Self::RLEDictionary(enc), + dictionary::Encoding::Plain(enc) => Self::Dictionary(enc), + } } /// All encoded values for the provided logical row ids. 
@@ -786,6 +823,7 @@ impl StringEncoding { pub fn encoded_values(&self, row_ids: &[u32], dst: Vec) -> Vec { match &self { Self::RLEDictionary(c) => c.encoded_values(row_ids, dst), + Self::Dictionary(c) => c.encoded_values(row_ids, dst), } } @@ -795,6 +833,7 @@ impl StringEncoding { pub fn all_encoded_values(&self, dst: Vec) -> Vec { match &self { Self::RLEDictionary(c) => c.all_encoded_values(dst), + Self::Dictionary(c) => c.all_encoded_values(dst), } } @@ -877,7 +916,23 @@ impl StringEncoding { // generates metadata for an encoded column. fn meta(data: &Self) -> MetaData { match data { - StringEncoding::RLEDictionary(data) => { + Self::RLEDictionary(data) => { + let dictionary = data.dictionary(); + let range = if !dictionary.is_empty() { + let min = data.dictionary()[0].clone(); + let max = data.dictionary()[data.dictionary().len() - 1].clone(); + Some((min, max)) + } else { + None + }; + + MetaData { + size: data.size(), + rows: data.num_rows(), + range, + } + } + Self::Dictionary(data) => { let dictionary = data.dictionary(); let range = if !dictionary.is_empty() { let min = data.dictionary()[0].clone(); @@ -900,7 +955,8 @@ impl StringEncoding { impl std::fmt::Display for StringEncoding { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - StringEncoding::RLEDictionary(data) => write!(f, "{}", data), + Self::RLEDictionary(data) => write!(f, "{}", data), + Self::Dictionary(data) => write!(f, "{}", data), } } } @@ -2547,7 +2603,7 @@ mod test { assert_eq!( meta, super::MetaData:: { - size: 0, + size: 317, rows: 4, range: Some(("hello".to_string(), "world".to_string())), } @@ -2577,7 +2633,7 @@ mod test { assert_eq!( meta, super::MetaData:: { - size: 0, + size: 301, rows: 2, range: Some(("hello".to_string(), "world".to_string())), } diff --git a/segment_store/src/column/dictionary.rs b/segment_store/src/column/dictionary.rs index 7a4e90a108..3eb8e3bf5a 100644 --- a/segment_store/src/column/dictionary.rs +++ 
b/segment_store/src/column/dictionary.rs @@ -14,7 +14,7 @@ use crate::column::{cmp, RowIDs}; /// The encoded id for a NULL value. pub const NULL_ID: u32 = 0; -enum Encoding { +pub enum Encoding { RLE(RLE), Plain(Plain), } @@ -27,21 +27,28 @@ impl Encoding { } } - fn size(&self) -> u64 { + pub fn size(&self) -> u64 { match &self { Encoding::RLE(enc) => enc.size(), Encoding::Plain(enc) => enc.size(), } } - fn push(&mut self, v: String) { + pub fn num_rows(&self) -> u32 { + match &self { + Encoding::RLE(enc) => enc.num_rows(), + Encoding::Plain(enc) => enc.num_rows(), + } + } + + pub fn push(&mut self, v: String) { match self { Encoding::RLE(ref mut enc) => enc.push(v), Encoding::Plain(ref mut enc) => enc.push(v), } } - fn push_none(&mut self) { + pub fn push_none(&mut self) { match self { Encoding::RLE(ref mut enc) => enc.push_none(), Encoding::Plain(ref mut enc) => enc.push_none(), @@ -51,7 +58,7 @@ impl Encoding { /// Adds additional repetitions of the provided value to the encoded data. /// It is the caller's responsibility to ensure that the dictionary encoded /// remains sorted. - fn push_additional(&mut self, v: Option, additional: u32) { + pub fn push_additional(&mut self, v: Option, additional: u32) { match self { Encoding::RLE(ref mut env) => env.push_additional(v, additional), Encoding::Plain(ref mut env) => env.push_additional(v, additional), @@ -121,14 +128,10 @@ impl Encoding { // // - fn dictionary(&self) -> &[String] { + pub fn dictionary(&self) -> Vec<&String> { match self { Encoding::RLE(enc) => enc.dictionary(), - Encoding::Plain(enc) => { - todo!() // figure out storing entries. 
- // let v = enc.dictionary(); - // v.as_slice() - } + Encoding::Plain(enc) => enc.dictionary(), } } @@ -775,7 +778,7 @@ mod test { assert_eq!( enc.dictionary(), - &["east".to_string(), "west".to_string(), "zoo".to_string()], + vec![&"east".to_string(), &"west".to_string(), &"zoo".to_string()], "{}", name ); diff --git a/segment_store/src/column/dictionary/plain.rs b/segment_store/src/column/dictionary/plain.rs index 33affb4620..6ed2aa8087 100644 --- a/segment_store/src/column/dictionary/plain.rs +++ b/segment_store/src/column/dictionary/plain.rs @@ -373,7 +373,7 @@ impl Plain { // TODO(edd): rethink returning `Vec<String>` by looking at if we can store // entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would // then allow us to return a `&[String]` here. - pub fn dictionary(&self) -> Vec { + pub fn dictionary(&self) -> Vec<&String> { if self.entries.len() == 1 { // no non-null entries. return vec![]; @@ -382,8 +382,8 @@ impl Plain { self.entries .iter() .skip(1) - .filter_map(|v| v.clone()) - .collect::>() + .filter_map(|v| v.as_ref()) + .collect() } /// Returns the logical value present at the provided row id. Panics if the diff --git a/segment_store/src/column/dictionary/rle.rs b/segment_store/src/column/dictionary/rle.rs index 7de6170748..aba327dfe6 100644 --- a/segment_store/src/column/dictionary/rle.rs +++ b/segment_store/src/column/dictionary/rle.rs @@ -381,8 +381,8 @@ impl RLE { // // - pub fn dictionary(&self) -> &[String] { - &self.index_entries[1..] + pub fn dictionary(&self) -> Vec<&String> { + self.index_entries.iter().skip(1).collect() } /// Returns the logical value present at the provided row id.