feat: wire up Plain dictionary encoder

pull/24376/head
Edd Robinson 2020-11-12 11:51:29 +00:00
parent 94d37a9ff2
commit 1252d1b2f4
4 changed files with 85 additions and 26 deletions

View File

@ -14,6 +14,15 @@ use arrow_deps::arrow::array::{
}; };
use arrow_deps::{arrow, arrow::array::Array}; use arrow_deps::{arrow, arrow::array::Array};
// Edd's totally made-up magic constant: a temporary, arbitrary threshold used
// to choose between a run-length encoded (RLE) dictionary encoding and a plain
// dictionary encoding for a string column. The size of the dictionary built
// for the column is compared against this limit: when it is greater than the
// limit the plain encoding is used, otherwise the RLE encoding is used.
//
// I have ideas about how to build heuristics that would make this decision in
// a much better way than an arbitrary constant, but for now it's this...
//
// FWIW, it's not the cardinality of the column that should drive the decision;
// it's how many run-lengths would be produced in an RLE column, and whether
// that compression is worth the memory and compute costs of working on it.
pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 1_000_000;
/// The possible logical types that column values can have. All values in a /// The possible logical types that column values can have. All values in a
/// column have the same physical type. /// column have the same physical type.
pub enum Column { pub enum Column {
@ -630,6 +639,7 @@ impl<T: PartialOrd + std::fmt::Debug> MetaData<T> {
} }
pub enum StringEncoding { pub enum StringEncoding {
RLEDictionary(dictionary::RLE), RLEDictionary(dictionary::RLE),
Dictionary(dictionary::Plain),
// TODO - simple array encoding, e.g., via Arrow String array. // TODO - simple array encoding, e.g., via Arrow String array.
} }
@ -640,6 +650,7 @@ impl StringEncoding {
pub fn contains_null(&self) -> bool { pub fn contains_null(&self) -> bool {
match &self { match &self {
Self::RLEDictionary(c) => c.contains_null(), Self::RLEDictionary(c) => c.contains_null(),
Self::Dictionary(c) => c.contains_null(),
} }
} }
@ -650,6 +661,10 @@ impl StringEncoding {
Some(v) => Value::String(v), Some(v) => Value::String(v),
None => Value::Null, None => Value::Null,
}, },
Self::Dictionary(c) => match c.value(row_id) {
Some(v) => Value::String(v),
None => Value::Null,
},
} }
} }
@ -659,6 +674,7 @@ impl StringEncoding {
pub fn values(&self, row_ids: &[u32]) -> Values { pub fn values(&self, row_ids: &[u32]) -> Values {
match &self { match &self {
Self::RLEDictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))), Self::RLEDictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))),
Self::Dictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))),
} }
} }
@ -668,6 +684,7 @@ impl StringEncoding {
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> { pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
match &self { match &self {
Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())), Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
Self::Dictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
} }
} }
@ -675,6 +692,7 @@ impl StringEncoding {
pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs { pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs {
match &self { match &self {
Self::RLEDictionary(c) => c.row_ids_filter(value, op, dst), Self::RLEDictionary(c) => c.row_ids_filter(value, op, dst),
Self::Dictionary(c) => c.row_ids_filter(value, op, dst),
} }
} }
@ -683,7 +701,11 @@ impl StringEncoding {
/// ids. /// ids.
pub fn min(&self, row_ids: &[u32]) -> Value<'_> { pub fn min(&self, row_ids: &[u32]) -> Value<'_> {
match &self { match &self {
StringEncoding::RLEDictionary(c) => match c.min(row_ids) { Self::RLEDictionary(c) => match c.min(row_ids) {
Some(min) => Value::String(min),
None => Value::Null,
},
Self::Dictionary(c) => match c.min(row_ids) {
Some(min) => Value::String(min), Some(min) => Value::String(min),
None => Value::Null, None => Value::Null,
}, },
@ -695,7 +717,11 @@ impl StringEncoding {
/// ids. /// ids.
pub fn max(&self, row_ids: &[u32]) -> Value<'_> { pub fn max(&self, row_ids: &[u32]) -> Value<'_> {
match &self { match &self {
StringEncoding::RLEDictionary(c) => match c.max(row_ids) { Self::RLEDictionary(c) => match c.max(row_ids) {
Some(max) => Value::String(max),
None => Value::Null,
},
Self::Dictionary(c) => match c.max(row_ids) {
Some(max) => Value::String(max), Some(max) => Value::String(max),
None => Value::Null, None => Value::Null,
}, },
@ -705,7 +731,8 @@ impl StringEncoding {
/// The number of non-null values at the provided row ids. /// The number of non-null values at the provided row ids.
pub fn count(&self, row_ids: &[u32]) -> u32 { pub fn count(&self, row_ids: &[u32]) -> u32 {
match &self { match &self {
StringEncoding::RLEDictionary(c) => c.count(row_ids), Self::RLEDictionary(c) => c.count(row_ids),
Self::Dictionary(c) => c.count(row_ids),
} }
} }
@ -726,7 +753,12 @@ impl StringEncoding {
} }
} }
let mut data = dictionary::RLE::with_dictionary(dictionary); let mut data: dictionary::Encoding =
if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
dictionary::Encoding::Plain(dictionary::Plain::with_dictionary(dictionary))
} else {
dictionary::Encoding::RLE(dictionary::RLE::with_dictionary(dictionary))
};
let mut prev = if !arr.is_null(0) { let mut prev = if !arr.is_null(0) {
Some(arr.value(0)) Some(arr.value(0))
@ -777,7 +809,12 @@ impl StringEncoding {
range, range,
}; };
Self::RLEDictionary(data) // TODO(edd): consider just storing under the `StringEncoding` a
// `Dictionary` variant that would be a `dictionary::Encoding`.
match data {
dictionary::Encoding::RLE(enc) => Self::RLEDictionary(enc),
dictionary::Encoding::Plain(enc) => Self::Dictionary(enc),
}
} }
/// All encoded values for the provided logical row ids. /// All encoded values for the provided logical row ids.
@ -786,6 +823,7 @@ impl StringEncoding {
pub fn encoded_values(&self, row_ids: &[u32], dst: Vec<u32>) -> Vec<u32> { pub fn encoded_values(&self, row_ids: &[u32], dst: Vec<u32>) -> Vec<u32> {
match &self { match &self {
Self::RLEDictionary(c) => c.encoded_values(row_ids, dst), Self::RLEDictionary(c) => c.encoded_values(row_ids, dst),
Self::Dictionary(c) => c.encoded_values(row_ids, dst),
} }
} }
@ -795,6 +833,7 @@ impl StringEncoding {
pub fn all_encoded_values(&self, dst: Vec<u32>) -> Vec<u32> { pub fn all_encoded_values(&self, dst: Vec<u32>) -> Vec<u32> {
match &self { match &self {
Self::RLEDictionary(c) => c.all_encoded_values(dst), Self::RLEDictionary(c) => c.all_encoded_values(dst),
Self::Dictionary(c) => c.all_encoded_values(dst),
} }
} }
@ -877,7 +916,23 @@ impl StringEncoding {
// generates metadata for an encoded column. // generates metadata for an encoded column.
fn meta(data: &Self) -> MetaData<String> { fn meta(data: &Self) -> MetaData<String> {
match data { match data {
StringEncoding::RLEDictionary(data) => { Self::RLEDictionary(data) => {
let dictionary = data.dictionary();
let range = if !dictionary.is_empty() {
let min = data.dictionary()[0].clone();
let max = data.dictionary()[data.dictionary().len() - 1].clone();
Some((min, max))
} else {
None
};
MetaData {
size: data.size(),
rows: data.num_rows(),
range,
}
}
Self::Dictionary(data) => {
let dictionary = data.dictionary(); let dictionary = data.dictionary();
let range = if !dictionary.is_empty() { let range = if !dictionary.is_empty() {
let min = data.dictionary()[0].clone(); let min = data.dictionary()[0].clone();
@ -900,7 +955,8 @@ impl StringEncoding {
impl std::fmt::Display for StringEncoding { impl std::fmt::Display for StringEncoding {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
StringEncoding::RLEDictionary(data) => write!(f, "{}", data), Self::RLEDictionary(data) => write!(f, "{}", data),
Self::Dictionary(data) => write!(f, "{}", data),
} }
} }
} }
@ -2547,7 +2603,7 @@ mod test {
assert_eq!( assert_eq!(
meta, meta,
super::MetaData::<String> { super::MetaData::<String> {
size: 0, size: 317,
rows: 4, rows: 4,
range: Some(("hello".to_string(), "world".to_string())), range: Some(("hello".to_string(), "world".to_string())),
} }
@ -2577,7 +2633,7 @@ mod test {
assert_eq!( assert_eq!(
meta, meta,
super::MetaData::<String> { super::MetaData::<String> {
size: 0, size: 301,
rows: 2, rows: 2,
range: Some(("hello".to_string(), "world".to_string())), range: Some(("hello".to_string(), "world".to_string())),
} }

View File

@ -14,7 +14,7 @@ use crate::column::{cmp, RowIDs};
/// The encoded id for a NULL value. /// The encoded id for a NULL value.
pub const NULL_ID: u32 = 0; pub const NULL_ID: u32 = 0;
enum Encoding { pub enum Encoding {
RLE(RLE), RLE(RLE),
Plain(Plain), Plain(Plain),
} }
@ -27,21 +27,28 @@ impl Encoding {
} }
} }
fn size(&self) -> u64 { pub fn size(&self) -> u64 {
match &self { match &self {
Encoding::RLE(enc) => enc.size(), Encoding::RLE(enc) => enc.size(),
Encoding::Plain(enc) => enc.size(), Encoding::Plain(enc) => enc.size(),
} }
} }
fn push(&mut self, v: String) { pub fn num_rows(&self) -> u32 {
match &self {
Encoding::RLE(enc) => enc.num_rows(),
Encoding::Plain(enc) => enc.num_rows(),
}
}
pub fn push(&mut self, v: String) {
match self { match self {
Encoding::RLE(ref mut enc) => enc.push(v), Encoding::RLE(ref mut enc) => enc.push(v),
Encoding::Plain(ref mut enc) => enc.push(v), Encoding::Plain(ref mut enc) => enc.push(v),
} }
} }
fn push_none(&mut self) { pub fn push_none(&mut self) {
match self { match self {
Encoding::RLE(ref mut enc) => enc.push_none(), Encoding::RLE(ref mut enc) => enc.push_none(),
Encoding::Plain(ref mut enc) => enc.push_none(), Encoding::Plain(ref mut enc) => enc.push_none(),
@ -51,7 +58,7 @@ impl Encoding {
/// Adds additional repetitions of the provided value to the encoded data. /// Adds additional repetitions of the provided value to the encoded data.
/// It is the caller's responsibility to ensure that the dictionary encoded /// It is the caller's responsibility to ensure that the dictionary encoded
/// remains sorted. /// remains sorted.
fn push_additional(&mut self, v: Option<String>, additional: u32) { pub fn push_additional(&mut self, v: Option<String>, additional: u32) {
match self { match self {
Encoding::RLE(ref mut env) => env.push_additional(v, additional), Encoding::RLE(ref mut env) => env.push_additional(v, additional),
Encoding::Plain(ref mut env) => env.push_additional(v, additional), Encoding::Plain(ref mut env) => env.push_additional(v, additional),
@ -121,14 +128,10 @@ impl Encoding {
// //
// //
fn dictionary(&self) -> &[String] { pub fn dictionary(&self) -> Vec<&String> {
match self { match self {
Encoding::RLE(enc) => enc.dictionary(), Encoding::RLE(enc) => enc.dictionary(),
Encoding::Plain(enc) => { Encoding::Plain(enc) => enc.dictionary(),
todo!() // figure out storing entries.
// let v = enc.dictionary();
// v.as_slice()
}
} }
} }
@ -775,7 +778,7 @@ mod test {
assert_eq!( assert_eq!(
enc.dictionary(), enc.dictionary(),
&["east".to_string(), "west".to_string(), "zoo".to_string()], vec![&"east".to_string(), &"west".to_string(), &"zoo".to_string()],
"{}", "{}",
name name
); );

View File

@ -373,7 +373,7 @@ impl Plain {
// TODO(edd): rethink returning `Vec<String>` by looking at if we can store // TODO(edd): rethink returning `Vec<String>` by looking at if we can store
// entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would // entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would
// then allow us to return a `&[String]` here. // then allow us to return a `&[String]` here.
pub fn dictionary(&self) -> Vec<String> { pub fn dictionary(&self) -> Vec<&String> {
if self.entries.len() == 1 { if self.entries.len() == 1 {
// no non-null entries. // no non-null entries.
return vec![]; return vec![];
@ -382,8 +382,8 @@ impl Plain {
self.entries self.entries
.iter() .iter()
.skip(1) .skip(1)
.filter_map(|v| v.clone()) .filter_map(|v| v.as_ref())
.collect::<Vec<String>>() .collect()
} }
/// Returns the logical value present at the provided row id. Panics if the /// Returns the logical value present at the provided row id. Panics if the

View File

@ -381,8 +381,8 @@ impl RLE {
// //
// //
pub fn dictionary(&self) -> &[String] { pub fn dictionary(&self) -> Vec<&String> {
&self.index_entries[1..] self.index_entries.iter().skip(1).collect()
} }
/// Returns the logical value present at the provided row id. /// Returns the logical value present at the provided row id.