feat: wire up Plain dictionary encoder

pull/24376/head
Edd Robinson 2020-11-12 11:51:29 +00:00
parent 94d37a9ff2
commit 1252d1b2f4
4 changed files with 85 additions and 26 deletions

View File

@ -14,6 +14,15 @@ use arrow_deps::arrow::array::{
};
use arrow_deps::{arrow, arrow::array::Array};
/// Temporary, arbitrarily chosen threshold (Edd's made-up magic constant) that
/// decides between a run-length encoded (RLE) dictionary encoding and a plain
/// dictionary encoding for string columns: when the number of dictionary
/// entries exceeds this limit the plain dictionary encoding is used, otherwise
/// the RLE dictionary encoding is used.
///
/// There are ideas for replacing this arbitrary constant with much better
/// heuristics, but for now this is what is used.
///
/// FWIW it is not the cardinality of the column that should drive the
/// decision; it is how many run-lengths would be produced in an RLE column,
/// and whether that compression is worth the memory and compute costs of
/// working on it.
pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 1_000_000;
/// The possible logical types that column values can have. All values in a
/// column have the same physical type.
pub enum Column {
@ -630,6 +639,7 @@ impl<T: PartialOrd + std::fmt::Debug> MetaData<T> {
}
/// The encodings that can back a column of string values. Each variant wraps
/// the concrete encoding that the `StringEncoding` methods dispatch to.
pub enum StringEncoding {
/// Values held in a run-length encoded dictionary encoding
/// (`dictionary::RLE`).
RLEDictionary(dictionary::RLE),
/// Values held in a plain dictionary encoding (`dictionary::Plain`).
Dictionary(dictionary::Plain),
// TODO - simple array encoding, e.g., via Arrow String array.
}
@ -640,6 +650,7 @@ impl StringEncoding {
pub fn contains_null(&self) -> bool {
    // Forward the question to whichever concrete encoding backs this column.
    match self {
        Self::Dictionary(enc) => enc.contains_null(),
        Self::RLEDictionary(enc) => enc.contains_null(),
    }
}
@ -650,6 +661,10 @@ impl StringEncoding {
Some(v) => Value::String(v),
None => Value::Null,
},
Self::Dictionary(c) => match c.value(row_id) {
Some(v) => Value::String(v),
None => Value::Null,
},
}
}
@ -659,6 +674,7 @@ impl StringEncoding {
pub fn values(&self, row_ids: &[u32]) -> Values {
    // Ask the concrete encoding for the values at `row_ids` (handing it a
    // fresh, empty destination vector) and wrap the result in a `StringArray`.
    match self {
        Self::RLEDictionary(enc) => {
            let materialised = enc.values(row_ids, Vec::new());
            Values::String(StringArray::from(materialised))
        }
        Self::Dictionary(enc) => {
            let materialised = enc.values(row_ids, Vec::new());
            Values::String(StringArray::from(materialised))
        }
    }
}
@ -668,6 +684,7 @@ impl StringEncoding {
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
    // Each encoding fills a fresh `BTreeSet` with the distinct values found at
    // `row_ids`; the set is then wrapped as a string `ValueSet`.
    match self {
        Self::RLEDictionary(enc) => {
            let distinct = enc.distinct_values(row_ids, BTreeSet::new());
            ValueSet::String(distinct)
        }
        Self::Dictionary(enc) => {
            let distinct = enc.distinct_values(row_ids, BTreeSet::new());
            ValueSet::String(distinct)
        }
    }
}
@ -675,6 +692,7 @@ impl StringEncoding {
pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs {
    // Note the argument order: the concrete encodings take the value first and
    // the operator second, the reverse of this method's own signature.
    match self {
        Self::Dictionary(enc) => enc.row_ids_filter(value, op, dst),
        Self::RLEDictionary(enc) => enc.row_ids_filter(value, op, dst),
    }
}
@ -683,7 +701,11 @@ impl StringEncoding {
/// ids.
pub fn min(&self, row_ids: &[u32]) -> Value<'_> {
match &self {
StringEncoding::RLEDictionary(c) => match c.min(row_ids) {
Self::RLEDictionary(c) => match c.min(row_ids) {
Some(min) => Value::String(min),
None => Value::Null,
},
Self::Dictionary(c) => match c.min(row_ids) {
Some(min) => Value::String(min),
None => Value::Null,
},
@ -695,7 +717,11 @@ impl StringEncoding {
/// ids.
pub fn max(&self, row_ids: &[u32]) -> Value<'_> {
match &self {
StringEncoding::RLEDictionary(c) => match c.max(row_ids) {
Self::RLEDictionary(c) => match c.max(row_ids) {
Some(max) => Value::String(max),
None => Value::Null,
},
Self::Dictionary(c) => match c.max(row_ids) {
Some(max) => Value::String(max),
None => Value::Null,
},
@ -705,7 +731,8 @@ impl StringEncoding {
/// The number of non-null values at the provided row ids.
///
/// The count is computed by the concrete encoding backing this column; this
/// method only dispatches to it.
pub fn count(&self, row_ids: &[u32]) -> u32 {
    // Exactly one arm per variant: a second `RLEDictionary` arm (the old
    // `StringEncoding::`-qualified spelling) would be unreachable.
    match self {
        Self::RLEDictionary(c) => c.count(row_ids),
        Self::Dictionary(c) => c.count(row_ids),
    }
}
@ -726,7 +753,12 @@ impl StringEncoding {
}
}
let mut data = dictionary::RLE::with_dictionary(dictionary);
let mut data: dictionary::Encoding =
if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
dictionary::Encoding::Plain(dictionary::Plain::with_dictionary(dictionary))
} else {
dictionary::Encoding::RLE(dictionary::RLE::with_dictionary(dictionary))
};
let mut prev = if !arr.is_null(0) {
Some(arr.value(0))
@ -777,7 +809,12 @@ impl StringEncoding {
range,
};
Self::RLEDictionary(data)
// TODO(edd): consider just storing under the `StringEncoding` a
// `Dictionary` variant that would be a `dictionary::Encoding`.
match data {
dictionary::Encoding::RLE(enc) => Self::RLEDictionary(enc),
dictionary::Encoding::Plain(enc) => Self::Dictionary(enc),
}
}
/// All encoded values for the provided logical row ids.
@ -786,6 +823,7 @@ impl StringEncoding {
pub fn encoded_values(&self, row_ids: &[u32], dst: Vec<u32>) -> Vec<u32> {
    // `dst` is handed to the concrete encoding, and whatever vector that
    // encoding returns is passed straight back to the caller.
    match self {
        Self::Dictionary(enc) => enc.encoded_values(row_ids, dst),
        Self::RLEDictionary(enc) => enc.encoded_values(row_ids, dst),
    }
}
@ -795,6 +833,7 @@ impl StringEncoding {
pub fn all_encoded_values(&self, dst: Vec<u32>) -> Vec<u32> {
    // Same dispatch as the row-id based variant, but without a row selection:
    // the concrete encoding receives only the destination vector.
    match self {
        Self::Dictionary(enc) => enc.all_encoded_values(dst),
        Self::RLEDictionary(enc) => enc.all_encoded_values(dst),
    }
}
@ -877,7 +916,23 @@ impl StringEncoding {
// generates metadata for an encoded column.
fn meta(data: &Self) -> MetaData<String> {
match data {
StringEncoding::RLEDictionary(data) => {
Self::RLEDictionary(data) => {
let dictionary = data.dictionary();
let range = if !dictionary.is_empty() {
let min = data.dictionary()[0].clone();
let max = data.dictionary()[data.dictionary().len() - 1].clone();
Some((min, max))
} else {
None
};
MetaData {
size: data.size(),
rows: data.num_rows(),
range,
}
}
Self::Dictionary(data) => {
let dictionary = data.dictionary();
let range = if !dictionary.is_empty() {
let min = data.dictionary()[0].clone();
@ -900,7 +955,8 @@ impl StringEncoding {
impl std::fmt::Display for StringEncoding {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
StringEncoding::RLEDictionary(data) => write!(f, "{}", data),
Self::RLEDictionary(data) => write!(f, "{}", data),
Self::Dictionary(data) => write!(f, "{}", data),
}
}
}
@ -2547,7 +2603,7 @@ mod test {
assert_eq!(
meta,
super::MetaData::<String> {
size: 0,
size: 317,
rows: 4,
range: Some(("hello".to_string(), "world".to_string())),
}
@ -2577,7 +2633,7 @@ mod test {
assert_eq!(
meta,
super::MetaData::<String> {
size: 0,
size: 301,
rows: 2,
range: Some(("hello".to_string(), "world".to_string())),
}

View File

@ -14,7 +14,7 @@ use crate::column::{cmp, RowIDs};
/// The encoded id for a NULL value.
pub const NULL_ID: u32 = 0;
enum Encoding {
pub enum Encoding {
RLE(RLE),
Plain(Plain),
}
@ -27,21 +27,28 @@ impl Encoding {
}
}
fn size(&self) -> u64 {
pub fn size(&self) -> u64 {
match &self {
Encoding::RLE(enc) => enc.size(),
Encoding::Plain(enc) => enc.size(),
}
}
fn push(&mut self, v: String) {
pub fn num_rows(&self) -> u32 {
match &self {
Encoding::RLE(enc) => enc.num_rows(),
Encoding::Plain(enc) => enc.num_rows(),
}
}
pub fn push(&mut self, v: String) {
match self {
Encoding::RLE(ref mut enc) => enc.push(v),
Encoding::Plain(ref mut enc) => enc.push(v),
}
}
fn push_none(&mut self) {
pub fn push_none(&mut self) {
match self {
Encoding::RLE(ref mut enc) => enc.push_none(),
Encoding::Plain(ref mut enc) => enc.push_none(),
@ -51,7 +58,7 @@ impl Encoding {
/// Adds additional repetitions of the provided value to the encoded data.
/// It is the caller's responsibility to ensure that the dictionary encoded
/// remains sorted.
fn push_additional(&mut self, v: Option<String>, additional: u32) {
pub fn push_additional(&mut self, v: Option<String>, additional: u32) {
match self {
Encoding::RLE(ref mut env) => env.push_additional(v, additional),
Encoding::Plain(ref mut env) => env.push_additional(v, additional),
@ -121,14 +128,10 @@ impl Encoding {
//
//
fn dictionary(&self) -> &[String] {
pub fn dictionary(&self) -> Vec<&String> {
match self {
Encoding::RLE(enc) => enc.dictionary(),
Encoding::Plain(enc) => {
todo!() // figure out storing entries.
// let v = enc.dictionary();
// v.as_slice()
}
Encoding::Plain(enc) => enc.dictionary(),
}
}
@ -775,7 +778,7 @@ mod test {
assert_eq!(
enc.dictionary(),
&["east".to_string(), "west".to_string(), "zoo".to_string()],
vec![&"east".to_string(), &"west".to_string(), &"zoo".to_string()],
"{}",
name
);

View File

@ -373,7 +373,7 @@ impl Plain {
// TODO(edd): rethink returning `Vec<String>` by looking at if we can store
// entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would
// then allow us to return a `&[String]` here.
pub fn dictionary(&self) -> Vec<String> {
pub fn dictionary(&self) -> Vec<&String> {
if self.entries.len() == 1 {
// no non-null entries.
return vec![];
@ -382,8 +382,8 @@ impl Plain {
self.entries
.iter()
.skip(1)
.filter_map(|v| v.clone())
.collect::<Vec<String>>()
.filter_map(|v| v.as_ref())
.collect()
}
/// Returns the logical value present at the provided row id. Panics if the

View File

@ -381,8 +381,8 @@ impl RLE {
//
//
pub fn dictionary(&self) -> &[String] {
&self.index_entries[1..]
pub fn dictionary(&self) -> Vec<&String> {
self.index_entries.iter().skip(1).collect()
}
/// Returns the logical value present at the provided row id.