feat: wire up Plain dictionary encoder
parent
94d37a9ff2
commit
1252d1b2f4
|
@ -14,6 +14,15 @@ use arrow_deps::arrow::array::{
|
||||||
};
|
};
|
||||||
use arrow_deps::{arrow, arrow::array::Array};
|
use arrow_deps::{arrow, arrow::array::Array};
|
||||||
|
|
||||||
|
// Edd's totally made up magic constant. This determines whether we would use
|
||||||
|
// a run-length encoded dictionary encoding or just a plain dictionary encoding.
|
||||||
|
// I have ideas about how to build heuristics to do this in a much better way
|
||||||
|
// than an arbitrary constant but for now it's this...
|
||||||
|
//
|
||||||
|
// FWIW it's not the cardinality of the column that should drive the decision
|
||||||
|
// it's how many run-lengths would be produced in an RLE column and whether that
|
||||||
|
// compression is worth the memory and compute costs to work on it.
|
||||||
|
pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 1_000_000;
|
||||||
/// The possible logical types that column values can have. All values in a
|
/// The possible logical types that column values can have. All values in a
|
||||||
/// column have the same physical type.
|
/// column have the same physical type.
|
||||||
pub enum Column {
|
pub enum Column {
|
||||||
|
@ -630,6 +639,7 @@ impl<T: PartialOrd + std::fmt::Debug> MetaData<T> {
|
||||||
}
|
}
|
||||||
pub enum StringEncoding {
|
pub enum StringEncoding {
|
||||||
RLEDictionary(dictionary::RLE),
|
RLEDictionary(dictionary::RLE),
|
||||||
|
Dictionary(dictionary::Plain),
|
||||||
// TODO - simple array encoding, e.g., via Arrow String array.
|
// TODO - simple array encoding, e.g., via Arrow String array.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -640,6 +650,7 @@ impl StringEncoding {
|
||||||
pub fn contains_null(&self) -> bool {
|
pub fn contains_null(&self) -> bool {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => c.contains_null(),
|
Self::RLEDictionary(c) => c.contains_null(),
|
||||||
|
Self::Dictionary(c) => c.contains_null(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -650,6 +661,10 @@ impl StringEncoding {
|
||||||
Some(v) => Value::String(v),
|
Some(v) => Value::String(v),
|
||||||
None => Value::Null,
|
None => Value::Null,
|
||||||
},
|
},
|
||||||
|
Self::Dictionary(c) => match c.value(row_id) {
|
||||||
|
Some(v) => Value::String(v),
|
||||||
|
None => Value::Null,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -659,6 +674,7 @@ impl StringEncoding {
|
||||||
pub fn values(&self, row_ids: &[u32]) -> Values {
|
pub fn values(&self, row_ids: &[u32]) -> Values {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))),
|
Self::RLEDictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))),
|
||||||
|
Self::Dictionary(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -668,6 +684,7 @@ impl StringEncoding {
|
||||||
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
||||||
|
Self::Dictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -675,6 +692,7 @@ impl StringEncoding {
|
||||||
pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs {
|
pub fn row_ids_filter(&self, op: &cmp::Operator, value: &str, dst: RowIDs) -> RowIDs {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => c.row_ids_filter(value, op, dst),
|
Self::RLEDictionary(c) => c.row_ids_filter(value, op, dst),
|
||||||
|
Self::Dictionary(c) => c.row_ids_filter(value, op, dst),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -683,7 +701,11 @@ impl StringEncoding {
|
||||||
/// ids.
|
/// ids.
|
||||||
pub fn min(&self, row_ids: &[u32]) -> Value<'_> {
|
pub fn min(&self, row_ids: &[u32]) -> Value<'_> {
|
||||||
match &self {
|
match &self {
|
||||||
StringEncoding::RLEDictionary(c) => match c.min(row_ids) {
|
Self::RLEDictionary(c) => match c.min(row_ids) {
|
||||||
|
Some(min) => Value::String(min),
|
||||||
|
None => Value::Null,
|
||||||
|
},
|
||||||
|
Self::Dictionary(c) => match c.min(row_ids) {
|
||||||
Some(min) => Value::String(min),
|
Some(min) => Value::String(min),
|
||||||
None => Value::Null,
|
None => Value::Null,
|
||||||
},
|
},
|
||||||
|
@ -695,7 +717,11 @@ impl StringEncoding {
|
||||||
/// ids.
|
/// ids.
|
||||||
pub fn max(&self, row_ids: &[u32]) -> Value<'_> {
|
pub fn max(&self, row_ids: &[u32]) -> Value<'_> {
|
||||||
match &self {
|
match &self {
|
||||||
StringEncoding::RLEDictionary(c) => match c.max(row_ids) {
|
Self::RLEDictionary(c) => match c.max(row_ids) {
|
||||||
|
Some(max) => Value::String(max),
|
||||||
|
None => Value::Null,
|
||||||
|
},
|
||||||
|
Self::Dictionary(c) => match c.max(row_ids) {
|
||||||
Some(max) => Value::String(max),
|
Some(max) => Value::String(max),
|
||||||
None => Value::Null,
|
None => Value::Null,
|
||||||
},
|
},
|
||||||
|
@ -705,7 +731,8 @@ impl StringEncoding {
|
||||||
/// The number of non-null values at the provided row ids.
|
/// The number of non-null values at the provided row ids.
|
||||||
pub fn count(&self, row_ids: &[u32]) -> u32 {
|
pub fn count(&self, row_ids: &[u32]) -> u32 {
|
||||||
match &self {
|
match &self {
|
||||||
StringEncoding::RLEDictionary(c) => c.count(row_ids),
|
Self::RLEDictionary(c) => c.count(row_ids),
|
||||||
|
Self::Dictionary(c) => c.count(row_ids),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -726,7 +753,12 @@ impl StringEncoding {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut data = dictionary::RLE::with_dictionary(dictionary);
|
let mut data: dictionary::Encoding =
|
||||||
|
if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
|
||||||
|
dictionary::Encoding::Plain(dictionary::Plain::with_dictionary(dictionary))
|
||||||
|
} else {
|
||||||
|
dictionary::Encoding::RLE(dictionary::RLE::with_dictionary(dictionary))
|
||||||
|
};
|
||||||
|
|
||||||
let mut prev = if !arr.is_null(0) {
|
let mut prev = if !arr.is_null(0) {
|
||||||
Some(arr.value(0))
|
Some(arr.value(0))
|
||||||
|
@ -777,7 +809,12 @@ impl StringEncoding {
|
||||||
range,
|
range,
|
||||||
};
|
};
|
||||||
|
|
||||||
Self::RLEDictionary(data)
|
// TODO(edd): consider just storing under the `StringEncoding` a
|
||||||
|
// `Dictionary` variant that would be a `dictionary::Encoding`.
|
||||||
|
match data {
|
||||||
|
dictionary::Encoding::RLE(enc) => Self::RLEDictionary(enc),
|
||||||
|
dictionary::Encoding::Plain(enc) => Self::Dictionary(enc),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// All encoded values for the provided logical row ids.
|
/// All encoded values for the provided logical row ids.
|
||||||
|
@ -786,6 +823,7 @@ impl StringEncoding {
|
||||||
pub fn encoded_values(&self, row_ids: &[u32], dst: Vec<u32>) -> Vec<u32> {
|
pub fn encoded_values(&self, row_ids: &[u32], dst: Vec<u32>) -> Vec<u32> {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => c.encoded_values(row_ids, dst),
|
Self::RLEDictionary(c) => c.encoded_values(row_ids, dst),
|
||||||
|
Self::Dictionary(c) => c.encoded_values(row_ids, dst),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -795,6 +833,7 @@ impl StringEncoding {
|
||||||
pub fn all_encoded_values(&self, dst: Vec<u32>) -> Vec<u32> {
|
pub fn all_encoded_values(&self, dst: Vec<u32>) -> Vec<u32> {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => c.all_encoded_values(dst),
|
Self::RLEDictionary(c) => c.all_encoded_values(dst),
|
||||||
|
Self::Dictionary(c) => c.all_encoded_values(dst),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -877,7 +916,23 @@ impl StringEncoding {
|
||||||
// generates metadata for an encoded column.
|
// generates metadata for an encoded column.
|
||||||
fn meta(data: &Self) -> MetaData<String> {
|
fn meta(data: &Self) -> MetaData<String> {
|
||||||
match data {
|
match data {
|
||||||
StringEncoding::RLEDictionary(data) => {
|
Self::RLEDictionary(data) => {
|
||||||
|
let dictionary = data.dictionary();
|
||||||
|
let range = if !dictionary.is_empty() {
|
||||||
|
let min = data.dictionary()[0].clone();
|
||||||
|
let max = data.dictionary()[data.dictionary().len() - 1].clone();
|
||||||
|
Some((min, max))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
MetaData {
|
||||||
|
size: data.size(),
|
||||||
|
rows: data.num_rows(),
|
||||||
|
range,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Self::Dictionary(data) => {
|
||||||
let dictionary = data.dictionary();
|
let dictionary = data.dictionary();
|
||||||
let range = if !dictionary.is_empty() {
|
let range = if !dictionary.is_empty() {
|
||||||
let min = data.dictionary()[0].clone();
|
let min = data.dictionary()[0].clone();
|
||||||
|
@ -900,7 +955,8 @@ impl StringEncoding {
|
||||||
impl std::fmt::Display for StringEncoding {
|
impl std::fmt::Display for StringEncoding {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
StringEncoding::RLEDictionary(data) => write!(f, "{}", data),
|
Self::RLEDictionary(data) => write!(f, "{}", data),
|
||||||
|
Self::Dictionary(data) => write!(f, "{}", data),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2547,7 +2603,7 @@ mod test {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
meta,
|
meta,
|
||||||
super::MetaData::<String> {
|
super::MetaData::<String> {
|
||||||
size: 0,
|
size: 317,
|
||||||
rows: 4,
|
rows: 4,
|
||||||
range: Some(("hello".to_string(), "world".to_string())),
|
range: Some(("hello".to_string(), "world".to_string())),
|
||||||
}
|
}
|
||||||
|
@ -2577,7 +2633,7 @@ mod test {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
meta,
|
meta,
|
||||||
super::MetaData::<String> {
|
super::MetaData::<String> {
|
||||||
size: 0,
|
size: 301,
|
||||||
rows: 2,
|
rows: 2,
|
||||||
range: Some(("hello".to_string(), "world".to_string())),
|
range: Some(("hello".to_string(), "world".to_string())),
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ use crate::column::{cmp, RowIDs};
|
||||||
/// The encoded id for a NULL value.
|
/// The encoded id for a NULL value.
|
||||||
pub const NULL_ID: u32 = 0;
|
pub const NULL_ID: u32 = 0;
|
||||||
|
|
||||||
enum Encoding {
|
pub enum Encoding {
|
||||||
RLE(RLE),
|
RLE(RLE),
|
||||||
Plain(Plain),
|
Plain(Plain),
|
||||||
}
|
}
|
||||||
|
@ -27,21 +27,28 @@ impl Encoding {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size(&self) -> u64 {
|
pub fn size(&self) -> u64 {
|
||||||
match &self {
|
match &self {
|
||||||
Encoding::RLE(enc) => enc.size(),
|
Encoding::RLE(enc) => enc.size(),
|
||||||
Encoding::Plain(enc) => enc.size(),
|
Encoding::Plain(enc) => enc.size(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push(&mut self, v: String) {
|
pub fn num_rows(&self) -> u32 {
|
||||||
|
match &self {
|
||||||
|
Encoding::RLE(enc) => enc.num_rows(),
|
||||||
|
Encoding::Plain(enc) => enc.num_rows(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn push(&mut self, v: String) {
|
||||||
match self {
|
match self {
|
||||||
Encoding::RLE(ref mut enc) => enc.push(v),
|
Encoding::RLE(ref mut enc) => enc.push(v),
|
||||||
Encoding::Plain(ref mut enc) => enc.push(v),
|
Encoding::Plain(ref mut enc) => enc.push(v),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_none(&mut self) {
|
pub fn push_none(&mut self) {
|
||||||
match self {
|
match self {
|
||||||
Encoding::RLE(ref mut enc) => enc.push_none(),
|
Encoding::RLE(ref mut enc) => enc.push_none(),
|
||||||
Encoding::Plain(ref mut enc) => enc.push_none(),
|
Encoding::Plain(ref mut enc) => enc.push_none(),
|
||||||
|
@ -51,7 +58,7 @@ impl Encoding {
|
||||||
/// Adds additional repetitions of the provided value to the encoded data.
|
/// Adds additional repetitions of the provided value to the encoded data.
|
||||||
/// It is the caller's responsibility to ensure that the dictionary encoded
|
/// It is the caller's responsibility to ensure that the dictionary encoded
|
||||||
/// remains sorted.
|
/// remains sorted.
|
||||||
fn push_additional(&mut self, v: Option<String>, additional: u32) {
|
pub fn push_additional(&mut self, v: Option<String>, additional: u32) {
|
||||||
match self {
|
match self {
|
||||||
Encoding::RLE(ref mut env) => env.push_additional(v, additional),
|
Encoding::RLE(ref mut env) => env.push_additional(v, additional),
|
||||||
Encoding::Plain(ref mut env) => env.push_additional(v, additional),
|
Encoding::Plain(ref mut env) => env.push_additional(v, additional),
|
||||||
|
@ -121,14 +128,10 @@ impl Encoding {
|
||||||
//
|
//
|
||||||
//
|
//
|
||||||
|
|
||||||
fn dictionary(&self) -> &[String] {
|
pub fn dictionary(&self) -> Vec<&String> {
|
||||||
match self {
|
match self {
|
||||||
Encoding::RLE(enc) => enc.dictionary(),
|
Encoding::RLE(enc) => enc.dictionary(),
|
||||||
Encoding::Plain(enc) => {
|
Encoding::Plain(enc) => enc.dictionary(),
|
||||||
todo!() // figure out storing entries.
|
|
||||||
// let v = enc.dictionary();
|
|
||||||
// v.as_slice()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -775,7 +778,7 @@ mod test {
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
enc.dictionary(),
|
enc.dictionary(),
|
||||||
&["east".to_string(), "west".to_string(), "zoo".to_string()],
|
vec![&"east".to_string(), &"west".to_string(), &"zoo".to_string()],
|
||||||
"{}",
|
"{}",
|
||||||
name
|
name
|
||||||
);
|
);
|
||||||
|
|
|
@ -373,7 +373,7 @@ impl Plain {
|
||||||
// TODO(edd): rethink returning `Vec<String>` by looking at if we can store
|
// TODO(edd): rethink returning `Vec<String>` by looking at if we can store
|
||||||
// entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would
|
// entries in a `Vec<String>` rather than a `Vec<Option<String>>`. It would
|
||||||
// then allow us to return a `&[String]` here.
|
// then allow us to return a `&[String]` here.
|
||||||
pub fn dictionary(&self) -> Vec<String> {
|
pub fn dictionary(&self) -> Vec<&String> {
|
||||||
if self.entries.len() == 1 {
|
if self.entries.len() == 1 {
|
||||||
// no non-null entries.
|
// no non-null entries.
|
||||||
return vec![];
|
return vec![];
|
||||||
|
@ -382,8 +382,8 @@ impl Plain {
|
||||||
self.entries
|
self.entries
|
||||||
.iter()
|
.iter()
|
||||||
.skip(1)
|
.skip(1)
|
||||||
.filter_map(|v| v.clone())
|
.filter_map(|v| v.as_ref())
|
||||||
.collect::<Vec<String>>()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the logical value present at the provided row id. Panics if the
|
/// Returns the logical value present at the provided row id. Panics if the
|
||||||
|
|
|
@ -381,8 +381,8 @@ impl RLE {
|
||||||
//
|
//
|
||||||
//
|
//
|
||||||
|
|
||||||
pub fn dictionary(&self) -> &[String] {
|
pub fn dictionary(&self) -> Vec<&String> {
|
||||||
&self.index_entries[1..]
|
self.index_entries.iter().skip(1).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the logical value present at the provided row id.
|
/// Returns the logical value present at the provided row id.
|
||||||
|
|
Loading…
Reference in New Issue