refactor: shuffle string encodings
parent
f86e0641fd
commit
482e4dab86
|
@ -3,7 +3,7 @@ use rand::distributions::Alphanumeric;
|
|||
use rand::prelude::*;
|
||||
use rand::Rng;
|
||||
|
||||
use read_buffer::benchmarks::{dictionary, Operator, RowIDs};
|
||||
use read_buffer::benchmarks::{string, Operator, RowIDs};
|
||||
|
||||
const ROWS: [usize; 3] = [100_000, 1_000_000, 10_000_000];
|
||||
const LOCATIONS: [Location; 3] = [Location::Start, Location::Middle, Location::End];
|
||||
|
@ -17,7 +17,7 @@ enum Location {
|
|||
}
|
||||
|
||||
enum EncType {
|
||||
RleDictionary,
|
||||
Rle,
|
||||
Dictionary,
|
||||
}
|
||||
|
||||
|
@ -25,8 +25,8 @@ fn select(c: &mut Criterion) {
|
|||
let mut rng = rand::thread_rng();
|
||||
benchmark_select(
|
||||
c,
|
||||
"encoding_rle_select",
|
||||
EncType::RleDictionary,
|
||||
"select",
|
||||
EncType::Rle,
|
||||
&ROWS,
|
||||
&LOCATIONS,
|
||||
&ROWS_MATCHING_VALUE,
|
||||
|
@ -35,7 +35,7 @@ fn select(c: &mut Criterion) {
|
|||
|
||||
benchmark_select(
|
||||
c,
|
||||
"encoding_dict_select",
|
||||
"_select",
|
||||
EncType::Dictionary,
|
||||
&ROWS,
|
||||
&LOCATIONS,
|
||||
|
@ -82,22 +82,22 @@ fn benchmark_select(
|
|||
};
|
||||
|
||||
group.throughput(Throughput::Elements(num_rows as u64));
|
||||
let encoding: dictionary::Encoding = match enc_type {
|
||||
EncType::RleDictionary => {
|
||||
let mut encoding = dictionary::RLE::with_dictionary(col_dict);
|
||||
let encoding: string::Encoding = match enc_type {
|
||||
EncType::Rle => {
|
||||
let mut encoding = string::RLE::with_dictionary(col_dict);
|
||||
// Could be faster but it's just the bench setup...
|
||||
for v in &col_data {
|
||||
encoding.push(v.to_owned());
|
||||
}
|
||||
dictionary::Encoding::RLE(encoding)
|
||||
string::Encoding::RLE(encoding)
|
||||
}
|
||||
EncType::Dictionary => {
|
||||
let mut encoding = dictionary::Plain::with_dictionary(col_dict);
|
||||
let mut encoding = string::Dictionary::with_dictionary(col_dict);
|
||||
// Could be faster but it's just the bench setup...
|
||||
for v in &col_data {
|
||||
encoding.push(v.to_owned());
|
||||
}
|
||||
dictionary::Encoding::Plain(encoding)
|
||||
string::Encoding::Plain(encoding)
|
||||
}
|
||||
};
|
||||
|
|
@ -15,7 +15,7 @@ use arrow::array::Array;
|
|||
use crate::schema::LogicalDataType;
|
||||
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, Values};
|
||||
use boolean::BooleanEncoding;
|
||||
use encoding::{bool, dictionary, fixed_null};
|
||||
use encoding::{bool, fixed_null, string::NULL_ID};
|
||||
use float::FloatEncoding;
|
||||
use integer::IntegerEncoding;
|
||||
use string::StringEncoding;
|
||||
|
@ -1659,7 +1659,7 @@ mod test {
|
|||
let col = Column::from(&input[..]);
|
||||
assert_eq!(
|
||||
col.encoded_values(&[0, 1, 2, 3, 4], EncodedValues::U32(vec![])),
|
||||
EncodedValues::U32(vec![1, dictionary::NULL_ID, 2, 1, 2])
|
||||
EncodedValues::U32(vec![1, NULL_ID, 2, 1, 2])
|
||||
);
|
||||
|
||||
let res = col.encoded_values(&[2, 3], EncodedValues::U32(Vec::with_capacity(100)));
|
||||
|
@ -1697,7 +1697,7 @@ mod test {
|
|||
let col = Column::from(&input[..]);
|
||||
assert_eq!(
|
||||
col.all_encoded_values(EncodedValues::U32(vec![])),
|
||||
EncodedValues::U32(vec![1, dictionary::NULL_ID, 2, 1, 2])
|
||||
EncodedValues::U32(vec![1, NULL_ID, 2, 1, 2])
|
||||
);
|
||||
|
||||
// timestamp column
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
pub mod bool;
|
||||
pub mod dictionary;
|
||||
pub mod fixed;
|
||||
pub mod fixed_null;
|
||||
pub mod string;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
pub mod plain;
|
||||
pub mod dictionary;
|
||||
pub mod rle;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
@ -6,8 +6,8 @@ use std::collections::BTreeSet;
|
|||
use either::Either;
|
||||
|
||||
// This makes the encoding types available under the string module.
|
||||
pub use self::plain::Plain;
|
||||
pub use self::rle::RLE;
|
||||
pub use dictionary::Dictionary;
|
||||
pub use rle::RLE;
|
||||
|
||||
use crate::column::{cmp, RowIDs};
|
||||
|
||||
|
@ -17,7 +17,7 @@ pub const NULL_ID: u32 = 0;
|
|||
#[allow(clippy::upper_case_acronyms)] // this looks weird as `Rle`
|
||||
pub enum Encoding {
|
||||
RLE(RLE),
|
||||
Plain(Plain),
|
||||
Plain(Dictionary),
|
||||
}
|
||||
|
||||
impl Encoding {
|
||||
|
@ -311,7 +311,7 @@ mod test {
|
|||
fn push() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::from(vec!["hello", "hello", "hello", "hello"])),
|
||||
Encoding::Plain(Plain::from(vec!["hello", "hello", "hello", "hello"])),
|
||||
Encoding::Plain(Dictionary::from(vec!["hello", "hello", "hello", "hello"])),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -375,7 +375,7 @@ mod test {
|
|||
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary.clone())),
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary)),
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary)),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -414,7 +414,7 @@ mod test {
|
|||
fn row_ids_filter_equal() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -461,7 +461,7 @@ mod test {
|
|||
fn row_ids_filter_equal_no_null() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -482,7 +482,7 @@ mod test {
|
|||
fn row_ids_filter_cmp() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -620,7 +620,7 @@ mod test {
|
|||
fn row_ids_filter_cmp_single() {
|
||||
let mut encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings.iter_mut() {
|
||||
|
@ -667,7 +667,7 @@ mod test {
|
|||
|
||||
let mut encodings = vec![
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary.clone())),
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary)),
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary)),
|
||||
];
|
||||
|
||||
for enc in encodings.iter_mut() {
|
||||
|
@ -713,7 +713,7 @@ mod test {
|
|||
fn row_ids_null() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -742,7 +742,7 @@ mod test {
|
|||
fn group_row_ids() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -792,7 +792,7 @@ mod test {
|
|||
fn dictionary() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -821,7 +821,7 @@ mod test {
|
|||
fn value() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -849,7 +849,7 @@ mod test {
|
|||
enc.push("b".to_string());
|
||||
enc.value(100);
|
||||
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push("b".to_string());
|
||||
enc.value(100);
|
||||
}
|
||||
|
@ -858,7 +858,7 @@ mod test {
|
|||
fn values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -894,7 +894,7 @@ mod test {
|
|||
fn all_values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::from(vec!["hello", "zoo"])),
|
||||
Encoding::Plain(Plain::from(vec!["hello", "zoo"])),
|
||||
Encoding::Plain(Dictionary::from(vec!["hello", "zoo"])),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -919,7 +919,7 @@ mod test {
|
|||
fn encoded_values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -950,7 +950,7 @@ mod test {
|
|||
fn all_encoded_values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -975,7 +975,7 @@ mod test {
|
|||
fn min_max() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -1020,7 +1020,7 @@ mod test {
|
|||
fn distinct_values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
||||
|
@ -1081,7 +1081,7 @@ mod test {
|
|||
fn has_other_non_null_values() {
|
||||
let encodings = vec![
|
||||
Encoding::RLE(RLE::default()),
|
||||
Encoding::Plain(Plain::default()),
|
||||
Encoding::Plain(Dictionary::default()),
|
||||
];
|
||||
|
||||
for enc in encodings {
|
|
@ -13,11 +13,11 @@ use std::mem::size_of;
|
|||
|
||||
use arrow::array::{Array, StringArray};
|
||||
|
||||
use crate::column::dictionary::NULL_ID;
|
||||
use super::NULL_ID;
|
||||
use crate::column::{cmp, RowIDs};
|
||||
|
||||
pub const ENCODING_NAME: &str = "DICT";
|
||||
pub struct Plain {
|
||||
pub struct Dictionary {
|
||||
// The sorted set of logical values that are contained within this column
|
||||
// encoding. Entries always contains None, which is used to reserve the
|
||||
// encoded id of `0` for NULL values.
|
||||
|
@ -32,9 +32,9 @@ pub struct Plain {
|
|||
contains_null: bool,
|
||||
}
|
||||
|
||||
// The default initialisation of an Plain involves reserving the first id/index
|
||||
// `0`, which is the encoded representation of the NULL value.
|
||||
impl Default for Plain {
|
||||
// The default initialisation of this Dictionary involves reserving the first
|
||||
// id/index `0`, which is the encoded representation of the NULL value.
|
||||
impl Default for Dictionary {
|
||||
fn default() -> Self {
|
||||
// for this to make sense NULL_ID must be `0`.
|
||||
assert_eq!(NULL_ID, 0);
|
||||
|
@ -46,8 +46,8 @@ impl Default for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl Plain {
|
||||
/// Initialises an Plain encoding with a set of logical values.
|
||||
impl Dictionary {
|
||||
/// Initialises a Dictionary encoding with a set of logical values.
|
||||
/// Creating an encoding using `with_dictionary` ensures that the dictionary
|
||||
/// is in the correct order, and will allow values to be inserted with any
|
||||
/// value in the dictionary.
|
||||
|
@ -746,7 +746,7 @@ impl Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<Vec<&str>> for Plain {
|
||||
impl<'a> From<Vec<&str>> for Dictionary {
|
||||
fn from(vec: Vec<&str>) -> Self {
|
||||
let mut enc = Self::default();
|
||||
for v in vec {
|
||||
|
@ -756,7 +756,7 @@ impl<'a> From<Vec<&str>> for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<Vec<String>> for Plain {
|
||||
impl<'a> From<Vec<String>> for Dictionary {
|
||||
fn from(vec: Vec<String>) -> Self {
|
||||
let mut enc = Self::default();
|
||||
for v in vec {
|
||||
|
@ -766,7 +766,7 @@ impl<'a> From<Vec<String>> for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<Vec<Option<&str>>> for Plain {
|
||||
impl<'a> From<Vec<Option<&str>>> for Dictionary {
|
||||
fn from(vec: Vec<Option<&str>>) -> Self {
|
||||
let mut drle = Self::default();
|
||||
for v in vec {
|
||||
|
@ -779,7 +779,7 @@ impl<'a> From<Vec<Option<&str>>> for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<Vec<Option<String>>> for Plain {
|
||||
impl<'a> From<Vec<Option<String>>> for Dictionary {
|
||||
fn from(vec: Vec<Option<String>>) -> Self {
|
||||
let mut drle = Self::default();
|
||||
for v in vec {
|
||||
|
@ -792,7 +792,7 @@ impl<'a> From<Vec<Option<String>>> for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<StringArray> for Plain {
|
||||
impl<'a> From<StringArray> for Dictionary {
|
||||
fn from(arr: StringArray) -> Self {
|
||||
let mut drle = Self::default();
|
||||
for i in 0..arr.len() {
|
||||
|
@ -806,7 +806,7 @@ impl<'a> From<StringArray> for Plain {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Plain {
|
||||
impl std::fmt::Display for Dictionary {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
|
@ -829,7 +829,7 @@ mod test {
|
|||
dictionary.insert("hello".to_string());
|
||||
dictionary.insert("world".to_string());
|
||||
|
||||
let enc = Plain::with_dictionary(dictionary);
|
||||
let enc = Dictionary::with_dictionary(dictionary);
|
||||
assert_eq!(
|
||||
enc.entries,
|
||||
vec![None, Some("hello".to_string()), Some("world".to_string()),]
|
||||
|
@ -838,7 +838,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn size() {
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push_additional(Some("east".to_string()), 3);
|
||||
enc.push_additional(Some("north".to_string()), 1);
|
||||
enc.push_additional(Some("east".to_string()), 5);
|
||||
|
@ -878,7 +878,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn null_count() {
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push_additional(Some("east".to_string()), 3);
|
||||
assert_eq!(enc.null_count(), 0);
|
||||
|
||||
|
@ -896,14 +896,14 @@ mod test {
|
|||
#[test]
|
||||
#[should_panic]
|
||||
fn push_wrong_order() {
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push("b".to_string());
|
||||
enc.push("a".to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn has_non_null_value() {
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push_none();
|
||||
enc.push_none();
|
||||
|
||||
|
@ -917,7 +917,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn has_any_non_null_value() {
|
||||
let mut enc = Plain::default();
|
||||
let mut enc = Dictionary::default();
|
||||
enc.push_none();
|
||||
enc.push_none();
|
||||
|
|
@ -7,7 +7,7 @@ use croaring::Bitmap;
|
|||
|
||||
use arrow::array::{Array, StringArray};
|
||||
|
||||
use crate::column::dictionary::NULL_ID;
|
||||
use super::NULL_ID;
|
||||
use crate::column::{cmp, RowIDs};
|
||||
|
||||
pub const ENCODING_NAME: &str = "RLE";
|
|
@ -4,8 +4,8 @@ use arrow::{self, array::Array};
|
|||
use either::Either;
|
||||
|
||||
use super::cmp;
|
||||
use super::encoding::dictionary::{plain, rle};
|
||||
use super::encoding::dictionary::{Encoding, Plain, RLE};
|
||||
use super::encoding::string::{dictionary, rle};
|
||||
use super::encoding::string::{Dictionary, Encoding, RLE};
|
||||
use crate::column::{RowIDs, Statistics, Value, Values};
|
||||
|
||||
// Edd's totally made up magic constant. This determines whether we would use
|
||||
|
@ -20,7 +20,7 @@ pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 100_000;
|
|||
|
||||
pub enum StringEncoding {
|
||||
RleDictionary(RLE),
|
||||
Dictionary(Plain),
|
||||
Dictionary(Dictionary),
|
||||
// TODO - simple array encoding, e.g., via Arrow String array.
|
||||
}
|
||||
|
||||
|
@ -64,7 +64,7 @@ impl StringEncoding {
|
|||
Statistics {
|
||||
enc_type: match self {
|
||||
Self::RleDictionary(_) => rle::ENCODING_NAME,
|
||||
Self::Dictionary(_) => plain::ENCODING_NAME,
|
||||
Self::Dictionary(_) => dictionary::ENCODING_NAME,
|
||||
},
|
||||
log_data_type: "string",
|
||||
values: self.num_rows(),
|
||||
|
@ -287,7 +287,7 @@ impl From<arrow::array::StringArray> for StringEncoding {
|
|||
}
|
||||
|
||||
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary))
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary))
|
||||
} else {
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary))
|
||||
};
|
||||
|
@ -352,7 +352,7 @@ impl From<arrow::array::DictionaryArray<arrow::datatypes::Int32Type>> for String
|
|||
let dictionary: BTreeSet<_> = values.iter().flatten().map(Into::into).collect();
|
||||
|
||||
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary))
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary))
|
||||
} else {
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary))
|
||||
};
|
||||
|
@ -407,7 +407,7 @@ impl From<&[Option<&str>]> for StringEncoding {
|
|||
}
|
||||
|
||||
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary))
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary))
|
||||
} else {
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary))
|
||||
};
|
||||
|
@ -448,7 +448,7 @@ impl From<&[&str]> for StringEncoding {
|
|||
let dictionary = arr.iter().map(|x| x.to_string()).collect::<BTreeSet<_>>();
|
||||
|
||||
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
|
||||
Encoding::Plain(Plain::with_dictionary(dictionary))
|
||||
Encoding::Plain(Dictionary::with_dictionary(dictionary))
|
||||
} else {
|
||||
Encoding::RLE(RLE::with_dictionary(dictionary))
|
||||
};
|
||||
|
|
|
@ -22,8 +22,8 @@ pub use table::ReadFilterResults;
|
|||
/// It should not be imported into any non-testing or benchmarking crates.
|
||||
pub mod benchmarks {
|
||||
pub use crate::column::{
|
||||
cmp::Operator, encoding::dictionary, encoding::fixed::Fixed,
|
||||
encoding::fixed_null::FixedNull, Column, RowIDs,
|
||||
cmp::Operator, encoding::fixed::Fixed, encoding::fixed_null::FixedNull, encoding::string,
|
||||
Column, RowIDs,
|
||||
};
|
||||
|
||||
pub use crate::row_group::{ColumnType, RowGroup};
|
||||
|
|
Loading…
Reference in New Issue