refactor: shuffle string encodings

pull/24376/head
Edd Robinson 2021-05-11 22:10:45 +01:00
parent f86e0641fd
commit 482e4dab86
8 changed files with 68 additions and 68 deletions

View File

@ -3,7 +3,7 @@ use rand::distributions::Alphanumeric;
use rand::prelude::*;
use rand::Rng;
use read_buffer::benchmarks::{dictionary, Operator, RowIDs};
use read_buffer::benchmarks::{string, Operator, RowIDs};
const ROWS: [usize; 3] = [100_000, 1_000_000, 10_000_000];
const LOCATIONS: [Location; 3] = [Location::Start, Location::Middle, Location::End];
@ -17,7 +17,7 @@ enum Location {
}
enum EncType {
RleDictionary,
Rle,
Dictionary,
}
@ -25,8 +25,8 @@ fn select(c: &mut Criterion) {
let mut rng = rand::thread_rng();
benchmark_select(
c,
"encoding_rle_select",
EncType::RleDictionary,
"select",
EncType::Rle,
&ROWS,
&LOCATIONS,
&ROWS_MATCHING_VALUE,
@ -35,7 +35,7 @@ fn select(c: &mut Criterion) {
benchmark_select(
c,
"encoding_dict_select",
"_select",
EncType::Dictionary,
&ROWS,
&LOCATIONS,
@ -82,22 +82,22 @@ fn benchmark_select(
};
group.throughput(Throughput::Elements(num_rows as u64));
let encoding: dictionary::Encoding = match enc_type {
EncType::RleDictionary => {
let mut encoding = dictionary::RLE::with_dictionary(col_dict);
let encoding: string::Encoding = match enc_type {
EncType::Rle => {
let mut encoding = string::RLE::with_dictionary(col_dict);
// Could be faster but it's just the bench setup...
for v in &col_data {
encoding.push(v.to_owned());
}
dictionary::Encoding::RLE(encoding)
string::Encoding::RLE(encoding)
}
EncType::Dictionary => {
let mut encoding = dictionary::Plain::with_dictionary(col_dict);
let mut encoding = string::Dictionary::with_dictionary(col_dict);
// Could be faster but it's just the bench setup...
for v in &col_data {
encoding.push(v.to_owned());
}
dictionary::Encoding::Plain(encoding)
string::Encoding::Plain(encoding)
}
};

View File

@ -15,7 +15,7 @@ use arrow::array::Array;
use crate::schema::LogicalDataType;
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, Values};
use boolean::BooleanEncoding;
use encoding::{bool, dictionary, fixed_null};
use encoding::{bool, fixed_null, string::NULL_ID};
use float::FloatEncoding;
use integer::IntegerEncoding;
use string::StringEncoding;
@ -1659,7 +1659,7 @@ mod test {
let col = Column::from(&input[..]);
assert_eq!(
col.encoded_values(&[0, 1, 2, 3, 4], EncodedValues::U32(vec![])),
EncodedValues::U32(vec![1, dictionary::NULL_ID, 2, 1, 2])
EncodedValues::U32(vec![1, NULL_ID, 2, 1, 2])
);
let res = col.encoded_values(&[2, 3], EncodedValues::U32(Vec::with_capacity(100)));
@ -1697,7 +1697,7 @@ mod test {
let col = Column::from(&input[..]);
assert_eq!(
col.all_encoded_values(EncodedValues::U32(vec![])),
EncodedValues::U32(vec![1, dictionary::NULL_ID, 2, 1, 2])
EncodedValues::U32(vec![1, NULL_ID, 2, 1, 2])
);
// timestamp column

View File

@ -1,4 +1,4 @@
pub mod bool;
pub mod dictionary;
pub mod fixed;
pub mod fixed_null;
pub mod string;

View File

@ -1,4 +1,4 @@
pub mod plain;
pub mod dictionary;
pub mod rle;
use std::collections::BTreeSet;
@ -6,8 +6,8 @@ use std::collections::BTreeSet;
use either::Either;
// This makes the encoding types available directly under this module.
pub use self::plain::Plain;
pub use self::rle::RLE;
pub use dictionary::Dictionary;
pub use rle::RLE;
use crate::column::{cmp, RowIDs};
@ -17,7 +17,7 @@ pub const NULL_ID: u32 = 0;
#[allow(clippy::upper_case_acronyms)] // this looks weird as `Rle`
pub enum Encoding {
RLE(RLE),
Plain(Plain),
Plain(Dictionary),
}
impl Encoding {
@ -311,7 +311,7 @@ mod test {
fn push() {
let encodings = vec![
Encoding::RLE(RLE::from(vec!["hello", "hello", "hello", "hello"])),
Encoding::Plain(Plain::from(vec!["hello", "hello", "hello", "hello"])),
Encoding::Plain(Dictionary::from(vec!["hello", "hello", "hello", "hello"])),
];
for enc in encodings {
@ -375,7 +375,7 @@ mod test {
let encodings = vec![
Encoding::RLE(RLE::with_dictionary(dictionary.clone())),
Encoding::Plain(Plain::with_dictionary(dictionary)),
Encoding::Plain(Dictionary::with_dictionary(dictionary)),
];
for enc in encodings {
@ -414,7 +414,7 @@ mod test {
fn row_ids_filter_equal() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -461,7 +461,7 @@ mod test {
fn row_ids_filter_equal_no_null() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -482,7 +482,7 @@ mod test {
fn row_ids_filter_cmp() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -620,7 +620,7 @@ mod test {
fn row_ids_filter_cmp_single() {
let mut encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings.iter_mut() {
@ -667,7 +667,7 @@ mod test {
let mut encodings = vec![
Encoding::RLE(RLE::with_dictionary(dictionary.clone())),
Encoding::Plain(Plain::with_dictionary(dictionary)),
Encoding::Plain(Dictionary::with_dictionary(dictionary)),
];
for enc in encodings.iter_mut() {
@ -713,7 +713,7 @@ mod test {
fn row_ids_null() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -742,7 +742,7 @@ mod test {
fn group_row_ids() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -792,7 +792,7 @@ mod test {
fn dictionary() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -821,7 +821,7 @@ mod test {
fn value() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -849,7 +849,7 @@ mod test {
enc.push("b".to_string());
enc.value(100);
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push("b".to_string());
enc.value(100);
}
@ -858,7 +858,7 @@ mod test {
fn values() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -894,7 +894,7 @@ mod test {
fn all_values() {
let encodings = vec![
Encoding::RLE(RLE::from(vec!["hello", "zoo"])),
Encoding::Plain(Plain::from(vec!["hello", "zoo"])),
Encoding::Plain(Dictionary::from(vec!["hello", "zoo"])),
];
for enc in encodings {
@ -919,7 +919,7 @@ mod test {
fn encoded_values() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -950,7 +950,7 @@ mod test {
fn all_encoded_values() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -975,7 +975,7 @@ mod test {
fn min_max() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -1020,7 +1020,7 @@ mod test {
fn distinct_values() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {
@ -1081,7 +1081,7 @@ mod test {
fn has_other_non_null_values() {
let encodings = vec![
Encoding::RLE(RLE::default()),
Encoding::Plain(Plain::default()),
Encoding::Plain(Dictionary::default()),
];
for enc in encodings {

View File

@ -13,11 +13,11 @@ use std::mem::size_of;
use arrow::array::{Array, StringArray};
use crate::column::dictionary::NULL_ID;
use super::NULL_ID;
use crate::column::{cmp, RowIDs};
pub const ENCODING_NAME: &str = "DICT";
pub struct Plain {
pub struct Dictionary {
// The sorted set of logical values that are contained within this column
// encoding. Entries always contains None, which is used to reserve the
// encoded id of `0` for NULL values.
@ -32,9 +32,9 @@ pub struct Plain {
contains_null: bool,
}
// The default initialisation of an Plain involves reserving the first id/index
// `0`, which is the encoded representation of the NULL value.
impl Default for Plain {
// The default initialisation of this Dictionary involves reserving the first
// id/index `0`, which is the encoded representation of the NULL value.
impl Default for Dictionary {
fn default() -> Self {
// for this to make sense NULL_ID must be `0`.
assert_eq!(NULL_ID, 0);
@ -46,8 +46,8 @@ impl Default for Plain {
}
}
impl Plain {
/// Initialises an Plain encoding with a set of logical values.
impl Dictionary {
/// Initialises a Dictionary encoding with a set of logical values.
/// Creating an encoding using `with_dictionary` ensures that the dictionary
/// is in the correct order, and will allow values to be inserted with any
/// value in the dictionary.
@ -746,7 +746,7 @@ impl Plain {
}
}
impl<'a> From<Vec<&str>> for Plain {
impl<'a> From<Vec<&str>> for Dictionary {
fn from(vec: Vec<&str>) -> Self {
let mut enc = Self::default();
for v in vec {
@ -756,7 +756,7 @@ impl<'a> From<Vec<&str>> for Plain {
}
}
impl<'a> From<Vec<String>> for Plain {
impl<'a> From<Vec<String>> for Dictionary {
fn from(vec: Vec<String>) -> Self {
let mut enc = Self::default();
for v in vec {
@ -766,7 +766,7 @@ impl<'a> From<Vec<String>> for Plain {
}
}
impl<'a> From<Vec<Option<&str>>> for Plain {
impl<'a> From<Vec<Option<&str>>> for Dictionary {
fn from(vec: Vec<Option<&str>>) -> Self {
let mut drle = Self::default();
for v in vec {
@ -779,7 +779,7 @@ impl<'a> From<Vec<Option<&str>>> for Plain {
}
}
impl<'a> From<Vec<Option<String>>> for Plain {
impl<'a> From<Vec<Option<String>>> for Dictionary {
fn from(vec: Vec<Option<String>>) -> Self {
let mut drle = Self::default();
for v in vec {
@ -792,7 +792,7 @@ impl<'a> From<Vec<Option<String>>> for Plain {
}
}
impl<'a> From<StringArray> for Plain {
impl<'a> From<StringArray> for Dictionary {
fn from(arr: StringArray) -> Self {
let mut drle = Self::default();
for i in 0..arr.len() {
@ -806,7 +806,7 @@ impl<'a> From<StringArray> for Plain {
}
}
impl std::fmt::Display for Plain {
impl std::fmt::Display for Dictionary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
@ -829,7 +829,7 @@ mod test {
dictionary.insert("hello".to_string());
dictionary.insert("world".to_string());
let enc = Plain::with_dictionary(dictionary);
let enc = Dictionary::with_dictionary(dictionary);
assert_eq!(
enc.entries,
vec![None, Some("hello".to_string()), Some("world".to_string()),]
@ -838,7 +838,7 @@ mod test {
#[test]
fn size() {
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push_additional(Some("east".to_string()), 3);
enc.push_additional(Some("north".to_string()), 1);
enc.push_additional(Some("east".to_string()), 5);
@ -878,7 +878,7 @@ mod test {
#[test]
fn null_count() {
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push_additional(Some("east".to_string()), 3);
assert_eq!(enc.null_count(), 0);
@ -896,14 +896,14 @@ mod test {
#[test]
#[should_panic]
fn push_wrong_order() {
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push("b".to_string());
enc.push("a".to_string());
}
#[test]
fn has_non_null_value() {
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push_none();
enc.push_none();
@ -917,7 +917,7 @@ mod test {
#[test]
fn has_any_non_null_value() {
let mut enc = Plain::default();
let mut enc = Dictionary::default();
enc.push_none();
enc.push_none();

View File

@ -7,7 +7,7 @@ use croaring::Bitmap;
use arrow::array::{Array, StringArray};
use crate::column::dictionary::NULL_ID;
use super::NULL_ID;
use crate::column::{cmp, RowIDs};
pub const ENCODING_NAME: &str = "RLE";

View File

@ -4,8 +4,8 @@ use arrow::{self, array::Array};
use either::Either;
use super::cmp;
use super::encoding::dictionary::{plain, rle};
use super::encoding::dictionary::{Encoding, Plain, RLE};
use super::encoding::string::{dictionary, rle};
use super::encoding::string::{Dictionary, Encoding, RLE};
use crate::column::{RowIDs, Statistics, Value, Values};
// Edd's totally made up magic constant. This determines whether we would use
@ -20,7 +20,7 @@ pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 100_000;
pub enum StringEncoding {
RleDictionary(RLE),
Dictionary(Plain),
Dictionary(Dictionary),
// TODO - simple array encoding, e.g., via Arrow String array.
}
@ -64,7 +64,7 @@ impl StringEncoding {
Statistics {
enc_type: match self {
Self::RleDictionary(_) => rle::ENCODING_NAME,
Self::Dictionary(_) => plain::ENCODING_NAME,
Self::Dictionary(_) => dictionary::ENCODING_NAME,
},
log_data_type: "string",
values: self.num_rows(),
@ -287,7 +287,7 @@ impl From<arrow::array::StringArray> for StringEncoding {
}
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
Encoding::Plain(Plain::with_dictionary(dictionary))
Encoding::Plain(Dictionary::with_dictionary(dictionary))
} else {
Encoding::RLE(RLE::with_dictionary(dictionary))
};
@ -352,7 +352,7 @@ impl From<arrow::array::DictionaryArray<arrow::datatypes::Int32Type>> for String
let dictionary: BTreeSet<_> = values.iter().flatten().map(Into::into).collect();
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
Encoding::Plain(Plain::with_dictionary(dictionary))
Encoding::Plain(Dictionary::with_dictionary(dictionary))
} else {
Encoding::RLE(RLE::with_dictionary(dictionary))
};
@ -407,7 +407,7 @@ impl From<&[Option<&str>]> for StringEncoding {
}
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
Encoding::Plain(Plain::with_dictionary(dictionary))
Encoding::Plain(Dictionary::with_dictionary(dictionary))
} else {
Encoding::RLE(RLE::with_dictionary(dictionary))
};
@ -448,7 +448,7 @@ impl From<&[&str]> for StringEncoding {
let dictionary = arr.iter().map(|x| x.to_string()).collect::<BTreeSet<_>>();
let mut data: Encoding = if dictionary.len() > TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT {
Encoding::Plain(Plain::with_dictionary(dictionary))
Encoding::Plain(Dictionary::with_dictionary(dictionary))
} else {
Encoding::RLE(RLE::with_dictionary(dictionary))
};

View File

@ -22,8 +22,8 @@ pub use table::ReadFilterResults;
/// It should not be imported into any non-testing or benchmarking crates.
pub mod benchmarks {
pub use crate::column::{
cmp::Operator, encoding::dictionary, encoding::fixed::Fixed,
encoding::fixed_null::FixedNull, Column, RowIDs,
cmp::Operator, encoding::fixed::Fixed, encoding::fixed_null::FixedNull, encoding::string,
Column, RowIDs,
};
pub use crate::row_group::{ColumnType, RowGroup};