feat: implement push on plain dict encoding

pull/24376/head
Edd Robinson 2020-11-11 13:09:36 +00:00
parent d8f382e5b7
commit a6627aa5db
3 changed files with 501 additions and 20 deletions

View File

@ -20,6 +20,13 @@ enum Encoding {
}
impl Encoding {
fn debug_name(&self) -> &'static str {
match &self {
Encoding::RLE(_) => "RLE encoder",
Encoding::Plain(_) => "plain encoder",
}
}
fn size(&self) -> u64 {
match &self {
Encoding::RLE(enc) => enc.size(),
@ -312,12 +319,23 @@ mod test {
}
#[test]
fn rle_push() {
let mut enc = Encoding::RLE(RLE::from(vec!["hello", "hello", "hello", "hello"]));
fn push() {
let encodings = vec![
Encoding::RLE(RLE::from(vec!["hello", "hello", "hello", "hello"])),
Encoding::Plain(Plain::from(vec!["hello", "hello", "hello", "hello"])),
];
for enc in encodings {
_push(enc);
}
}
fn _push(mut enc: Encoding) {
enc.push_additional(Some("hello".to_string()), 1);
enc.push_additional(None, 3);
enc.push("world".to_string());
let name = enc.debug_name();
assert_eq!(
enc.all_values(vec![]),
[
@ -330,7 +348,9 @@ mod test {
None,
None,
Some(&"world".to_string()),
]
],
"{}",
name
);
enc.push_additional(Some("zoo".to_string()), 3);
@ -351,26 +371,42 @@ mod test {
Some(&"zoo".to_string()),
Some(&"zoo".to_string()),
None,
]
],
"{}",
name
);
}
// tests a defect I discovered.
#[test]
fn push_additional_first_run_length() {
let arr = vec!["world".to_string(), "hello".to_string()];
let dictionary = vec!["world".to_string(), "hello".to_string()]
.into_iter()
.collect::<BTreeSet<String>>();
let encodings = vec![
Encoding::RLE(RLE::with_dictionary(dictionary.clone())),
Encoding::Plain(Plain::with_dictionary(dictionary)),
];
for enc in encodings {
_push_additional_first_run_length(enc);
}
}
fn _push_additional_first_run_length(mut enc: Encoding) {
let name = enc.debug_name();
let mut enc = Encoding::RLE(RLE::with_dictionary(
arr.into_iter().collect::<BTreeSet<String>>(),
));
enc.push_additional(Some("world".to_string()), 1);
enc.push_additional(Some("hello".to_string()), 1);
assert_eq!(
enc.all_values(vec![]),
vec![Some(&"world".to_string()), Some(&"hello".to_string())]
vec![Some(&"world".to_string()), Some(&"hello".to_string())],
"{}",
name
);
assert_eq!(enc.all_encoded_values(vec![]), vec![2, 1]);
assert_eq!(enc.all_encoded_values(vec![]), vec![2, 1], "{}", name);
enc = Encoding::RLE(RLE::default());
enc.push_additional(Some("hello".to_string()), 1);
@ -378,17 +414,11 @@ mod test {
assert_eq!(
enc.all_values(vec![]),
vec![Some(&"hello".to_string()), Some(&"world".to_string())]
vec![Some(&"hello".to_string()), Some(&"world".to_string())],
"{}",
name
);
assert_eq!(enc.all_encoded_values(vec![]), vec![1, 2]);
}
#[test]
#[should_panic]
fn rle_push_wrong_order() {
let mut enc = Encoding::RLE(RLE::default());
enc.push("b".to_string());
enc.push("a".to_string());
assert_eq!(enc.all_encoded_values(vec![]), vec![1, 2], "{}", name);
}
#[test]

View File

@ -0,0 +1,443 @@
use std::collections::{BTreeMap, BTreeSet};
use std::convert::From;
use croaring::Bitmap;
use arrow_deps::arrow::array::{Array, StringArray};
use crate::column::dictionary::NULL_ID;
use crate::column::{cmp, RowIDs};
pub struct Plain {
// The sorted set of logical values that are contained within this column
// encoding. Entries always contains None, which is used to reserve the
// encoded id of `0` for NULL values.
entries: Vec<Option<String>>,
// A vector of encoded ids used to represent logical values within the
// column encoding.
encoded_data: Vec<u32>,
// marker indicating if the encoding contains a NULL value in one or more
// rows.
contains_null: bool,
}
// The default initialisation of an Plain involves reserving the first id/index 0
// for the NULL value.
impl Default for Plain {
fn default() -> Self {
Self {
entries: vec![None],
encoded_data: vec![],
contains_null: false,
}
}
}
impl Plain {
/// Initialises an Plain encoding with a set of logical values.
/// Creating an encoding using `with_dictionary` ensures that the dictionary
/// is in the correct order, and will allow values to be inserted with any
/// value in the dictionary.
///
/// Callers are not required to provide a logical NULL value as part of the
/// dictionary. The encoding already reserves a representation for that.
pub fn with_dictionary(dictionary: BTreeSet<String>) -> Self {
let mut _self = Self::default();
_self.entries.extend(dictionary.into_iter().map(Some));
_self
}
/// A reasonable estimation of the on-heap size this encoding takes up.
pub fn size(&self) -> u64 {
todo!()
}
/// Adds the provided string value to the encoded data. It is the caller's
/// responsibility to ensure that the dictionary encoded remains sorted.
pub fn push(&mut self, v: String) {
self.push_additional(Some(v), 1);
}
/// Adds a NULL value to the encoded data. It is the caller's
/// responsibility to ensure that the dictionary encoded remains sorted.
pub fn push_none(&mut self) {
self.push_encoded_values(NULL_ID, 1);
}
/// Adds additional repetitions of the provided value to the encoded data.
/// It is the caller's responsibility to ensure that the dictionary remains
/// sorted. `push_additional` will panic if that invariant is broken.
pub fn push_additional(&mut self, v: Option<String>, additional: u32) {
if v.is_none() {
self.push_encoded_values(NULL_ID, additional);
return;
}
match self.encoded_id(&v) {
Ok(id) => self.push_encoded_values(id, additional),
Err(idx) => {
// check this value can be inserted into the dictionary whilst
// maintaining order.
assert_eq!(idx, self.entries.len() as u32);
self.entries.push(v);
self.push_encoded_values(idx, additional);
}
}
}
// Preferred to add values to the column. `id` is the encoded
// representation of a logical string value.
fn push_encoded_values(&mut self, id: u32, additional: u32) {
self.encoded_data
.extend(std::iter::repeat(id).take(additional as usize));
}
// Preferred way to lookup the encoded id for a given logical string value.
// If the value exists then the `Ok` variant with the encoded id is returned,
// otherwise the `Err` variant is returned with the appropriate ordinal id
// for the value if that value did exist.
//
// The `Err` variant can be useful when applying predicates directly to the
// encoded data.
fn encoded_id(&self, value: &Option<String>) -> Result<u32, u32> {
match self.entries.binary_search(value) {
Ok(id) => Ok(id as u32),
Err(id) => Err(id as u32),
}
}
// correct way to determine next encoded id for a new value.
fn next_encoded_id(&self) -> u32 {
todo!()
}
/// The number of logical rows encoded in this column.
pub fn num_rows(&self) -> u32 {
self.entries.len() as u32
}
/// Determine if NULL is encoded in the column.
pub fn contains_null(&self) -> bool {
self.contains_null
}
//
//
// ---- Methods for getting row ids from values.
//
//
/// Populates the provided destination container with the row ids satisfying
/// the provided predicate.
pub fn row_ids_filter(&self, value: &str, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
match op {
cmp::Operator::Equal | cmp::Operator::NotEqual => self.row_ids_equal(value, op, dst),
cmp::Operator::LT | cmp::Operator::LTE | cmp::Operator::GT | cmp::Operator::GTE => {
self.row_ids_cmp(value, op, dst)
}
}
}
// Finds row ids based on = or != operator.
fn row_ids_equal(&self, value: &str, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
dst.clear();
dst
}
// Finds row ids based on <, <=, > or >= operator.
fn row_ids_cmp(&self, value: &str, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
dst.clear();
todo!()
}
/// Populates the provided destination container with the row ids for rows
/// that null.
pub fn row_ids_null(&self, dst: RowIDs) -> RowIDs {
self.row_ids_is_null(true, dst)
}
/// Populates the provided destination container with the row ids for rows
/// that are not null.
pub fn row_ids_not_null(&self, dst: RowIDs) -> RowIDs {
self.row_ids_is_null(false, dst)
}
// All row ids that have either NULL or not NULL values.
fn row_ids_is_null(&self, is_null: bool, mut dst: RowIDs) -> RowIDs {
dst.clear();
todo!()
}
// The set of row ids for each distinct value in the column.
pub fn group_row_ids(&self) -> &BTreeMap<u32, Bitmap> {
todo!()
}
//
//
// ---- Methods for getting materialised values.
//
//
pub fn dictionary(&self) -> &[String] {
todo!()
}
/// Returns the logical value present at the provided row id.
///
/// N.B right now this doesn't discern between an invalid row id and a NULL
/// value at a valid location.
pub fn value(&self, row_id: u32) -> Option<&String> {
todo!()
}
/// Materialises the decoded value belonging to the provided encoded id.
///
/// Panics if there is no decoded value for the provided id
pub fn decode_id(&self, encoded_id: u32) -> Option<String> {
todo!()
}
/// Materialises a vector of references to the decoded values in the
/// provided row ids.
///
/// NULL values are represented by None. It is the caller's responsibility
/// to ensure row ids are a monotonically increasing set.
pub fn values<'a>(
&'a self,
row_ids: &[u32],
mut dst: Vec<Option<&'a str>>,
) -> Vec<Option<&'a str>> {
dst.clear();
dst.reserve(row_ids.len());
todo!()
}
/// Returns the lexicographical minimum value for the provided set of row
/// ids. NULL values are not considered the minimum value if any non-null
/// value exists at any of the provided row ids.
pub fn min<'a>(&'a self, row_ids: &[u32]) -> Option<&'a String> {
todo!()
}
/// Returns the lexicographical maximum value for the provided set of row
/// ids. NULL values are not considered the maximum value if any non-null
/// value exists at any of the provided row ids.
pub fn max<'a>(&'a self, row_ids: &[u32]) -> Option<&'a String> {
todo!()
}
/// Returns the total number of non-null values found at the provided set of
/// row ids.
pub fn count(&self, row_ids: &[u32]) -> u32 {
todo!()
}
/// Returns references to the logical (decoded) values for all the rows in
/// the column.
///
/// NULL values are represented by None.
///
pub fn all_values<'a>(
&'a mut self,
mut dst: Vec<Option<&'a String>>,
) -> Vec<Option<&'a String>> {
dst.clear();
dst.reserve(self.entries.len());
for chunks in self.encoded_data.chunks_exact(4) {
dst.push(self.entries[chunks[0] as usize].as_ref());
dst.push(self.entries[chunks[1] as usize].as_ref());
dst.push(self.entries[chunks[2] as usize].as_ref());
dst.push(self.entries[chunks[3] as usize].as_ref());
}
for &v in &self.encoded_data[dst.len()..self.encoded_data.len()] {
dst.push(self.entries[v as usize].as_ref());
}
dst
}
/// Returns references to the unique set of values encoded at each of the
/// provided ids.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set.
pub fn distinct_values<'a>(
&'a self,
row_ids: &[u32],
mut dst: BTreeSet<Option<&'a String>>,
) -> BTreeSet<Option<&'a String>> {
// TODO(edd): Perf... We can improve on this if we know the column is
// totally ordered.
dst.clear();
todo!()
}
//
//
// ---- Methods for getting encoded values directly, typically to be used
// as part of group keys.
//
//
/// Return the raw encoded values for the provided logical row ids.
/// Encoded values for NULL values are included.
///
pub fn encoded_values(&self, row_ids: &[u32], mut dst: Vec<u32>) -> Vec<u32> {
dst.clear();
dst.reserve(row_ids.len());
todo!()
}
/// Returns all encoded values for the column including the encoded value
/// for any NULL values.
pub fn all_encoded_values(&self, mut dst: Vec<u32>) -> Vec<u32> {
dst.clear();
dst.reserve(self.entries.len());
dst.extend(self.encoded_data.iter());
dst
}
//
//
// ---- Methods for optimising schema exploration.
//
//
/// Efficiently determines if this column contains non-null values that
/// differ from the provided set of values.
///
/// Informally, this method provides an efficient way of answering "is it
/// worth spending time reading this column for values or do I already have
/// all the values in a set".
///
/// More formally, this method returns the relative complement of this
/// column's values in the provided set of values.
///
/// This method would be useful when the same column is being read across
/// many segments, and one wants to determine to the total distinct set of
/// values. By exposing the current result set to each column (as an
/// argument to `contains_other_values`) columns can be short-circuited when
/// they only contain values that have already been discovered.
///
pub fn contains_other_values(&self, values: &BTreeSet<Option<&String>>) -> bool {
todo!()
}
/// Determines if the column contains at least one non-null value at
/// any of the provided row ids.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set.
pub fn has_non_null_value(&self, row_ids: &[u32]) -> bool {
todo!()
}
// Returns true if there exists an encoded non-null value at any of the row
// ids.
fn find_non_null_value(&self, row_ids: &[u32]) -> bool {
todo!()
}
}
impl<'a> From<Vec<&str>> for Plain {
fn from(vec: Vec<&str>) -> Self {
let mut enc = Self::default();
for v in vec {
enc.push(v.to_string());
}
enc
}
}
impl<'a> From<Vec<String>> for Plain {
fn from(vec: Vec<String>) -> Self {
let mut drle = Self::default();
for v in vec {
drle.push(v);
}
drle
}
}
impl<'a> From<Vec<Option<&str>>> for Plain {
fn from(vec: Vec<Option<&str>>) -> Self {
let mut drle = Self::default();
for v in vec {
match v {
Some(x) => drle.push(x.to_string()),
None => drle.push_none(),
}
}
drle
}
}
impl<'a> From<Vec<Option<String>>> for Plain {
fn from(vec: Vec<Option<String>>) -> Self {
let mut drle = Self::default();
for v in vec {
match v {
Some(x) => drle.push(x),
None => drle.push_none(),
}
}
drle
}
}
impl<'a> From<StringArray> for Plain {
fn from(arr: StringArray) -> Self {
let mut drle = Self::default();
for i in 0..arr.len() {
if arr.is_null(i) {
drle.push_none();
} else {
drle.push(arr.value(i).to_string());
}
}
drle
}
}
impl std::fmt::Display for Plain {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
todo!()
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn with_dictionary() {
let mut dictionary = BTreeSet::new();
dictionary.insert("hello".to_string());
dictionary.insert("world".to_string());
let enc = Plain::with_dictionary(dictionary);
assert_eq!(
enc.entries,
vec![None, Some("hello".to_string()), Some("world".to_string()),]
);
}
#[test]
#[should_panic]
fn push_wrong_order() {
let mut enc = Plain::default();
enc.push("b".to_string());
enc.push("a".to_string());
}
}

View File

@ -944,4 +944,12 @@ mod test {
vec![0, 1, 2]
)
}
#[test]
#[should_panic]
fn push_wrong_order() {
let mut enc = RLE::default();
enc.push("b".to_string());
enc.push("a".to_string());
}
}