Merge pull request #371 from influxdata/er/feat/dict-encoding

feat: add dictionary RLE encoding to Segment Store
pull/24376/head
Edd Robinson 2020-10-21 14:00:32 +01:00 committed by GitHub
commit daf89c7d22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1279 additions and 0 deletions

View File

@ -0,0 +1,269 @@
use std::mem::size_of;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use rand::prelude::*;
use delorean_arrow::arrow::datatypes::*;
use delorean_segment_store::column::fixed::Fixed;
use delorean_segment_store::column::fixed_null::FixedNull;
// Column sizes (number of rows) that every benchmark is run against.
const ROWS: [usize; 5] = [10, 100, 1_000, 10_000, 60_000];
// Row-selection patterns that every benchmark is run against; see `Chunks`.
const CHUNKS: [Chunks; 4] = [
Chunks::All,
Chunks::Even,
Chunks::ManySmall,
Chunks::RandomTenPercent,
];
// Physical value widths that every benchmark is run against.
const PHYSICAL_TYPES: [PhysicalType; 3] = [PhysicalType::I64, PhysicalType::I32, PhysicalType::I16];
// The pattern of row ids that a benchmark asks the encoding to sum. The
// `Debug` representation is used as part of the benchmark id.
#[derive(Debug)]
enum Chunks {
All, // sum up the entire column
Even, // sum up the even rows
ManySmall, // sum up chunks of 3 consecutive rows, one chunk every 9 rows
RandomTenPercent, // sum up random 10% of values
}
// The encoding implementation under benchmark.
enum EncType {
// Benchmarks the `Fixed` encoding.
Fixed,
// Benchmarks the `FixedNull` encoding, which is parameterised by an Arrow
// data type.
Arrow,
}
// The physical integer type used to store the benchmarked column's values.
enum PhysicalType {
I64,
I32,
I16,
}
// Registers the `sum` benchmarks: one benchmark group per encoding type, each
// covering every combination of `ROWS`, `CHUNKS` and `PHYSICAL_TYPES`.
fn encoding_sum(c: &mut Criterion) {
    let groups = vec![
        ("encoding_fixed_sum", EncType::Fixed),
        ("encoding_arrow_sum", EncType::Arrow),
    ];

    for (group_name, enc_type) in groups {
        benchmark_plain_sum(c, group_name, enc_type, &ROWS, &CHUNKS, &PHYSICAL_TYPES);
    }
}
// Benchmarks `sum` on one encoding type (`enc_type`) inside the benchmark
// group `benchmark_group_name`.
//
// One benchmark is registered for every combination of column size
// (`row_size`), row-selection pattern (`chunks`) and physical value type
// (`physical_type`). The column always holds the incrementing values
// `0..num_rows`; the benchmark input is the list of row ids to sum.
//
// Throughput is reported as the number of selected rows multiplied by the
// width of the physical type actually being benchmarked.
fn benchmark_plain_sum(
    c: &mut Criterion,
    benchmark_group_name: &str,
    enc_type: EncType,
    row_size: &[usize],
    chunks: &[Chunks],
    physical_type: &[PhysicalType],
) {
    let mut group = c.benchmark_group(benchmark_group_name);
    for &num_rows in row_size {
        for chunk in chunks {
            for pt in physical_type {
                // The row ids that will be summed.
                let input: Vec<usize> = match chunk {
                    Chunks::All => (0..num_rows).collect(),
                    Chunks::Even => gen_even_chunk(num_rows),
                    Chunks::ManySmall => gen_many_small_chunk(num_rows),
                    Chunks::RandomTenPercent => gen_random_10_percent(num_rows),
                };

                match pt {
                    PhysicalType::I64 => {
                        group.throughput(Throughput::Bytes(
                            (input.len() * size_of::<i64>()) as u64,
                        ));

                        match enc_type {
                            EncType::Fixed => {
                                let encoding = Fixed::<i64>::from(
                                    (0..num_rows as i64).collect::<Vec<i64>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i64",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum::<i64>(&input);
                                        });
                                    },
                                );
                            }
                            EncType::Arrow => {
                                let encoding = FixedNull::<Int64Type>::from(
                                    (0..num_rows as i64).collect::<Vec<i64>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i64",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum(&input);
                                        });
                                    },
                                );
                            }
                        }
                    }
                    PhysicalType::I32 => {
                        // Values are stored as i32, so throughput is measured
                        // in 4-byte units.
                        group.throughput(Throughput::Bytes(
                            (input.len() * size_of::<i32>()) as u64,
                        ));

                        match enc_type {
                            EncType::Fixed => {
                                let encoding = Fixed::<i32>::from(
                                    (0..num_rows as i32).collect::<Vec<i32>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i32",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum::<i32>(&input);
                                        });
                                    },
                                );
                            }
                            EncType::Arrow => {
                                let encoding = FixedNull::<Int32Type>::from(
                                    (0..num_rows as i32).collect::<Vec<i32>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i32",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum(&input);
                                        });
                                    },
                                );
                            }
                        }
                    }
                    PhysicalType::I16 => {
                        // Values are stored as i16, so throughput is measured
                        // in 2-byte units.
                        group.throughput(Throughput::Bytes(
                            (input.len() * size_of::<i16>()) as u64,
                        ));

                        // NOTE(review): `num_rows as i16` wraps for row counts
                        // above `i16::MAX`. For 60_000 rows the cast yields a
                        // negative bound, so `0..num_rows as i16` is an empty
                        // range and the encoding holds no values - confirm
                        // whether 60_000 rows is a meaningful i16 case.
                        match enc_type {
                            EncType::Fixed => {
                                let encoding = Fixed::<i16>::from(
                                    (0..num_rows as i16).collect::<Vec<i16>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i16",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum::<i16>(&input);
                                        });
                                    },
                                );
                            }
                            EncType::Arrow => {
                                let encoding = FixedNull::<Int16Type>::from(
                                    (0..num_rows as i16).collect::<Vec<i16>>().as_slice(),
                                );

                                group.bench_with_input(
                                    BenchmarkId::from_parameter(format!(
                                        "{:?}_{:?}_i16",
                                        num_rows, chunk
                                    )),
                                    &input,
                                    |b, input| {
                                        b.iter(|| {
                                            // do work
                                            let _ = encoding.sum(&input);
                                        });
                                    },
                                );
                            }
                        }
                    }
                }
            }
        }
    }
    group.finish();
}
// Produces every even row id in `0..rows`, i.e. about half of the rows.
fn gen_even_chunk(rows: usize) -> Vec<usize> {
    (0..rows).step_by(2).collect()
}
// Produces runs of 3 consecutive row ids, one run starting at every multiple
// of 9. Roughly a third of the rows end up being requested.
fn gen_many_small_chunk(rows: usize) -> Vec<usize> {
    // A row belongs to a run when it is one of the first three rows of its
    // block of nine.
    (0..rows).filter(|row| row % 9 < 3).collect()
}
// Selects each row id in `0..rows` independently with probability 0.1, so
// about 10% of the rows are requested. Row ids are returned in ascending
// order.
fn gen_random_10_percent(rows: usize) -> Vec<usize> {
    let mut rng = thread_rng();
    (0..rows).filter(|_| rng.gen::<f64>() < 0.1).collect()
}
// Declare the `benches` group containing `encoding_sum` and generate the
// benchmark binary's entry point for it.
criterion_group!(benches, encoding_sum,);
criterion_main!(benches);

View File

@ -1,7 +1,10 @@
pub mod cmp;
pub mod dictionary;
pub mod fixed;
pub mod fixed_null;
use croaring::Bitmap;
use delorean_arrow::arrow;
/// The possible logical types that column values can have. All values in a
@ -104,3 +107,41 @@ pub enum Values {
// Arbitrary byte arrays
ByteArray(arrow::array::UInt8Array),
}
/// Represents vectors of row IDs, which are usually used for intermediate
/// results as a method of late materialisation.
///
/// Two interchangeable representations are provided; the methods on `RowIDs`
/// behave the same way whichever one is in use.
#[derive(PartialEq, Debug)]
pub enum RowIDs {
/// Row ids held in a `croaring` bitmap.
Bitmap(Bitmap),
/// Row ids held in a plain vector.
Vector(Vec<u32>),
}
impl RowIDs {
pub fn len(&self) -> usize {
match self {
RowIDs::Bitmap(ids) => ids.cardinality() as usize,
RowIDs::Vector(ids) => ids.len(),
}
}
pub fn is_empty(&self) -> bool {
match self {
RowIDs::Bitmap(ids) => ids.is_empty(),
RowIDs::Vector(ids) => ids.is_empty(),
}
}
pub fn clear(&mut self) {
match self {
RowIDs::Bitmap(ids) => ids.clear(),
RowIDs::Vector(ids) => ids.clear(),
}
}
pub fn add_range(&mut self, from: u32, to: u32) {
match self {
RowIDs::Bitmap(ids) => ids.add_range(from as u64..to as u64),
RowIDs::Vector(ids) => ids.extend(from..to),
}
}
}

View File

@ -0,0 +1,969 @@
use std::collections::{BTreeMap, BTreeSet};
use std::convert::From;
use std::iter;
use croaring::Bitmap;
use delorean_arrow::arrow::array::{Array, StringArray};
use crate::column::{cmp, RowIDs};
// `RLE` is a run-length encoding for dictionary columns, where all dictionary
// entries are utf-8 valid strings.
//
// Each distinct value (including NULL, represented by `None`) is assigned an
// integer id in the order it is first pushed; the column itself is stored as
// a sequence of `(id, run length)` pairs.
#[derive(Default)]
pub struct RLE {
// TODO(edd): revisit choice of storing owned string versus references.
// The mapping between an entry and its assigned index.
entry_index: BTreeMap<Option<String>, u32>,
// The mapping between an index and its entry. The position of an entry in
// this vector is its assigned index.
index_entries: Vec<Option<String>>,
// The set of rows that belong to each distinct value in the dictionary.
// This allows essentially constant time grouping of rows on the column by
// value.
index_row_ids: BTreeMap<u32, Bitmap>,
// stores tuples where each pair refers to a dictionary entry and the number
// of times the entry repeats.
run_lengths: Vec<(u32, u32)>,
// The total number of rows (the sum of all run lengths) in the column.
num_rows: u32,
}
impl RLE {
/// Appends a single row holding the non-null value `v`.
///
/// The caller must push new dictionary entries in sorted order; see
/// `push_additional`.
pub fn push(&mut self, v: String) {
    let value = Some(v);
    self.push_additional(value, 1);
}
/// Appends a single NULL row.
///
/// The caller must push new dictionary entries in sorted order; see
/// `push_additional`.
pub fn push_none(&mut self) {
    let null_value: Option<String> = None;
    self.push_additional(null_value, 1);
}
/// Appends `additional` consecutive rows holding the value `v` (`None` is
/// NULL).
///
/// If `v` is the value of the most recent run, that run is extended;
/// otherwise a new run is started. A value not seen before is added to the
/// dictionary and assigned the next free id.
///
/// It is the caller's responsibility to ensure that the dictionary remains
/// sorted: a value that is new to the dictionary must be greater than the
/// most recently added entry, and no new non-null value may be added once
/// NULL is in the dictionary.
///
/// # Panics
///
/// Panics if a new dictionary entry is pushed out of order, or if the total
/// number of rows would exceed `u32::MAX`. In both cases the encoding is left
/// unmodified.
pub fn push_additional(&mut self, v: Option<String>, additional: u32) {
    // Validate the new row count up front so that a failure cannot leave the
    // run lengths, bitmaps and row count disagreeing with each other.
    let new_num_rows = self
        .num_rows
        .checked_add(additional)
        .expect("number of rows overflows u32");
    let new_rows = self.num_rows as u64..new_num_rows as u64;

    match self.entry_index.get(&v).copied() {
        Some(idx) => {
            // Existing dictionary entry.
            let extends_last_run = self
                .run_lengths
                .last()
                .map_or(false, |&(last_idx, _)| last_idx == idx);

            if extends_last_run {
                // update the existing run-length
                if let Some(last_run) = self.run_lengths.last_mut() {
                    last_run.1 += additional;
                }
            } else {
                // start a new run-length
                self.run_lengths.push((idx, additional));
            }

            self.index_row_ids
                .get_mut(&idx)
                .expect("every dictionary entry has a row id bitmap")
                .add_range(new_rows);
        }
        None => {
            // New dictionary entry: it must sort after the previous one.
            if let Some(previous) = self.index_entries.last() {
                match (previous, &v) {
                    (None, Some(_)) => panic!("out of order dictionary insertion"),
                    (Some(_), None) => {}
                    (Some(a), Some(b)) => assert!(a < b, "out of order dictionary insertion"),
                    (None, None) => unreachable!("multiple None values"),
                }
            }

            let idx = self.index_entries.len() as u32;
            self.index_entries.push(v.clone());
            self.entry_index.insert(v, idx);
            self.run_lengths.push((idx, additional));

            let mut row_ids = Bitmap::create();
            row_ids.add_range(new_rows);
            self.index_row_ids.insert(idx, row_ids);
        }
    }

    self.num_rows = new_num_rows;
}
//
//
// ---- Methods for getting row ids from values.
//
//
/// Fills `dst` with the ids of all rows whose value satisfies `<row value>
/// op value`, and returns it. Any previous contents of `dst` are discarded.
pub fn row_ids_filter(&self, value: Option<String>, op: cmp::Operator, dst: RowIDs) -> RowIDs {
    match op {
        // ordering predicates
        cmp::Operator::GT | cmp::Operator::GTE | cmp::Operator::LT | cmp::Operator::LTE => {
            self.row_ids_cmp(value, op, dst)
        }
        // (in)equality predicates
        cmp::Operator::NotEqual | cmp::Operator::Equal => self.row_ids_equal(value, op, dst),
    }
}
// Collects the row ids matching an `=` or `!=` predicate into `dst`.
//
// `dst` is cleared first. Panics if `op` is any other operator.
fn row_ids_equal(&self, value: Option<String>, op: cmp::Operator, mut dst: RowIDs) -> RowIDs {
    dst.clear();

    // `true` selects runs holding `value`; `false` selects all other runs.
    let want_match = match op {
        cmp::Operator::Equal => true,
        cmp::Operator::NotEqual => false,
        _ => unreachable!("invalid operator"),
    };

    match self.entry_index.get(&value) {
        Some(target_id) => {
            // Walk the runs, tracking the row range each one covers.
            let mut next_row: u32 = 0;
            for (run_id, run_len) in &self.run_lengths {
                let first_row = next_row;
                next_row += *run_len;

                if (run_id == target_id) == want_match {
                    dst.add_range(first_row, next_row);
                }
            }
        }
        // The value is absent from the column, so every row differs from it.
        None if !want_match => dst.add_range(0, self.num_rows),
        // The value is absent from the column, so no row equals it.
        None => {}
    }

    dst
}
// Finds row ids based on <, <=, > or >= operator.
//
// `dst` is cleared and then filled with the ids of all rows whose value
// satisfies `<row value> op value`.
//
// Because the dictionary is sorted, the comparison is carried out on encoded
// ids rather than on strings. NULL is not an orderable value: rows holding
// NULL never satisfy an ordering predicate, and comparing against a NULL
// `value` selects no rows.
//
// Panics if `op` is not one of the four ordering operators.
fn row_ids_cmp(&self, value: Option<String>, op: cmp::Operator, mut dst: RowIDs) -> RowIDs {
    dst.clear();

    // Nothing is ordered relative to NULL.
    if value.is_none() {
        return dst;
    }

    // NULL is assigned the highest encoded id, so comparing encoded ids alone
    // would let NULL rows satisfy `>` and `>=`. Track its id to skip it.
    let null_id = self.entry_index.get(&None).copied();

    // happy path - the value exists in the column
    if let Some(encoded_id) = self.entry_index.get(&value) {
        let cmp = match op {
            cmp::Operator::GT => PartialOrd::gt,
            cmp::Operator::GTE => PartialOrd::ge,
            cmp::Operator::LT => PartialOrd::lt,
            cmp::Operator::LTE => PartialOrd::le,
            _ => unreachable!("operator not supported"),
        };

        let mut index: u32 = 0; // current position in the column.
        for (other_encoded_id, other_rl) in &self.run_lengths {
            let start = index;
            index += *other_rl;

            if Some(*other_encoded_id) == null_id {
                continue; // NULL rows never satisfy an ordering predicate
            }

            if cmp(other_encoded_id, encoded_id) {
                dst.add_range(start, index)
            }
        }
        return dst;
    }

    // The value is not in the dictionary. Re-express the predicate in terms
    // of the nearest non-null entry that does exist. `entry_index` iterates
    // in key order, where `None` sorts before every `Some(_)`, so the NULL
    // entry must be excluded explicitly when picking that entry.
    match op {
        cmp::Operator::GT | cmp::Operator::GTE => {
            // find the first decoded value that satisfies the predicate.
            for other in self.entry_index.keys() {
                if other.is_some() && other > &value {
                    // change filter from either `x > value` or `x >= value` to `x >= other`
                    return self.row_ids_cmp(other.clone(), cmp::Operator::GTE, dst);
                }
            }
        }
        cmp::Operator::LT | cmp::Operator::LTE => {
            // find the first decoded value that satisfies the predicate.
            // Note iteration is in reverse
            for other in self.entry_index.keys().rev() {
                if other.is_some() && other < &value {
                    // change filter from either `x < value` or `x <= value` to `x <= other`
                    return self.row_ids_cmp(other.clone(), cmp::Operator::LTE, dst);
                }
            }
        }
        _ => unreachable!("operator not supported"),
    }

    // No entry in the column satisfies the predicate.
    dst
}
/// The set of row ids for each distinct value in the column.
///
/// The returned map is keyed by encoded id; use `decode_id` or `dictionary`
/// to turn a key back into its logical value.
pub fn group_row_ids(&self) -> &BTreeMap<u32, Bitmap> {
&self.index_row_ids
}
//
//
// ---- Methods for getting materialised values.
//
//
/// Returns the dictionary: the distinct values of the column, where the
/// position of each value is its encoded id. A NULL entry, if the column has
/// one, is represented by `None`.
pub fn dictionary(&self) -> &[Option<String>] {
&self.index_entries
}
/// Looks up the logical value stored at `row_id`.
///
/// N.B. `&None` is returned both for a NULL value at a valid row and for a
/// row id that lies beyond the end of the column; the two cases cannot be
/// told apart.
pub fn value(&self, row_id: u32) -> &Option<String> {
    if row_id >= self.num_rows {
        return &None;
    }

    // Walk the runs until the cumulative row count passes `row_id`.
    let mut rows_seen = 0;
    for &(encoded_id, run_len) in &self.run_lengths {
        rows_seen += run_len;
        if rows_seen > row_id {
            return &self.index_entries[encoded_id as usize];
        }
    }
    &None
}
/// Returns an owned copy of the logical value that `encoded_id` stands for
/// (`None` for the NULL entry).
///
/// Panics if `encoded_id` is not an id in the dictionary.
pub fn decode_id(&self, encoded_id: u32) -> Option<String> {
    let entry = &self.index_entries[encoded_id as usize];
    entry.clone()
}
/// Materialises a vector of references to the decoded values in the
/// provided row ids.
///
/// `dst` is cleared and reused as the output buffer. NULL values are
/// represented by `None`.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set. Materialisation stops at the first row id that lies
/// beyond the end of the column, so in that case fewer values than row ids
/// are returned. An empty column yields an empty result.
pub fn values<'a>(
    &'a self,
    row_ids: &[u32],
    mut dst: Vec<&'a Option<String>>,
) -> Vec<&'a Option<String>> {
    dst.clear();
    dst.reserve(row_ids.len());

    // An empty column has no runs to walk (and no valid row ids).
    let (mut curr_entry_id, mut curr_entry_rl) = match self.run_lengths.first() {
        Some(&run) => run,
        None => return dst,
    };

    // `curr_logical_row_id` is the first row of the current run not yet
    // consumed and `curr_entry_rl` is the number of rows left in that run.
    let mut curr_logical_row_id = 0;
    let mut i = 1; // index of the next run to load

    for row_id in row_ids {
        if row_id >= &self.num_rows {
            return dst; // row ids beyond length of column
        }

        while curr_logical_row_id + curr_entry_rl <= *row_id {
            // this encoded entry does not cover the row we need.
            // move on to next entry
            curr_logical_row_id += curr_entry_rl;
            curr_entry_id = self.run_lengths[i].0;
            curr_entry_rl = self.run_lengths[i].1;
            i += 1;
        }

        // this encoded entry covers the row_id we want.
        dst.push(&self.index_entries[curr_entry_id as usize]);

        curr_logical_row_id += 1;
        curr_entry_rl -= 1;
    }

    assert_eq!(row_ids.len(), dst.len());
    dst
}
/// Returns references to the logical (decoded) values for all the rows in
/// the column.
///
/// `dst` is cleared and reused as the output buffer. NULL values are
/// represented by `None`.
///
/// Only a shared borrow of the column is needed: nothing is modified.
pub fn all_values<'a>(
    &'a self,
    mut dst: Vec<&'a Option<String>>,
) -> Vec<&'a Option<String>> {
    dst.clear();
    dst.reserve(self.num_rows as usize);

    // Expand each run into `rl` references to the same dictionary entry.
    for (idx, rl) in &self.run_lengths {
        let v = &self.index_entries[*idx as usize];
        dst.extend(iter::repeat(v).take(*rl as usize));
    }
    dst
}
/// Returns references to the unique set of values encoded at each of the
/// provided ids.
///
/// `dst` is cleared and reused as the output set. NULL values are never
/// included in the result.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set. Processing stops at the first row id that lies beyond the
/// end of the column. An empty column yields an empty result.
pub fn distinct_values<'a>(
    &'a self,
    row_ids: &[u32],
    mut dst: BTreeSet<&'a String>,
) -> BTreeSet<&'a String> {
    // TODO(edd): Perf... We can improve on this if we know the column is
    // totally ordered.
    dst.clear();

    // An empty column has no runs to walk (and no valid row ids).
    let (mut curr_entry_id, mut curr_entry_rl) = match self.run_lengths.first() {
        Some(&run) => run,
        None => return dst,
    };

    // Used to mark off when a decoded value has been added to the result
    // set. TODO(perf) - this might benefit from being pooled somehow.
    let mut encoded_values = vec![false; self.index_entries.len()];
    let mut found = 0;

    if let Some(i) = self.entry_index.get(&None) {
        // the encoding contains NULL values, but we don't return those as
        // distinct values. So we will mark them.
        encoded_values[*i as usize] = true;
        found += 1;
    }

    // `curr_logical_row_id` is the first row of the current run not yet
    // consumed and `curr_entry_rl` is the number of rows left in that run.
    let mut curr_logical_row_id = 0;
    let mut i = 1; // index of the next run to load

    'by_row: for row_id in row_ids {
        if row_id >= &self.num_rows {
            return dst; // rows beyond the column size
        }

        while curr_logical_row_id + curr_entry_rl <= *row_id {
            // this encoded entry does not cover the row we need.
            // move on to next entry
            curr_logical_row_id += curr_entry_rl;
            curr_entry_id = self.run_lengths[i].0;
            curr_entry_rl = self.run_lengths[i].1;
            i += 1;
        }

        // encoded value not already in result set.
        if !encoded_values[curr_entry_id as usize] {
            // annoying unwrap. We know that there can't be None here as
            // the NULL entry was marked as seen at the top of the method.
            dst.insert(self.index_entries[curr_entry_id as usize].as_ref().unwrap());
            encoded_values[curr_entry_id as usize] = true;
            found += 1;
        }

        if found == encoded_values.len() {
            // all distinct values have been read
            break 'by_row;
        }

        curr_logical_row_id += 1;
        curr_entry_rl -= 1;
    }

    assert!(dst.len() <= self.index_entries.len());
    dst
}
//
//
// ---- Methods for getting encoded values directly, typically to be used
// as part of group keys.
//
//
/// Return the raw encoded values for the provided logical row ids.
/// Encoded values for NULL values are included.
///
/// `dst` is cleared and reused as the output buffer.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set. As with `values`, processing stops at the first row id
/// that lies beyond the end of the column, so in that case fewer encoded
/// values than row ids are returned. An empty column yields an empty result.
pub fn encoded_values(&self, row_ids: &[u32], mut dst: Vec<u32>) -> Vec<u32> {
    dst.clear();
    dst.reserve(row_ids.len());

    // An empty column has no runs to walk (and no valid row ids).
    let (mut curr_entry_id, mut curr_entry_rl) = match self.run_lengths.first() {
        Some(&run) => run,
        None => return dst,
    };

    // `curr_logical_row_id` is the first row of the current run not yet
    // consumed and `curr_entry_rl` is the number of rows left in that run.
    let mut curr_logical_row_id = 0;
    let mut i = 1; // index of the next run to load

    for row_id in row_ids {
        if row_id >= &self.num_rows {
            return dst; // row ids beyond length of column
        }

        while curr_logical_row_id + curr_entry_rl <= *row_id {
            // this encoded entry does not cover the row we need.
            // move on to next entry
            curr_logical_row_id += curr_entry_rl;
            curr_entry_id = self.run_lengths[i].0;
            curr_entry_rl = self.run_lengths[i].1;
            i += 1;
        }

        // this entry covers the row_id we want.
        dst.push(curr_entry_id);
        curr_logical_row_id += 1;
        curr_entry_rl -= 1;
    }

    assert_eq!(row_ids.len(), dst.len());
    dst
}
/// Returns all encoded values for the column including the encoded value
/// for any NULL values.
pub fn all_encoded_values(&self, mut dst: Vec<u32>) -> Vec<u32> {
dst.clear();
dst.reserve(self.num_rows as usize);
for (idx, rl) in &self.run_lengths {
dst.extend(iter::repeat(*idx).take(*rl as usize));
}
dst
}
//
//
// ---- Methods for optimising schema exploration.
//
//
/// Reports whether this column holds at least one non-null value that is
/// missing from `values`.
///
/// Informally this answers "is it worth reading this column for values, or
/// does the caller already have them all?". It is intended for building up
/// the distinct set of values of a column across many segments: the
/// accumulated set is shown to each column in turn, and a column that
/// returns `false` has nothing new to contribute and can be skipped.
///
/// The NULL entry, if present, is ignored.
pub fn contains_other_values(&self, values: &BTreeSet<&String>) -> bool {
    // Cheap check first: more non-null dictionary entries than supplied
    // values means at least one entry must be missing from `values`.
    let has_null = self.entry_index.contains_key(&None);
    let non_null_entries = self.index_entries.len() - usize::from(has_null);
    if non_null_entries > values.len() {
        return true;
    }

    // Otherwise look for any non-null entry absent from `values`;
    // `flatten` skips the NULL entry.
    self.entry_index
        .keys()
        .flatten()
        .any(|entry| !values.contains(entry))
}
/// Reports whether any of the provided row ids refers to a row holding a
/// non-null value.
///
/// It is the caller's responsibility to ensure row ids are a monotonically
/// increasing set.
pub fn has_non_null_value(&self, row_ids: &[u32]) -> bool {
    if let Some(&null_id) = self.entry_index.get(&None) {
        // The column has NULL rows, so each row has to be inspected.
        return self.find_non_null_value(null_id, row_ids);
    }

    // There are no NULL entries in this column, so any row id that falls
    // inside the column refers to a non-null value.
    row_ids.iter().any(|&row_id| row_id < self.num_rows)
}
// Reports whether any of `row_ids` lands on a run whose encoded id differs
// from `null_encoded_id`.
//
// The runs are walked once, in step with the (ascending) row ids. The search
// gives up with `false` at the first row id beyond the end of the column.
// Panics if the column has no runs.
fn find_non_null_value(&self, null_encoded_id: u32, row_ids: &[u32]) -> bool {
    // `cursor` is the first row of the current run not yet consumed and
    // `run_remaining` is the number of rows left in that run.
    let (mut run_id, mut run_remaining) = self.run_lengths[0];
    let mut next_run = 1;
    let mut cursor = 0;

    for &row_id in row_ids {
        if row_id >= self.num_rows {
            return false; // all other row ids beyond column.
        }

        // Skip forward over runs that end at or before the wanted row.
        while cursor + run_remaining <= row_id {
            cursor += run_remaining;
            let (id, len) = self.run_lengths[next_run];
            run_id = id;
            run_remaining = len;
            next_run += 1;
        }

        // The current run covers `row_id`.
        if run_id != null_encoded_id {
            return true;
        }

        cursor += 1;
        run_remaining -= 1;
    }
    false
}
}
/// Builds an encoding from non-null string slices, one row per element.
///
/// New distinct values must appear in sorted order; see
/// `RLE::push_additional`.
impl From<Vec<&str>> for RLE {
    fn from(vec: Vec<&str>) -> Self {
        let mut drle = Self::default();
        for v in vec {
            drle.push(v.to_string());
        }
        drle
    }
}
/// Builds an encoding from non-null owned strings, one row per element.
///
/// New distinct values must appear in sorted order; see
/// `RLE::push_additional`.
impl From<Vec<String>> for RLE {
    fn from(vec: Vec<String>) -> Self {
        let mut drle = Self::default();
        for v in vec {
            drle.push(v);
        }
        drle
    }
}
/// Builds an encoding from optional string slices, one row per element;
/// `None` becomes a NULL row.
///
/// New distinct values must appear in sorted order; see
/// `RLE::push_additional`.
impl From<Vec<Option<&str>>> for RLE {
    fn from(vec: Vec<Option<&str>>) -> Self {
        let mut drle = Self::default();
        for v in vec {
            match v {
                Some(x) => drle.push(x.to_string()),
                None => drle.push_none(),
            }
        }
        drle
    }
}
/// Builds an encoding from optional owned strings, one row per element;
/// `None` becomes a NULL row.
///
/// New distinct values must appear in sorted order; see
/// `RLE::push_additional`.
impl From<Vec<Option<String>>> for RLE {
    fn from(vec: Vec<Option<String>>) -> Self {
        let mut drle = Self::default();
        for v in vec {
            match v {
                Some(x) => drle.push(x),
                None => drle.push_none(),
            }
        }
        drle
    }
}
/// Builds an encoding from an Arrow string array, one row per array slot;
/// null slots become NULL rows.
///
/// New distinct values must appear in sorted order; see
/// `RLE::push_additional`.
impl From<StringArray> for RLE {
    fn from(arr: StringArray) -> Self {
        let mut drle = Self::default();
        for i in 0..arr.len() {
            if arr.is_null(i) {
                drle.push_none();
            } else {
                drle.push(arr.value(i).to_string());
            }
        }
        drle
    }
}
/// Formats a one-line summary of the encoding: the number of rows, the
/// number of dictionary entries and the number of runs.
impl std::fmt::Display for RLE {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let dict_entries = self.index_entries.len();
        let runs = self.run_lengths.len();

        write!(
            f,
            "[RLE] rows: {:?} dict entries: {}, runs: {} ",
            self.num_rows, dict_entries, runs
        )
    }
}
#[cfg(test)]
mod test {
use std::collections::BTreeSet;
use crate::column::{cmp, RowIDs};
// Pushing values (singly and in bulk) extends the column, and `all_values`
// reflects every pushed row in order, including a trailing NULL.
#[test]
fn rle_push() {
let mut drle = super::RLE::from(vec!["hello", "hello", "hello", "hello"]);
// Same value as the current run: the run is extended.
drle.push_additional(Some("hello".to_string()), 1);
drle.push("world".to_string());
assert_eq!(
drle.all_values(vec![]),
[
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("world".to_string()),
]
);
drle.push_additional(Some("zoo".to_string()), 3);
drle.push_none();
assert_eq!(
drle.all_values(vec![]),
[
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("hello".to_string()),
&Some("world".to_string()),
&Some("zoo".to_string()),
&Some("zoo".to_string()),
&Some("zoo".to_string()),
&None,
]
);
}
// A new non-null dictionary entry may not be added once NULL has been
// pushed: the push must panic.
#[test]
#[should_panic]
fn rle_push_none_first() {
let mut drle = super::RLE::default();
drle.push_none();
drle.push_additional(Some("hello".to_string()), 1);
}
// New dictionary entries must arrive in sorted order: pushing "a" after "b"
// must panic.
#[test]
#[should_panic]
fn rle_push_wrong_order() {
let mut drle = super::RLE::default();
drle.push("b".to_string());
drle.push("a".to_string());
}
// `all_values` discards the previous contents of the destination vector but
// keeps its allocation.
#[test]
fn all_values() {
let mut drle = super::RLE::from(vec!["hello", "zoo"]);
let zoo = Some("zoo".to_string());
// Pre-populated destination with capacity 4.
let dst = vec![&zoo, &zoo, &zoo, &zoo];
let got = drle.all_values(dst);
assert_eq!(got, [&Some("hello".to_string()), &Some("zoo".to_string()),]);
assert_eq!(got.capacity(), 4);
}
// `=` and `!=` predicates, for values that are present in the column and for
// a value ("foo") that is not.
#[test]
fn row_ids_filter_equal() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
let ids = drle.row_ids_filter(
Some("east".to_string()),
cmp::Operator::Equal,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8]));
let ids = drle.row_ids_filter(
Some("south".to_string()),
cmp::Operator::Equal,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![9, 10]));
// A value absent from the column matches no rows...
let ids = drle.row_ids_filter(
Some("foo".to_string()),
cmp::Operator::Equal,
RowIDs::Vector(vec![]),
);
assert!(ids.is_empty());
// ...and differs from every row.
let ids = drle.row_ids_filter(
Some("foo".to_string()),
cmp::Operator::NotEqual,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector((0..11).collect::<Vec<_>>()));
let ids = drle.row_ids_filter(
Some("east".to_string()),
cmp::Operator::NotEqual,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![3, 9, 10]));
}
// `<`, `<=`, `>` and `>=` predicates, first for values that are present in
// the column and then for values that are not.
#[test]
fn row_ids_filter_cmp() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
drle.push_additional(Some("west".to_string()), 1); // 11
drle.push_additional(Some("north".to_string()), 1); // 12
drle.push_additional(Some("west".to_string()), 5); // 13,14,15,16,17
let ids = drle.row_ids_filter(
Some("east".to_string()),
cmp::Operator::LTE,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8]));
// "east" is the smallest value, so nothing is less than it.
let ids = drle.row_ids_filter(
Some("east".to_string()),
cmp::Operator::LT,
RowIDs::Vector(vec![]),
);
assert!(ids.is_empty());
let ids = drle.row_ids_filter(
Some("north".to_string()),
cmp::Operator::GT,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![9, 10, 11, 13, 14, 15, 16, 17]));
let ids = drle.row_ids_filter(
Some("north".to_string()),
cmp::Operator::GTE,
RowIDs::Vector(vec![]),
);
assert_eq!(
ids,
RowIDs::Vector(vec![3, 9, 10, 11, 12, 13, 14, 15, 16, 17])
);
// The encoding also supports comparisons on values that don't directly exist in the column.
// "abba" sorts before every value in the column.
let ids = drle.row_ids_filter(
Some("abba".to_string()),
cmp::Operator::GT,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector((0..18).collect::<Vec<u32>>()));
// "east1" sorts between "east" and "north".
let ids = drle.row_ids_filter(
Some("east1".to_string()),
cmp::Operator::GT,
RowIDs::Vector(vec![]),
);
assert_eq!(
ids,
RowIDs::Vector(vec![3, 9, 10, 11, 12, 13, 14, 15, 16, 17])
);
let ids = drle.row_ids_filter(
Some("east1".to_string()),
cmp::Operator::GTE,
RowIDs::Vector(vec![]),
);
assert_eq!(
ids,
RowIDs::Vector(vec![3, 9, 10, 11, 12, 13, 14, 15, 16, 17])
);
let ids = drle.row_ids_filter(
Some("east1".to_string()),
cmp::Operator::LTE,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8]));
// "region" sorts between "north" and "south".
let ids = drle.row_ids_filter(
Some("region".to_string()),
cmp::Operator::LT,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 12]));
// "zoo" sorts after every value in the column.
let ids = drle.row_ids_filter(
Some("zoo".to_string()),
cmp::Operator::LTE,
RowIDs::Vector(vec![]),
);
assert_eq!(ids, RowIDs::Vector((0..18).collect::<Vec<u32>>()));
}
// `value` returns the logical value at a row id, and `None` for a row id
// beyond the end of the column.
#[test]
fn value() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
assert_eq!(drle.value(3), &Some("north".to_string()));
assert_eq!(drle.value(0), &Some("east".to_string()));
assert_eq!(drle.value(10), &Some("south".to_string()));
// Row 22 does not exist.
assert_eq!(drle.value(22), &None);
}
// `values` materialises the values at the requested row ids, reuses the
// destination's allocation and returns nothing for out-of-range row ids.
#[test]
fn values() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
drle.push_none(); // 11
let mut dst = Vec::with_capacity(1000);
dst = drle.values(&[0, 1, 3, 4], dst);
assert_eq!(
dst,
vec![
&Some("east".to_string()),
&Some("east".to_string()),
&Some("north".to_string()),
&Some("east".to_string())
]
);
dst = drle.values(&[8, 10, 11], dst);
assert_eq!(
dst,
vec![&Some("east".to_string()), &Some("south".to_string()), &None]
);
// The destination buffer was reused rather than reallocated.
assert_eq!(dst.capacity(), 1000);
// Row 1000 does not exist.
assert!(drle.values(&[1000], dst).is_empty());
}
// `distinct_values` returns the set of non-null values found at the
// requested row ids.
#[test]
fn distinct_values() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 100);
let values = drle.distinct_values((0..100).collect::<Vec<_>>().as_slice(), BTreeSet::new());
assert_eq!(
values,
vec!["east".to_string()].iter().collect::<BTreeSet<_>>()
);
drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
drle.push_none(); // 11
let values = drle.distinct_values((0..11).collect::<Vec<_>>().as_slice(), BTreeSet::new());
assert_eq!(
values,
vec!["east".to_string(), "north".to_string(), "south".to_string(),]
.iter()
.collect::<BTreeSet<_>>()
);
let values = drle.distinct_values((0..4).collect::<Vec<_>>().as_slice(), BTreeSet::new());
assert_eq!(
values,
vec!["east".to_string(), "north".to_string(),]
.iter()
.collect::<BTreeSet<_>>()
);
let values = drle.distinct_values(&[3, 10], BTreeSet::new());
assert_eq!(
values,
vec!["north".to_string(), "south".to_string(),]
.iter()
.collect::<BTreeSet<_>>()
);
// Row 100 does not exist.
let values = drle.distinct_values(&[100], BTreeSet::new());
assert!(values.is_empty());
}
// `contains_other_values` is true while the column holds a non-null value
// missing from the supplied set, and false once the set covers them all.
#[test]
fn contains_other_values() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3);
drle.push_additional(Some("north".to_string()), 1);
drle.push_additional(Some("east".to_string()), 5);
drle.push_additional(Some("south".to_string()), 2);
drle.push_none();
let east = "east".to_string();
let north = "north".to_string();
let south = "south".to_string();
let mut others = BTreeSet::new();
others.insert(&east);
others.insert(&north);
// "south" is missing from the set.
assert!(drle.contains_other_values(&others));
let f1 = "foo".to_string();
others.insert(&f1);
// Same size as the column's non-null dictionary, but "south" is still missing.
assert!(drle.contains_other_values(&others));
others.insert(&south);
assert!(!drle.contains_other_values(&others));
let f2 = "bar".to_string();
others.insert(&f2);
// Extra values in the set make no difference.
assert!(!drle.contains_other_values(&others));
assert!(drle.contains_other_values(&BTreeSet::new()));
}
// `has_non_null_value` is true only when at least one requested row id holds
// a non-null value.
#[test]
fn has_non_null_value() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // 0,1,2
drle.push_additional(Some("north".to_string()), 1); // 3
drle.push_additional(Some("east".to_string()), 5); // 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // 9,10
drle.push_none(); // 11
assert!(drle.has_non_null_value(&[0]));
assert!(drle.has_non_null_value(&[0, 1, 2]));
assert!(drle.has_non_null_value(&[10]));
// Row 11 is NULL; rows 12 and 100 do not exist.
assert!(!drle.has_non_null_value(&[11]));
assert!(!drle.has_non_null_value(&[11, 12, 100]));
// A column made up entirely of NULL rows.
drle = super::RLE::default();
drle.push_additional(None, 10);
assert!(!drle.has_non_null_value(&[0]));
assert!(!drle.has_non_null_value(&[4, 7]));
}
// `encoded_values` returns the dictionary id stored at each requested row
// id, including the id assigned to NULL.
#[test]
fn encoded_values() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // id 0; rows 0,1,2
drle.push_additional(Some("north".to_string()), 1); // id 1; row 3
drle.push_additional(Some("east".to_string()), 5); // id 0; rows 4,5,6,7,8
drle.push_additional(Some("south".to_string()), 2); // id 2; rows 9,10
drle.push_none(); // id 3; row 11
let mut encoded = drle.encoded_values(&[0], vec![]);
assert_eq!(encoded, vec![0]);
encoded = drle.encoded_values(&[1, 3, 5, 6], vec![]);
assert_eq!(encoded, vec![0, 1, 0, 0]);
encoded = drle.encoded_values(&[9, 10, 11], vec![]);
assert_eq!(encoded, vec![2, 2, 3]);
}
// `all_encoded_values` expands every run into one dictionary id per row and
// reuses the destination's allocation.
#[test]
fn all_encoded_values() {
let mut drle = super::RLE::default();
drle.push_additional(Some("east".to_string()), 3); // id 0
drle.push_additional(Some("north".to_string()), 2); // id 1
let dst = Vec::with_capacity(100);
let dst = drle.all_encoded_values(dst);
assert_eq!(dst, vec![0, 0, 0, 1, 1]);
assert_eq!(dst.capacity(), 100);
}
}