feat: add RLE support to integer encodings

pull/24376/head
Edd Robinson 2021-06-01 16:41:35 +01:00
parent 71598d9b3e
commit a5b554d2c3
1 changed files with 669 additions and 69 deletions

View File

@ -1,15 +1,17 @@
use std::fmt::Display;
use std::iter::FromIterator;
use std::mem::size_of;
use arrow::array::PrimitiveArray;
use arrow::{self, array::Array, datatypes::*};
use arrow::array::{Array, PrimitiveArray};
use arrow::{self, datatypes::*};
use either::Either;
use rle::RLE;
use super::encoding::scalar::{
transcoders::{ByteTrimmer, NoOpTranscoder, Transcoder},
ScalarEncoding,
};
use super::encoding::{scalar::Fixed, scalar::FixedNull};
use super::encoding::{scalar::rle, scalar::Fixed, scalar::FixedNull};
use super::{cmp, Statistics};
use crate::column::{RowIDs, Scalar, Value, Values};
@ -247,6 +249,19 @@ impl Display for IntegerEncoding {
}
}
/// The minimum fraction by which run-length encoding must shrink a column's
/// footprint, relative to its fixed-width size, before RLE is chosen. 0.1 means
/// that the size of the column must be reduced by at least 10%.
pub const MIN_RLE_SIZE_REDUCTION: f64 = 0.1; // 10%
/// Heuristic deciding whether a slice is worth run-length encoding.
///
/// The fixed-width footprint is taken as `arr.len() * size_of::<T>()` and the
/// RLE footprint as `rle::estimated_size_from(arr)`. Returns `true` when the
/// saving, expressed as a fraction of the fixed-width footprint, is at least
/// `MIN_RLE_SIZE_REDUCTION`.
///
/// For an empty slice the fixed-width footprint is zero, so the fraction is
/// not a finite number; assuming the estimate is never negative, the
/// comparison then evaluates to `false`.
fn should_rle_from<T: PartialOrd>(arr: &[T]) -> bool {
    let fixed_bytes = (arr.len() * size_of::<T>()) as f64;
    let rle_bytes = rle::estimated_size_from(arr) as f64;

    // Fraction of the fixed-width footprint that RLE is expected to save.
    let saving = (fixed_bytes - rle_bytes) / fixed_bytes;
    saving >= MIN_RLE_SIZE_REDUCTION
}
/// Converts a slice of i64 values into an IntegerEncoding.
///
/// The most compact physical type needed to store the columnar values is
@ -265,19 +280,26 @@ impl From<&[i64]> for IntegerEncoding {
max = max.max(v);
}
// If true then use RLE after byte trimming.
let rle = should_rle_from(arr);
// This match is carefully ordered. It prioritises smaller physical
// datatypes that can safely represent the provided logical data
let transcoder = ByteTrimmer {};
match (min, max) {
let (enc, name) = match (min, max) {
// encode as u8 values
(min, max) if min >= 0 && max <= u8::MAX as i64 => {
let arr = arr
.iter()
.map::<u8, _>(|v| transcoder.encode(*v))
.collect::<Vec<_>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U8-{}", name))
(enc, format!("BT_U8-{}", name))
}
// encode as i8 values
(min, max) if min >= i8::MIN as i64 && max <= i8::MAX as i64 => {
@ -285,9 +307,13 @@ impl From<&[i64]> for IntegerEncoding {
.iter()
.map(|v| transcoder.encode(*v))
.collect::<Vec<i8>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I8-{}", name))
(enc, format!("BT_I8-{}", name))
}
// encode as u16 values
(min, max) if min >= 0 && max <= u16::MAX as i64 => {
@ -295,9 +321,13 @@ impl From<&[i64]> for IntegerEncoding {
.iter()
.map::<u16, _>(|v| transcoder.encode(*v))
.collect::<Vec<u16>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U16-{}", name))
(enc, format!("BT_U16-{}", name))
}
// encode as i16 values
(min, max) if min >= i16::MIN as i64 && max <= i16::MAX as i64 => {
@ -305,9 +335,13 @@ impl From<&[i64]> for IntegerEncoding {
.iter()
.map(|v| transcoder.encode(*v))
.collect::<Vec<i16>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I16-{}", name))
(enc, format!("BT_I16-{}", name))
}
// encode as u32 values
(min, max) if min >= 0 && max <= u32::MAX as i64 => {
@ -315,9 +349,13 @@ impl From<&[i64]> for IntegerEncoding {
.iter()
.map(|v| transcoder.encode(*v))
.collect::<Vec<u32>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U32-{}", name))
(enc, format!("BT_U32-{}", name))
}
// encode as i32 values
(min, max) if min >= i32::MIN as i64 && max <= i32::MAX as i64 => {
@ -325,20 +363,40 @@ impl From<&[i64]> for IntegerEncoding {
.iter()
.map(|v| transcoder.encode(*v))
.collect::<Vec<i32>>();
let enc = Box::new(Fixed::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I32-{}", name))
(enc, format!("BT_I32-{}", name))
}
// otherwise, encode with the same physical type (i64)
(_, _) => {
let enc = Box::new(Fixed::new(arr.to_vec(), NoOpTranscoder {}));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter(
arr.to_vec().into_iter(),
NoOpTranscoder {},
))
} else {
Box::new(Fixed::new(arr.to_vec(), NoOpTranscoder {}))
};
let name = enc.name();
Self::I64(enc, format!("None-{}", name))
(enc, name.to_owned())
}
}
};
Self::I64(enc, name)
}
}
/// Heuristic deciding whether a stream of optional values is worth run-length
/// encoding.
///
/// `len` is the number of values the caller expects `iter` to yield; the
/// fixed-width footprint is taken as `len * size_of::<T>()` and the RLE
/// footprint as `rle::estimated_size_from_iter(iter)`. Returns `true` when the
/// saving, expressed as a fraction of the fixed-width footprint, is at least
/// `MIN_RLE_SIZE_REDUCTION`.
fn should_rle_from_iter<T: PartialOrd>(len: usize, iter: impl Iterator<Item = Option<T>>) -> bool {
    let fixed_bytes = (len * size_of::<T>()) as f64;
    let rle_bytes = rle::estimated_size_from_iter(iter) as f64;

    // Fraction of the fixed-width footprint that RLE is expected to save.
    let saving = (fixed_bytes - rle_bytes) / fixed_bytes;
    saving >= MIN_RLE_SIZE_REDUCTION
}
/// Converts an Arrow array into an IntegerEncoding.
///
/// The most compact physical Arrow array type is used to store the column
@ -353,17 +411,23 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
let min = arrow::compute::kernels::aggregate::min(&arr);
let max = arrow::compute::kernels::aggregate::max(&arr);
// If true then use RLE after byte trimming.
let rle = should_rle_from_iter(arr.len(), arr.iter());
// This match is carefully ordered. It prioritises smaller physical
// datatypes that can safely represent the provided logical data
let transcoder = ByteTrimmer {};
match (min, max) {
// data is all NULL. Store as single byte column for now.
// TODO(edd): this will be smaller when stored using RLE
let (enc, name) = match (min, max) {
// data is all NULL. Store u8 RLE
(None, None) => {
let arr = PrimitiveArray::from_iter(arr.iter().map::<Option<u8>, _>(|_| None));
let enc = Box::new(FixedNull::<UInt8Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt8Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U8-{}", name))
(enc, format!("BT_U8-{}", name))
}
// encode as u8 values
(min, max) if min >= Some(0) && max <= Some(u8::MAX as i64) => {
@ -371,9 +435,13 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u8
);
let enc = Box::new(FixedNull::<UInt8Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt8Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U8-{}", name))
(enc, format!("BT_U8-{}", name))
}
// encode as i8 values
(min, max) if min >= Some(i8::MIN as i64) && max <= Some(i8::MAX as i64) => {
@ -381,9 +449,13 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i8
);
let enc = Box::new(FixedNull::<Int8Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<Int8Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I8-{}", name))
(enc, format!("BT_I8-{}", name))
}
// encode as u16 values
(min, max) if min >= Some(0) && max <= Some(u16::MAX as i64) => {
@ -391,9 +463,13 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u16
);
let enc = Box::new(FixedNull::<UInt16Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt16Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U16-{}", name))
(enc, format!("BT_U16-{}", name))
}
// encode as i16 values
(min, max) if min >= Some(i16::MIN as i64) && max <= Some(i16::MAX as i64) => {
@ -401,9 +477,13 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i16
);
let enc = Box::new(FixedNull::<Int16Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<Int16Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I16-{}", name))
(enc, format!("BT_I16-{}", name))
}
// encode as u32 values
(min, max) if min >= Some(0) && max <= Some(u32::MAX as i64) => {
@ -411,9 +491,13 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u32
);
let enc = Box::new(FixedNull::<UInt32Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt32Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_U32-{}", name))
(enc, format!("BT_U32-{}", name))
}
// encode as i32 values
(min, max) if min >= Some(i32::MIN as i64) && max <= Some(i32::MAX as i64) => {
@ -421,17 +505,26 @@ impl From<arrow::array::Int64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i32
);
let enc = Box::new(FixedNull::<Int32Type, i64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<Int32Type, i64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::I64(enc, format!("BT_I32-{}", name))
(enc, format!("BT_I32-{}", name))
}
// otherwise, encode with the same physical type (i64)
(_, _) => {
let enc = Box::new(FixedNull::<Int64Type, i64, _>::new(arr, NoOpTranscoder {}));
let enc: Box<dyn ScalarEncoding<i64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), NoOpTranscoder {}))
} else {
Box::new(FixedNull::<Int64Type, i64, _>::new(arr, NoOpTranscoder {}))
};
let name = enc.name();
Self::I64(enc, format!("None-{}", name))
(enc, name.to_owned())
}
}
};
Self::I64(enc, name)
}
}
@ -449,47 +542,67 @@ impl From<&[u64]> for IntegerEncoding {
max = max.max(v);
}
// If true then use RLE after byte trimming.
let rle = should_rle_from(arr);
// This match is carefully ordered. It prioritises smaller physical
// datatypes that can safely represent the provided logical data
let transcoder = ByteTrimmer {};
match max {
let (enc, name) = match max {
// encode as u8 values
max if max <= u8::MAX as u64 => {
let arr = arr
.iter()
.map::<u8, _>(|v| transcoder.encode(*v)) // u64 -> u8
.collect::<Vec<u8>>();
let enc = Box::new(Fixed::<u8, u64, _>::new(arr, transcoder));
.map::<u8, _>(|v| transcoder.encode(*v))
.collect::<Vec<_>>();
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U8-{}", name))
(enc, format!("BT_U8-{}", name))
}
// encode as u16 values
max if max <= u16::MAX as u64 => {
let arr = arr
.iter()
.map::<u16, _>(|v| transcoder.encode(*v)) // u64 -> u16
.map::<u16, _>(|v| transcoder.encode(*v))
.collect::<Vec<u16>>();
let enc = Box::new(Fixed::<u16, u64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U16-{}", name))
(enc, format!("BT_U16-{}", name))
}
// encode as u32 values
max if max <= u32::MAX as u64 => {
let arr = arr
.iter()
.map::<u32, _>(|v| transcoder.encode(*v)) // u64 -> u32
.map(|v| transcoder.encode(*v))
.collect::<Vec<u32>>();
let enc = Box::new(Fixed::<u32, u64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter(arr.into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U32-{}", name))
(enc, format!("BT_U32-{}", name))
}
// otherwise, encode with the same physical type (u64)
// otherwise, encode with the same physical type (i64)
_ => {
let enc = Box::new(Fixed::<u64, u64, _>::new(arr.to_vec(), NoOpTranscoder {})); // no transcoding needed
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter(arr.to_vec().into_iter(), transcoder))
} else {
Box::new(Fixed::new(arr.to_vec(), transcoder))
};
let name = enc.name();
Self::U64(enc, format!("None-{}", name))
(enc, name.to_owned())
}
}
};
Self::U64(enc, name)
}
}
@ -506,19 +619,38 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {
// determine max value.
let max = arrow::compute::kernels::aggregate::max(&arr);
// If true then use RLE after byte trimming.
let rle = should_rle_from_iter(arr.len(), arr.iter());
// This match is carefully ordered. It prioritises smaller physical
// datatypes that can safely represent the provided logical data
let transcoder = ByteTrimmer {};
match max {
let (enc, name) = match max {
// data is all NULL. Store as single byte column for now.
// TODO(edd): this will be smaller when stored using RLE
None => {
let arr = PrimitiveArray::from_iter(arr.iter().map::<Option<u8>, _>(|_| None));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt8Type, u64, _>::new(arr, transcoder))
};
let name = enc.name();
(enc, format!("BT_U8-{}", name))
}
// encode as u8 values
max if max <= Some(u8::MAX as u64) => {
let arr = PrimitiveArray::from_iter(
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u8
);
let enc = Box::new(FixedNull::<UInt8Type, u64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt8Type, u64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U8-{}", name))
(enc, format!("BT_U8-{}", name))
}
// encode as u16 values
max if max <= Some(u16::MAX as u64) => {
@ -526,9 +658,13 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u16
);
let enc = Box::new(FixedNull::<UInt16Type, u64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt16Type, u64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U16-{}", name))
(enc, format!("BT_U16-{}", name))
}
// encode as u32 values
max if max <= Some(u32::MAX as u64) => {
@ -536,25 +672,35 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {
arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u32
);
let enc = Box::new(FixedNull::<UInt32Type, u64, _>::new(arr, transcoder));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder))
} else {
Box::new(FixedNull::<UInt32Type, u64, _>::new(arr, transcoder))
};
let name = enc.name();
Self::U64(enc, format!("BT_U32-{}", name))
(enc, format!("BT_U32-{}", name))
}
// otherwise, encode with the same physical type (u64)
_ => {
let enc = Box::new(FixedNull::<UInt64Type, u64, _>::new(arr, NoOpTranscoder {}));
let enc: Box<dyn ScalarEncoding<u64>> = if rle {
Box::new(RLE::new_from_iter_opt(arr.iter(), NoOpTranscoder {}))
} else {
Box::new(FixedNull::<UInt64Type, u64, _>::new(arr, NoOpTranscoder {}))
};
let name = enc.name();
Self::U64(enc, format!("None-{}", name))
(enc, name.to_owned())
}
}
};
Self::U64(enc, name)
}
}
#[cfg(test)]
mod test {
use arrow::array::{Int64Array, UInt64Array};
use itertools::Itertools;
use super::*;
use arrow::array::{Int64Array, UInt64Array};
#[test]
// Tests that input data gets byte trimmed correctly.
@ -570,9 +716,133 @@ mod test {
(vec![0_i64, 2, 245, i64::MIN], 56), // i64 fixed array
];
for (case, size) in cases.into_iter() {
for (case, name) in cases.into_iter() {
let enc = IntegerEncoding::from(case.as_slice());
assert_eq!(enc.size(), size, "failed: {:?}", enc);
assert_eq!(enc.size(), name, "failed: {:?}", enc);
}
}
#[test]
// Checks the encoding name chosen for i64 slices: the narrowest physical type
// that holds the values, with RLE expected only when the input is two long runs.
fn from_slice_i64_encoding() {
    // a, b, a, b, ... (2000 values; every run has length one).
    let alternating = |a: i64, b: i64| -> Vec<i64> {
        vec![a; 1000]
            .into_iter()
            .interleave(vec![b; 1000].into_iter())
            .collect::<Vec<i64>>()
    };
    // a, a, ..., b, b, ... (two runs of 1000 values each).
    let two_runs = |a: i64, b: i64| -> Vec<i64> {
        vec![a; 1000]
            .into_iter()
            .chain(vec![b; 1000])
            .collect::<Vec<i64>>()
    };

    let cases = vec![
        // fits in u8
        (alternating(1, 2), "BT_U8-FIXED"),
        (two_runs(1, 2), "BT_U8-RLE"),
        // fits in i8
        (alternating(-1, -2), "BT_I8-FIXED"),
        (two_runs(-1, -2), "BT_I8-RLE"),
        // fits in u16
        (alternating(500, 600), "BT_U16-FIXED"),
        (two_runs(500, 600), "BT_U16-RLE"),
        // fits in i16
        (alternating(-500, -600), "BT_I16-FIXED"),
        (two_runs(-500, -600), "BT_I16-RLE"),
        // fits in u32
        (alternating(100_000, 200_000), "BT_U32-FIXED"),
        (two_runs(100_000, 200_000), "BT_U32-RLE"),
        // fits in i32
        (alternating(-100_000, -200_000), "BT_I32-FIXED"),
        (two_runs(-100_000, -200_000), "BT_I32-RLE"),
        // needs the full i64 width, so no byte-trimming prefix
        (alternating(i64::MIN, i64::MIN + 1), "FIXED"),
        (two_runs(i64::MIN, i64::MIN + 1), "RLE"),
    ];

    for (input, expected) in cases {
        let enc = IntegerEncoding::from(input.as_slice());
        assert_eq!(enc.name(), expected, "failed: {:?}", enc);
    }
}
@ -591,6 +861,82 @@ mod test {
}
}
#[test]
// Checks the encoding name chosen for u64 slices: the narrowest physical type
// that holds the values, with RLE expected only when the input is two long runs.
fn from_slice_u64_encoding() {
    // a, b, a, b, ... (2000 values; every run has length one).
    let alternating = |a: u64, b: u64| -> Vec<u64> {
        vec![a; 1000]
            .into_iter()
            .interleave(vec![b; 1000].into_iter())
            .collect::<Vec<u64>>()
    };
    // a, a, ..., b, b, ... (two runs of 1000 values each).
    let two_runs = |a: u64, b: u64| -> Vec<u64> {
        vec![a; 1000]
            .into_iter()
            .chain(vec![b; 1000])
            .collect::<Vec<u64>>()
    };

    let cases = vec![
        // fits in u8
        (alternating(1, 2), "BT_U8-FIXED"),
        (two_runs(1, 2), "BT_U8-RLE"),
        // fits in u16
        (alternating(500, 600), "BT_U16-FIXED"),
        (two_runs(500, 600), "BT_U16-RLE"),
        // fits in u32
        (alternating(100_000, 200_000), "BT_U32-FIXED"),
        (two_runs(100_000, 200_000), "BT_U32-RLE"),
        // needs the full u64 width, so no byte-trimming prefix
        (alternating(u64::MAX, u64::MAX - 1), "FIXED"),
        (two_runs(u64::MAX, u64::MAX - 1), "RLE"),
    ];

    for (input, expected) in cases {
        let enc = IntegerEncoding::from(input.as_slice());
        assert_eq!(enc.name(), expected, "failed: {:?}", enc);
    }
}
#[test]
fn from_arrow_i64_array() {
let cases = vec![
@ -623,13 +969,170 @@ mod test {
(vec![None, Some(u32::MAX as i64 + 1)], 344), //u64
];
for (case, size) in cases.iter().cloned() {
for (case, name) in cases.iter().cloned() {
let arr = Int64Array::from(case);
let enc = IntegerEncoding::from(arr);
assert_eq!(enc.size(), size, "failed: {:?}", enc);
assert_eq!(enc.size(), name, "failed: {:?}", enc);
}
}
#[test]
// Checks the encoding name chosen for Arrow Int64 arrays: the narrowest
// physical type that holds the values, with RLE expected only when the input
// is two long runs, plus the all-NULL case.
fn from_arrow_i64_array_encoding() {
    // a, b, a, b, ... (2000 values; every run has length one).
    let alternating = |a: i64, b: i64| -> Int64Array {
        Int64Array::from(
            vec![a; 1000]
                .into_iter()
                .interleave(vec![b; 1000].into_iter())
                .collect::<Vec<i64>>(),
        )
    };
    // a, a, ..., b, b, ... (two runs of 1000 values each).
    let two_runs = |a: i64, b: i64| -> Int64Array {
        Int64Array::from(
            vec![a; 1000]
                .into_iter()
                .chain(vec![b; 1000])
                .collect::<Vec<i64>>(),
        )
    };

    let cases = vec![
        // fits in u8
        (alternating(1, 2), "BT_U8-FIXED"),
        (two_runs(1, 2), "BT_U8-RLE"),
        // fits in i8
        (alternating(-1, -2), "BT_I8-FIXED"),
        (two_runs(-1, -2), "BT_I8-RLE"),
        // fits in u16
        (alternating(500, 600), "BT_U16-FIXED"),
        (two_runs(500, 600), "BT_U16-RLE"),
        // fits in i16
        (alternating(-500, -600), "BT_I16-FIXED"),
        (two_runs(-500, -600), "BT_I16-RLE"),
        // fits in u32
        (alternating(100_000, 200_000), "BT_U32-FIXED"),
        (two_runs(100_000, 200_000), "BT_U32-RLE"),
        // fits in i32
        (alternating(-100_000, -200_000), "BT_I32-FIXED"),
        (two_runs(-100_000, -200_000), "BT_I32-RLE"),
        // needs the full i64 width, so no byte-trimming prefix
        (alternating(i64::MIN, i64::MIN + 1), "FIXED"),
        (two_runs(i64::MIN, i64::MIN + 1), "RLE"),
    ];

    for (input, expected) in cases {
        let enc = IntegerEncoding::from(input);
        assert_eq!(enc.name(), expected, "failed: {:?}", enc);
    }

    // An array containing nothing but NULLs is expected to be stored as u8 RLE.
    let all_null: Vec<Option<i64>> = vec![None; 1000];
    let enc = IntegerEncoding::from(Int64Array::from(all_null));
    assert_eq!(enc.name(), "BT_U8-RLE", "failed: {:?}", enc);
}
#[test]
fn from_arrow_u64_array() {
let cases = vec![
@ -661,4 +1164,101 @@ mod test {
assert_eq!(enc.size(), size, "failed: {:?}", enc);
}
}
#[test]
// Checks the encoding name chosen for Arrow UInt64 arrays: the narrowest
// physical type that holds the values, with RLE expected only when the input
// is two long runs, plus the all-NULL case.
fn from_arrow_u64_array_encoding() {
    // a, b, a, b, ... (2000 values; every run has length one).
    let alternating = |a: u64, b: u64| -> UInt64Array {
        UInt64Array::from(
            vec![a; 1000]
                .into_iter()
                .interleave(vec![b; 1000].into_iter())
                .collect::<Vec<u64>>(),
        )
    };
    // a, a, ..., b, b, ... (two runs of 1000 values each).
    let two_runs = |a: u64, b: u64| -> UInt64Array {
        UInt64Array::from(
            vec![a; 1000]
                .into_iter()
                .chain(vec![b; 1000])
                .collect::<Vec<u64>>(),
        )
    };

    let cases = vec![
        // fits in u8
        (alternating(1, 2), "BT_U8-FIXED"),
        (two_runs(1, 2), "BT_U8-RLE"),
        // fits in u16
        (alternating(500, 600), "BT_U16-FIXED"),
        (two_runs(500, 600), "BT_U16-RLE"),
        // fits in u32
        (alternating(100_000, 200_000), "BT_U32-FIXED"),
        (two_runs(100_000, 200_000), "BT_U32-RLE"),
        // needs the full u64 width, so no byte-trimming prefix
        (alternating(u64::MAX, u64::MAX - 1), "FIXED"),
        (two_runs(u64::MAX, u64::MAX - 1), "RLE"),
    ];

    for (input, expected) in cases {
        let enc = IntegerEncoding::from(input);
        assert_eq!(enc.name(), expected, "failed: {:?}", enc);
    }

    // An array containing nothing but NULLs is expected to be stored as u8 RLE.
    let all_null: Vec<Option<u64>> = vec![None; 1000];
    let enc = IntegerEncoding::from(UInt64Array::from(all_null));
    assert_eq!(enc.name(), "BT_U8-RLE", "failed: {:?}", enc);
}
}