From a5b554d2c3f5c948455879b7600606916afb02b9 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 1 Jun 2021 16:41:35 +0100 Subject: [PATCH] feat: add RLE support to integer encodings --- read_buffer/src/column/integer.rs | 738 +++++++++++++++++++++++++++--- 1 file changed, 669 insertions(+), 69 deletions(-) diff --git a/read_buffer/src/column/integer.rs b/read_buffer/src/column/integer.rs index 6eff0fb2cc..0fed2ce163 100644 --- a/read_buffer/src/column/integer.rs +++ b/read_buffer/src/column/integer.rs @@ -1,15 +1,17 @@ use std::fmt::Display; use std::iter::FromIterator; +use std::mem::size_of; -use arrow::array::PrimitiveArray; -use arrow::{self, array::Array, datatypes::*}; +use arrow::array::{Array, PrimitiveArray}; +use arrow::{self, datatypes::*}; use either::Either; +use rle::RLE; use super::encoding::scalar::{ transcoders::{ByteTrimmer, NoOpTranscoder, Transcoder}, ScalarEncoding, }; -use super::encoding::{scalar::Fixed, scalar::FixedNull}; +use super::encoding::{scalar::rle, scalar::Fixed, scalar::FixedNull}; use super::{cmp, Statistics}; use crate::column::{RowIDs, Scalar, Value, Values}; @@ -247,6 +249,19 @@ impl Display for IntegerEncoding { } } +/// A lever to decide the minimum size in bytes that run-length encoding the +/// column needs to reduce the overall footprint by. 0.1 means that the size of +/// the column must be reduced by at least 10% +pub const MIN_RLE_SIZE_REDUCTION: f64 = 0.1; // 10% + +// Applies a heuristic to decide whether the input data should be encoded using +// run-length encoding. +fn should_rle_from(arr: &[T]) -> bool { + let base_size = arr.len() * size_of::(); + (base_size as f64 - rle::estimated_size_from(arr) as f64) / base_size as f64 + >= MIN_RLE_SIZE_REDUCTION +} + /// Converts a slice of i64 values into an IntegerEncoding. /// /// The most compact physical type needed to store the columnar values is @@ -265,19 +280,26 @@ impl From<&[i64]> for IntegerEncoding { max = max.max(v); } + // If true then use RLE after byte trimming. + let rle = should_rle_from(arr); + // This match is carefully ordered. It prioritises smaller physical // datatypes that can safely represent the provided logical data let transcoder = ByteTrimmer {}; - match (min, max) { + let (enc, name) = match (min, max) { // encode as u8 values (min, max) if min >= 0 && max <= u8::MAX as i64 => { let arr = arr .iter() .map::(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U8-{}", name)) + (enc, format!("BT_U8-{}", name)) } // encode as i8 values (min, max) if min >= i8::MIN as i64 && max <= i8::MAX as i64 => { @@ -285,9 +307,13 @@ impl From<&[i64]> for IntegerEncoding { .iter() .map(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I8-{}", name)) + (enc, format!("BT_I8-{}", name)) } // encode as u16 values (min, max) if min >= 0 && max <= u16::MAX as i64 => { @@ -295,9 +321,13 @@ impl From<&[i64]> for IntegerEncoding { .iter() .map::(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U16-{}", name)) + (enc, format!("BT_U16-{}", name)) } // encode as i16 values (min, max) if min >= i16::MIN as i64 && max <= i16::MAX as i64 => { @@ -305,9 +335,13 @@ impl From<&[i64]> for IntegerEncoding { .iter() .map(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I16-{}", name)) + (enc, format!("BT_I16-{}", name)) } // encode as u32 values (min, max) if min >= 0 && max <= u32::MAX as i64 => { @@ -315,9 +349,13 @@ impl From<&[i64]> for IntegerEncoding { .iter() .map(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U32-{}", name)) + (enc, format!("BT_U32-{}", name)) } // encode as i32 values (min, max) if min >= i32::MIN as i64 && max <= i32::MAX as i64 => { @@ -325,20 +363,40 @@ impl From<&[i64]> for IntegerEncoding { .iter() .map(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I32-{}", name)) + (enc, format!("BT_I32-{}", name)) } // otherwise, encode with the same physical type (i64) (_, _) => { - let enc = Box::new(Fixed::new(arr.to_vec(), NoOpTranscoder {})); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter( + arr.to_vec().into_iter(), + NoOpTranscoder {}, + )) + } else { + Box::new(Fixed::new(arr.to_vec(), NoOpTranscoder {})) + }; let name = enc.name(); - Self::I64(enc, format!("None-{}", name)) + (enc, name.to_owned()) } - } + }; + Self::I64(enc, name) } } +// Applies a heuristic to decide whether the input data should be encoded using +// run-length encoding. +fn should_rle_from_iter(len: usize, iter: impl Iterator>) -> bool { + let base_size = len * size_of::(); + (base_size as f64 - rle::estimated_size_from_iter(iter) as f64) / base_size as f64 + >= MIN_RLE_SIZE_REDUCTION +} + /// Converts an Arrow array into an IntegerEncoding. /// /// The most compact physical Arrow array type is used to store the column @@ -353,17 +411,23 @@ impl From for IntegerEncoding { let min = arrow::compute::kernels::aggregate::min(&arr); let max = arrow::compute::kernels::aggregate::max(&arr); + // If true then use RLE after byte trimming. + let rle = should_rle_from_iter(arr.len(), arr.iter()); + // This match is carefully ordered. It prioritises smaller physical // datatypes that can safely represent the provided logical data let transcoder = ByteTrimmer {}; - match (min, max) { - // data is all NULL. Store as single byte column for now. - // TODO(edd): this will be smaller when stored using RLE + let (enc, name) = match (min, max) { + // data is all NULL. Store u8 RLE (None, None) => { let arr = PrimitiveArray::from_iter(arr.iter().map::, _>(|_| None)); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U8-{}", name)) + (enc, format!("BT_U8-{}", name)) } // encode as u8 values (min, max) if min >= Some(0) && max <= Some(u8::MAX as i64) => { @@ -371,9 +435,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u8 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U8-{}", name)) + (enc, format!("BT_U8-{}", name)) } // encode as i8 values (min, max) if min >= Some(i8::MIN as i64) && max <= Some(i8::MAX as i64) => { @@ -381,9 +449,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i8 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I8-{}", name)) + (enc, format!("BT_I8-{}", name)) } // encode as u16 values (min, max) if min >= Some(0) && max <= Some(u16::MAX as i64) => { @@ -391,9 +463,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u16 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U16-{}", name)) + (enc, format!("BT_U16-{}", name)) } // encode as i16 values (min, max) if min >= Some(i16::MIN as i64) && max <= Some(i16::MAX as i64) => { @@ -401,9 +477,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i16 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I16-{}", name)) + (enc, format!("BT_I16-{}", name)) } // encode as u32 values (min, max) if min >= Some(0) && max <= Some(u32::MAX as i64) => { @@ -411,9 +491,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as u32 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_U32-{}", name)) + (enc, format!("BT_U32-{}", name)) } // encode as i32 values (min, max) if min >= Some(i32::MIN as i64) && max <= Some(i32::MAX as i64) => { @@ -421,17 +505,26 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode i64 as i32 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::I64(enc, format!("BT_I32-{}", name)) + (enc, format!("BT_I32-{}", name)) } // otherwise, encode with the same physical type (i64) (_, _) => { - let enc = Box::new(FixedNull::::new(arr, NoOpTranscoder {})); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), NoOpTranscoder {})) + } else { + Box::new(FixedNull::::new(arr, NoOpTranscoder {})) + }; let name = enc.name(); - Self::I64(enc, format!("None-{}", name)) + (enc, name.to_owned()) } - } + }; + Self::I64(enc, name) } } @@ -449,47 +542,67 @@ impl From<&[u64]> for IntegerEncoding { max = max.max(v); } + // If true then use RLE after byte trimming. + let rle = should_rle_from(arr); + // This match is carefully ordered. It prioritises smaller physical // datatypes that can safely represent the provided logical data let transcoder = ByteTrimmer {}; - match max { + let (enc, name) = match max { // encode as u8 values max if max <= u8::MAX as u64 => { let arr = arr .iter() - .map::(|v| transcoder.encode(*v)) // u64 -> u8 - .collect::>(); - let enc = Box::new(Fixed::::new(arr, transcoder)); + .map::(|v| transcoder.encode(*v)) + .collect::>(); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U8-{}", name)) + (enc, format!("BT_U8-{}", name)) } // encode as u16 values max if max <= u16::MAX as u64 => { let arr = arr .iter() - .map::(|v| transcoder.encode(*v)) // u64 -> u16 + .map::(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U16-{}", name)) + (enc, format!("BT_U16-{}", name)) } // encode as u32 values max if max <= u32::MAX as u64 => { let arr = arr .iter() - .map::(|v| transcoder.encode(*v)) // u64 -> u32 + .map(|v| transcoder.encode(*v)) .collect::>(); - let enc = Box::new(Fixed::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U32-{}", name)) + (enc, format!("BT_U32-{}", name)) } - // otherwise, encode with the same physical type (u64) + // otherwise, encode with the same physical type (i64) _ => { - let enc = Box::new(Fixed::::new(arr.to_vec(), NoOpTranscoder {})); // no transcoding needed + let enc: Box> = if rle { + Box::new(RLE::new_from_iter(arr.to_vec().into_iter(), transcoder)) + } else { + Box::new(Fixed::new(arr.to_vec(), transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("None-{}", name)) + (enc, name.to_owned()) } - } + }; + Self::U64(enc, name) } } @@ -506,19 +619,38 @@ impl From for IntegerEncoding { // determine max value. let max = arrow::compute::kernels::aggregate::max(&arr); + // If true then use RLE after byte trimming. + let rle = should_rle_from_iter(arr.len(), arr.iter()); + // This match is carefully ordered. It prioritises smaller physical // datatypes that can safely represent the provided logical data let transcoder = ByteTrimmer {}; - match max { + let (enc, name) = match max { + // data is all NULL. Store as single byte column for now. + // TODO(edd): this will be smaller when stored using RLE + None => { + let arr = PrimitiveArray::from_iter(arr.iter().map::, _>(|_| None)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; + let name = enc.name(); + (enc, format!("BT_U8-{}", name)) + } // encode as u8 values max if max <= Some(u8::MAX as u64) => { let arr = PrimitiveArray::from_iter( arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u8 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U8-{}", name)) + (enc, format!("BT_U8-{}", name)) } // encode as u16 values max if max <= Some(u16::MAX as u64) => { @@ -526,9 +658,13 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u16 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U16-{}", name)) + (enc, format!("BT_U16-{}", name)) } // encode as u32 values max if max <= Some(u32::MAX as u64) => { @@ -536,25 +672,35 @@ impl From for IntegerEncoding { arr.into_iter().map(|v| v.map(|v| transcoder.encode(v))), // encode u64 as u32 ); - let enc = Box::new(FixedNull::::new(arr, transcoder)); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), transcoder)) + } else { + Box::new(FixedNull::::new(arr, transcoder)) + }; let name = enc.name(); - Self::U64(enc, format!("BT_U32-{}", name)) + (enc, format!("BT_U32-{}", name)) } // otherwise, encode with the same physical type (u64) _ => { - let enc = Box::new(FixedNull::::new(arr, NoOpTranscoder {})); + let enc: Box> = if rle { + Box::new(RLE::new_from_iter_opt(arr.iter(), NoOpTranscoder {})) + } else { + Box::new(FixedNull::::new(arr, NoOpTranscoder {})) + }; let name = enc.name(); - Self::U64(enc, format!("None-{}", name)) + (enc, name.to_owned()) } - } + }; + Self::U64(enc, name) } } #[cfg(test)] mod test { - use arrow::array::{Int64Array, UInt64Array}; + use itertools::Itertools; use super::*; + use arrow::array::{Int64Array, UInt64Array}; #[test] // Tests that input data gets byte trimmed correctly. @@ -570,9 +716,133 @@ mod test { (vec![0_i64, 2, 245, i64::MIN], 56), // i64 fixed array ]; - for (case, size) in cases.into_iter() { + for (case, name) in cases.into_iter() { let enc = IntegerEncoding::from(case.as_slice()); - assert_eq!(enc.size(), size, "failed: {:?}", enc); + assert_eq!(enc.size(), name, "failed: {:?}", enc); + } + } + + #[test] + // Tests that input data gets encoded correctly + fn from_slice_i64_encoding() { + let cases = vec![ + ( + vec![1_i64; 1000] + .into_iter() + .interleave(vec![2_i64; 1000].into_iter()) // 1,2,1,2,1,2.... + .collect::>(), + // byte trimmed to u8 + "BT_U8-FIXED", + ), + ( + vec![1_i64; 1000] + .into_iter() + .chain(vec![2_i64; 1000]) // 1,1,1,1,1,2,2,2,2,2,2.... + .collect::>(), + // byte trimmed to u8 then RLE. + "BT_U8-RLE", + ), + ( + vec![-1_i64; 1000] + .into_iter() + .interleave(vec![-2_i64; 1000].into_iter()) + .collect::>(), + // byte trimmed to i8 + "BT_I8-FIXED", + ), + ( + vec![-1_i64; 1000] + .into_iter() + .chain(vec![-2_i64; 1000]) + .collect::>(), + // byte trimmed to i8 then RLE. + "BT_I8-RLE", + ), + ( + vec![500_i64; 1000] + .into_iter() + .interleave(vec![600_i64; 1000].into_iter()) + .collect::>(), + // byte trimmed to u16 + "BT_U16-FIXED", + ), + ( + vec![500_i64; 1000] + .into_iter() + .chain(vec![600_i64; 1000]) + .collect::>(), + // byte trimmed to u16 then RLE. + "BT_U16-RLE", + ), + ( + vec![-500_i64; 1000] + .into_iter() + .interleave(vec![-600_i64; 1000].into_iter()) + .collect::>(), + // byte trimmed to i16 + "BT_I16-FIXED", + ), + ( + vec![-500_i64; 1000] + .into_iter() + .chain(vec![-600_i64; 1000]) + .collect::>(), + // byte trimmed to i16 then RLE. + "BT_I16-RLE", + ), + ( + vec![100_000_i64; 1000] + .into_iter() + .interleave(vec![200_000_i64; 1000].into_iter()) + .collect::>(), + // byte trimmed to u32 + "BT_U32-FIXED", + ), + ( + vec![100_000_i64; 1000] + .into_iter() + .chain(vec![200_000_i64; 1000]) + .collect::>(), + // byte trimmed to u16 then RLE. + "BT_U32-RLE", + ), + ( + vec![-100_000_i64; 1000] + .into_iter() + .interleave(vec![-200_000_i64; 1000].into_iter()) + .collect::>(), + // byte trimmed to i32 + "BT_I32-FIXED", + ), + ( + vec![-100_000_i64; 1000] + .into_iter() + .chain(vec![-200_000_i64; 1000]) + .collect::>(), + // byte trimmed to i16 then RLE. + "BT_I32-RLE", + ), + ( + vec![i64::MIN; 1000] + .into_iter() + .interleave(vec![i64::MIN + 1; 1000].into_iter()) + .collect::>(), + // vanilla i64 + "FIXED", + ), + ( + vec![i64::MIN; 1000] + .into_iter() + .chain(vec![i64::MIN + 1; 1000]) + .collect::>(), + // RLE i64 + "RLE", + ), + ]; + + for (case, name) in cases.into_iter() { + let enc = IntegerEncoding::from(case.as_slice()); + assert_eq!(enc.name(), name, "failed: {:?}", enc); } } @@ -591,6 +861,82 @@ mod test { } } + #[test] + // Tests that input data gets encoded correctly + fn from_slice_u64_encoding() { + let cases = vec![ + ( + vec![1_u64; 1000] + .into_iter() + .interleave(vec![2_u64; 1000].into_iter()) // 1,2,1,2,1,2.... + .collect::>(), + // byte trimmed to u8 + "BT_U8-FIXED", + ), + ( + vec![1_u64; 1000] + .into_iter() + .chain(vec![2_u64; 1000]) // 1,1,1,1,1,2,2,2,2,2,2.... + .collect::>(), + // byte trimmed to u8 then RLE. + "BT_U8-RLE", + ), + ( + vec![500_u64; 1000] + .into_iter() + .interleave(vec![600_u64; 1000].into_iter()) + .collect::>(), + // byte trimmed to u16 + "BT_U16-FIXED", + ), + ( + vec![500_u64; 1000] + .into_iter() + .chain(vec![600_u64; 1000]) + .collect::>(), + // byte trimmed to u16 then RLE. + "BT_U16-RLE", + ), + ( + vec![100_000_u64; 1000] + .into_iter() + .interleave(vec![200_000_u64; 1000].into_iter()) + .collect::>(), + // byte trimmed to u32 + "BT_U32-FIXED", + ), + ( + vec![100_000_u64; 1000] + .into_iter() + .chain(vec![200_000_u64; 1000]) + .collect::>(), + // byte trimmed to u16 then RLE. + "BT_U32-RLE", + ), + ( + vec![u64::MAX; 1000] + .into_iter() + .interleave(vec![u64::MAX - 1; 1000].into_iter()) + .collect::>(), + // vanilla u64 + "FIXED", + ), + ( + vec![u64::MAX; 1000] + .into_iter() + .chain(vec![u64::MAX - 1; 1000]) + .collect::>(), + // RLE u64 + "RLE", + ), + ]; + + for (case, name) in cases.into_iter() { + let enc = IntegerEncoding::from(case.as_slice()); + assert_eq!(enc.name(), name, "failed: {:?}", enc); + } + } + #[test] fn from_arrow_i64_array() { let cases = vec![ @@ -623,13 +969,170 @@ mod test { (vec![None, Some(u32::MAX as i64 + 1)], 344), //u64 ]; - for (case, size) in cases.iter().cloned() { + for (case, name) in cases.iter().cloned() { let arr = Int64Array::from(case); let enc = IntegerEncoding::from(arr); - assert_eq!(enc.size(), size, "failed: {:?}", enc); + assert_eq!(enc.size(), name, "failed: {:?}", enc); } } + #[test] + // Tests that input data gets encoded correctly + fn from_arrow_i64_array_encoding() { + let cases = vec![ + ( + Int64Array::from( + vec![1_i64; 1000] + .into_iter() + .interleave(vec![2_i64; 1000].into_iter()) // 1,2,1,2,1,2.... + .collect::>(), + ), + // byte trimmed to u8 + "BT_U8-FIXED", + ), + ( + Int64Array::from( + vec![1_i64; 1000] + .into_iter() + .chain(vec![2_i64; 1000]) // 1,1,1,1,1,2,2,2,2,2,2.... + .collect::>(), + ), + // byte trimmed to u8 then RLE. + "BT_U8-RLE", + ), + ( + Int64Array::from( + vec![-1_i64; 1000] + .into_iter() + .interleave(vec![-2_i64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to i8 + "BT_I8-FIXED", + ), + ( + Int64Array::from( + vec![-1_i64; 1000] + .into_iter() + .chain(vec![-2_i64; 1000]) + .collect::>(), + ), + // byte trimmed to i8 then RLE. + "BT_I8-RLE", + ), + ( + Int64Array::from( + vec![500_i64; 1000] + .into_iter() + .interleave(vec![600_i64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to u16 + "BT_U16-FIXED", + ), + ( + Int64Array::from( + vec![500_i64; 1000] + .into_iter() + .chain(vec![600_i64; 1000]) + .collect::>(), + ), + // byte trimmed to u16 then RLE. + "BT_U16-RLE", + ), + ( + Int64Array::from( + vec![-500_i64; 1000] + .into_iter() + .interleave(vec![-600_i64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to i16 + "BT_I16-FIXED", + ), + ( + Int64Array::from( + vec![-500_i64; 1000] + .into_iter() + .chain(vec![-600_i64; 1000]) + .collect::>(), + ), + // byte trimmed to i16 then RLE. + "BT_I16-RLE", + ), + ( + Int64Array::from( + vec![100_000_i64; 1000] + .into_iter() + .interleave(vec![200_000_i64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to u32 + "BT_U32-FIXED", + ), + ( + Int64Array::from( + vec![100_000_i64; 1000] + .into_iter() + .chain(vec![200_000_i64; 1000]) + .collect::>(), + ), + // byte trimmed to u16 then RLE. + "BT_U32-RLE", + ), + ( + Int64Array::from( + vec![-100_000_i64; 1000] + .into_iter() + .interleave(vec![-200_000_i64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to i32 + "BT_I32-FIXED", + ), + ( + Int64Array::from( + vec![-100_000_i64; 1000] + .into_iter() + .chain(vec![-200_000_i64; 1000]) + .collect::>(), + ), + // byte trimmed to i16 then RLE. + "BT_I32-RLE", + ), + ( + Int64Array::from( + vec![i64::MIN; 1000] + .into_iter() + .interleave(vec![i64::MIN + 1; 1000].into_iter()) + .collect::>(), + ), + // vanilla i64 + "FIXED", + ), + ( + Int64Array::from( + vec![i64::MIN; 1000] + .into_iter() + .chain(vec![i64::MIN + 1; 1000]) + .collect::>(), + ), + // RLE i64 + "RLE", + ), + ]; + + for (case, name) in cases.into_iter() { + let enc = IntegerEncoding::from(case); + assert_eq!(enc.name(), name, "failed: {:?}", enc); + } + + // All NULL array + let arr = Int64Array::from(vec![None; 1000].into_iter().collect::>>()); + let enc = IntegerEncoding::from(arr); + assert_eq!(enc.name(), "BT_U8-RLE", "failed: {:?}", enc); + } + #[test] fn from_arrow_u64_array() { let cases = vec![ @@ -661,4 +1164,101 @@ mod test { assert_eq!(enc.size(), size, "failed: {:?}", enc); } } + + #[test] + // Tests that input data gets encoded correctly + fn from_arrow_u64_array_encoding() { + let cases = vec![ + ( + UInt64Array::from( + vec![1_u64; 1000] + .into_iter() + .interleave(vec![2_u64; 1000].into_iter()) // 1,2,1,2,1,2.... + .collect::>(), + ), + // byte trimmed to u8 + "BT_U8-FIXED", + ), + ( + UInt64Array::from( + vec![1_u64; 1000] + .into_iter() + .chain(vec![2_u64; 1000]) // 1,1,1,1,1,2,2,2,2,2,2.... + .collect::>(), + ), + // byte trimmed to u8 then RLE. + "BT_U8-RLE", + ), + ( + UInt64Array::from( + vec![500_u64; 1000] + .into_iter() + .interleave(vec![600_u64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to u16 + "BT_U16-FIXED", + ), + ( + UInt64Array::from( + vec![500_u64; 1000] + .into_iter() + .chain(vec![600_u64; 1000]) + .collect::>(), + ), + // byte trimmed to u16 then RLE. + "BT_U16-RLE", + ), + ( + UInt64Array::from( + vec![100_000_u64; 1000] + .into_iter() + .interleave(vec![200_000_u64; 1000].into_iter()) + .collect::>(), + ), + // byte trimmed to u32 + "BT_U32-FIXED", + ), + ( + UInt64Array::from( + vec![100_000_u64; 1000] + .into_iter() + .chain(vec![200_000_u64; 1000]) + .collect::>(), + ), + // byte trimmed to u16 then RLE. + "BT_U32-RLE", + ), + ( + UInt64Array::from( + vec![u64::MAX; 1000] + .into_iter() + .interleave(vec![u64::MAX - 1; 1000].into_iter()) + .collect::>(), + ), + // vanilla i64 + "FIXED", + ), + ( + UInt64Array::from( + vec![u64::MAX; 1000] + .into_iter() + .chain(vec![u64::MAX - 1; 1000]) + .collect::>(), + ), + // RLE i64 + "RLE", + ), + ]; + + for (case, name) in cases.into_iter() { + let enc = IntegerEncoding::from(case); + assert_eq!(enc.name(), name, "failed: {:?}", enc); + } + + // All NULL array + let arr = UInt64Array::from(vec![None; 1000].into_iter().collect::>>()); + let enc = IntegerEncoding::from(arr); + assert_eq!(enc.name(), "BT_U8-RLE", "failed: {:?}", enc); + } }