test: add test for float encoding rules
parent
1ac949e7ea
commit
1fa08d0de5
|
@ -1,5 +1,4 @@
|
||||||
use std::cmp::Ordering;
|
use std::{cmp::Ordering, mem::size_of};
|
||||||
use std::mem::size_of;
|
|
||||||
|
|
||||||
use arrow::{self, array::Array};
|
use arrow::{self, array::Array};
|
||||||
|
|
||||||
|
@ -13,8 +12,13 @@ use crate::column::{RowIDs, Scalar, Value, Values};
|
||||||
#[allow(clippy::upper_case_acronyms)] // TODO(edd): these will be OK in 1.52
|
#[allow(clippy::upper_case_acronyms)] // TODO(edd): these will be OK in 1.52
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum FloatEncoding {
|
pub enum FloatEncoding {
|
||||||
|
// A fixed-width "no compression" vector of non-nullable values
|
||||||
Fixed64(Fixed<f64>),
|
Fixed64(Fixed<f64>),
|
||||||
|
|
||||||
|
// A fixed-width "no compression" vector of nullable values (as Arrow array)
|
||||||
FixedNull64(FixedNull<arrow::datatypes::Float64Type>),
|
FixedNull64(FixedNull<arrow::datatypes::Float64Type>),
|
||||||
|
|
||||||
|
// A RLE compressed encoding of nullable values.
|
||||||
RLE64(RLE<f64>),
|
RLE64(RLE<f64>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,31 +257,45 @@ impl std::fmt::Display for FloatEncoding {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_run_lengths_above(arr: &[f64], min_rl: usize) -> usize {
|
fn rle_rows(arr: &[f64]) -> usize {
|
||||||
if min_rl < 1 || arr.len() < min_rl {
|
let mut v = arr[0];
|
||||||
return 0;
|
let mut total_rows = 0;
|
||||||
}
|
|
||||||
|
|
||||||
let (mut rl, mut v) = (1, arr[0]);
|
|
||||||
let mut total_matching_rl = 0;
|
|
||||||
for next in arr.iter().skip(1) {
|
for next in arr.iter().skip(1) {
|
||||||
if let Some(Ordering::Equal) = v.partial_cmp(next) {
|
if let Some(Ordering::Equal) = v.partial_cmp(next) {
|
||||||
rl += 1;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// run length was big enough to be considered
|
total_rows += 1;
|
||||||
if rl > min_rl {
|
|
||||||
total_matching_rl += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
rl = 1;
|
|
||||||
v = *next;
|
v = *next;
|
||||||
}
|
}
|
||||||
|
|
||||||
total_matching_rl
|
total_rows + 1 // account for original run
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn rle_rows_opt(mut itr: impl Iterator<Item = Option<f64>>) -> usize {
|
||||||
|
let mut v = match itr.next() {
|
||||||
|
Some(v) => v,
|
||||||
|
None => return 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut total_rows = 0;
|
||||||
|
for next in itr {
|
||||||
|
if let Some(Ordering::Equal) = v.partial_cmp(&next) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
total_rows += 1;
|
||||||
|
v = next;
|
||||||
|
}
|
||||||
|
|
||||||
|
total_rows + 1 // account for original run
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A lever to decide the minimum size in bytes that RLE the column needs to
|
||||||
|
/// reduce the overall footprint by. 0.1 means that the size of the column must
|
||||||
|
/// be reduced by 10%
|
||||||
|
pub const MIN_RLE_SIZE_REDUCTION: f64 = 0.3; // 30%
|
||||||
|
|
||||||
/// Converts a slice of `f64` values into a `FloatEncoding`.
|
/// Converts a slice of `f64` values into a `FloatEncoding`.
|
||||||
///
|
///
|
||||||
/// TODO(edd): figure out what sensible heuristics look like.
|
/// TODO(edd): figure out what sensible heuristics look like.
|
||||||
|
@ -290,12 +308,11 @@ fn check_run_lengths_above(arr: &[f64], min_rl: usize) -> usize {
|
||||||
/// The encoding is chosen based on the heuristics in the `From` implementation
|
/// The encoding is chosen based on the heuristics in the `From` implementation
|
||||||
impl From<&[f64]> for FloatEncoding {
|
impl From<&[f64]> for FloatEncoding {
|
||||||
fn from(arr: &[f64]) -> Self {
|
fn from(arr: &[f64]) -> Self {
|
||||||
// The total number of run-lengths to find in order to decide to RLE
|
// The number of rows we would reduce the column by if we encoded it
|
||||||
// this column is in the range `[10, 1/10th column size]`
|
// as RLE.
|
||||||
// For example, if the columns is 1000 rows then we need to find 100
|
let base_size = arr.len() * size_of::<f64>();
|
||||||
// run lengths to RLE encode it.
|
let rle_size = rle_rows(arr) * size_of::<(u32, Option<f64>)>(); // size of a run length
|
||||||
let total_rl_required = 10.max(arr.len() / 10);
|
if (base_size as f64 - rle_size as f64) / base_size as f64 >= MIN_RLE_SIZE_REDUCTION {
|
||||||
if check_run_lengths_above(arr, 3) >= total_rl_required {
|
|
||||||
return Self::RLE64(RLE::from(arr));
|
return Self::RLE64(RLE::from(arr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -320,9 +337,11 @@ impl From<arrow::array::Float64Array> for FloatEncoding {
|
||||||
return Self::from(arr.values());
|
return Self::from(arr.values());
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(edd) Right now let's just RLE encode the column if it is 50% NULL.
|
// The number of rows we would reduce the column by if we encoded it
|
||||||
// and has at least 100 values in it.
|
// as RLE.
|
||||||
if arr.len() >= 100 && arr.null_count() >= arr.len() / 2 {
|
let base_size = arr.len() * size_of::<f64>();
|
||||||
|
let rle_size = rle_rows_opt(arr.iter()) * size_of::<(u32, Option<f64>)>(); // size of a run length
|
||||||
|
if (base_size as f64 - rle_size as f64) / base_size as f64 >= MIN_RLE_SIZE_REDUCTION {
|
||||||
return Self::RLE64(RLE::from(arr));
|
return Self::RLE64(RLE::from(arr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -332,7 +351,10 @@ impl From<arrow::array::Float64Array> for FloatEncoding {
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use std::iter;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use arrow::array::Float64Array;
|
||||||
use cmp::Operator;
|
use cmp::Operator;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -357,6 +379,60 @@ mod test {
|
||||||
assert_eq!(enc.size_raw(false), 56);
|
assert_eq!(enc.size_raw(false), 56);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn rle_rows() {
|
||||||
|
let cases = vec![
|
||||||
|
(vec![0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 9),
|
||||||
|
(vec![0.0, 0.0], 1),
|
||||||
|
(vec![1.0, 2.0, 1.0], 3),
|
||||||
|
(vec![1.0, 2.0, 1.0, 1.0], 3),
|
||||||
|
(vec![1.0], 1),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, exp) in cases {
|
||||||
|
assert_eq!(super::rle_rows(input.as_slice()), exp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn rle_rows_opt() {
|
||||||
|
let cases = vec![
|
||||||
|
(vec![Some(0.0), Some(2.0), Some(1.0)], 3),
|
||||||
|
(vec![Some(0.0), Some(0.0)], 1),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, exp) in cases {
|
||||||
|
assert_eq!(super::rle_rows_opt(input.into_iter()), exp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn from_arrow_array() {
|
||||||
|
// Rows not reduced
|
||||||
|
let input: Vec<Option<f64>> = vec![Some(33.2), Some(1.2), Some(2.2), None, Some(3.2)];
|
||||||
|
let arr = Float64Array::from(input);
|
||||||
|
let enc = FloatEncoding::from(arr);
|
||||||
|
assert!(matches!(enc, FloatEncoding::FixedNull64(_)));
|
||||||
|
|
||||||
|
// Rows not reduced and no nulls so can go in `Fixed64`.
|
||||||
|
let input: Vec<Option<f64>> = vec![Some(33.2), Some(1.2), Some(2.2), Some(3.2)];
|
||||||
|
let arr = Float64Array::from(input);
|
||||||
|
let enc = FloatEncoding::from(arr);
|
||||||
|
assert!(matches!(enc, FloatEncoding::Fixed64(_)));
|
||||||
|
|
||||||
|
// Goldilocks - encode as RLE
|
||||||
|
let input: Vec<Option<f64>> = vec![Some(33.2); 10];
|
||||||
|
let arr = Float64Array::from(input);
|
||||||
|
let enc = FloatEncoding::from(arr);
|
||||||
|
assert!(matches!(enc, FloatEncoding::RLE64(_)));
|
||||||
|
|
||||||
|
// Goldilocks - encode as RLE
|
||||||
|
let mut input: Vec<Option<f64>> = vec![Some(33.2); 10];
|
||||||
|
input.extend(iter::repeat(None).take(10));
|
||||||
|
let arr = Float64Array::from(input);
|
||||||
|
let enc = FloatEncoding::from(arr);
|
||||||
|
assert!(matches!(enc, FloatEncoding::RLE64(_)));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
// Test NaN behaviour when `FloatEncoder`s are used.
|
// Test NaN behaviour when `FloatEncoder`s are used.
|
||||||
//
|
//
|
||||||
|
|
Loading…
Reference in New Issue