refactor: move rle heuristics to rle module
parent
f58a045d99
commit
71598d9b3e
|
@ -573,6 +573,40 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
// This function returns an estimated size in bytes for an input slice of `T`
|
||||
// were it to be run-length encoded.
|
||||
pub fn estimated_size_from<T: PartialOrd>(arr: &[T]) -> usize {
|
||||
let run_lengths = arr.len()
|
||||
- arr
|
||||
.iter()
|
||||
.zip(arr.iter().skip(1))
|
||||
.filter(|(curr, next)| matches!(curr.partial_cmp(next), Some(Ordering::Equal)))
|
||||
.count();
|
||||
run_lengths * size_of::<(u32, Option<T>)>() + size_of::<Vec<(u32, Option<T>)>>()
|
||||
}
|
||||
|
||||
// This function returns an estimated size in bytes for an input iterator
|
||||
// yielding `Option<T>`, were it to be run-length encoded.
|
||||
pub fn estimated_size_from_iter<T: PartialOrd>(mut itr: impl Iterator<Item = Option<T>>) -> usize {
|
||||
let mut v = match itr.next() {
|
||||
Some(v) => v,
|
||||
None => return 0,
|
||||
};
|
||||
|
||||
let mut total_rows = 0;
|
||||
for next in itr {
|
||||
if let Some(Ordering::Equal) = v.partial_cmp(&next) {
|
||||
continue;
|
||||
}
|
||||
|
||||
total_rows += 1;
|
||||
v = next;
|
||||
}
|
||||
|
||||
// +1 to account for original run
|
||||
(total_rows + 1) * size_of::<(u32, Option<T>)>() + size_of::<Vec<(u32, Option<T>)>>()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use cmp::Operator;
|
||||
|
@ -974,4 +1008,32 @@ mod test {
|
|||
}
|
||||
assert_eq!(transcoder.encodings(), calls * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn estimated_size_from() {
|
||||
let cases = vec![
|
||||
(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 0.0, 0.0, 0.0], 192),
|
||||
(vec![0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 240),
|
||||
(vec![0.0, 0.0], 48),
|
||||
(vec![1.0, 2.0, 1.0], 96),
|
||||
(vec![1.0, 2.0, 1.0, 1.0], 96),
|
||||
(vec![1.0], 48),
|
||||
];
|
||||
|
||||
for (input, exp) in cases {
|
||||
assert_eq!(super::estimated_size_from(input.as_slice()), exp);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn estimated_size_from_iter() {
|
||||
let cases = vec![
|
||||
(vec![Some(0.0), Some(2.0), Some(1.0)], 96),
|
||||
(vec![Some(0.0), Some(0.0)], 48),
|
||||
];
|
||||
|
||||
for (input, exp) in cases {
|
||||
assert_eq!(super::estimated_size_from_iter(input.into_iter()), exp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use arrow::array::Array;
|
||||
use arrow::datatypes::Float64Type;
|
||||
use std::{cmp::Ordering, mem::size_of};
|
||||
use std::mem::size_of;
|
||||
|
||||
use super::encoding::scalar::rle;
|
||||
use super::encoding::scalar::transcoders::NoOpTranscoder;
|
||||
use super::encoding::scalar::ScalarEncoding;
|
||||
use super::encoding::{
|
||||
|
@ -202,36 +203,6 @@ impl std::fmt::Display for FloatEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
// helper to determine how many rows the slice would have if it were RLE
|
||||
// encoded.
|
||||
fn rle_rows(arr: &[f64]) -> usize {
|
||||
arr.len()
|
||||
- arr
|
||||
.iter()
|
||||
.zip(arr.iter().skip(1))
|
||||
.filter(|(curr, next)| matches!(curr.partial_cmp(next), Some(Ordering::Equal)))
|
||||
.count()
|
||||
}
|
||||
|
||||
fn rle_rows_opt(mut itr: impl Iterator<Item = Option<f64>>) -> usize {
|
||||
let mut v = match itr.next() {
|
||||
Some(v) => v,
|
||||
None => return 0,
|
||||
};
|
||||
|
||||
let mut total_rows = 0;
|
||||
for next in itr {
|
||||
if let Some(Ordering::Equal) = v.partial_cmp(&next) {
|
||||
continue;
|
||||
}
|
||||
|
||||
total_rows += 1;
|
||||
v = next;
|
||||
}
|
||||
|
||||
total_rows + 1 // account for original run
|
||||
}
|
||||
|
||||
/// A lever to decide the minimum size in bytes that RLE the column needs to
|
||||
/// reduce the overall footprint by. 0.1 means that the size of the column must
|
||||
/// be reduced by 10%
|
||||
|
@ -250,7 +221,7 @@ impl From<&[f64]> for FloatEncoding {
|
|||
// The number of rows we would reduce the column by if we encoded it
|
||||
// as RLE.
|
||||
let base_size = arr.len() * size_of::<f64>();
|
||||
let rle_size = rle_rows(arr) * size_of::<(u32, Option<f64>)>(); // size of a run length
|
||||
let rle_size = rle::estimated_size_from(arr); // size of a run length
|
||||
if (base_size as f64 - rle_size as f64) / base_size as f64 >= MIN_RLE_SIZE_REDUCTION {
|
||||
let enc = Box::new(RLE::new_from_iter(
|
||||
arr.iter().cloned(),
|
||||
|
@ -282,7 +253,7 @@ impl From<arrow::array::Float64Array> for FloatEncoding {
|
|||
// The number of rows we would reduce the column by if we encoded it
|
||||
// as RLE.
|
||||
let base_size = arr.len() * size_of::<f64>();
|
||||
let rle_size = rle_rows_opt(arr.iter()) * size_of::<(u32, Option<f64>)>(); // size of a run length
|
||||
let rle_size = rle::estimated_size_from_iter(arr.iter()); // size of a run length
|
||||
if (base_size as f64 - rle_size as f64) / base_size as f64 >= MIN_RLE_SIZE_REDUCTION {
|
||||
let enc = Box::new(RLE::new_from_iter_opt(
|
||||
arr.iter(),
|
||||
|
@ -310,34 +281,6 @@ mod test {
|
|||
use crate::column::encoding::scalar::{fixed, fixed_null, rle};
|
||||
use cmp::Operator;
|
||||
|
||||
#[test]
|
||||
fn rle_rows() {
|
||||
let cases = vec![
|
||||
(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 0.0, 0.0, 0.0], 7),
|
||||
(vec![0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 9),
|
||||
(vec![0.0, 0.0], 1),
|
||||
(vec![1.0, 2.0, 1.0], 3),
|
||||
(vec![1.0, 2.0, 1.0, 1.0], 3),
|
||||
(vec![1.0], 1),
|
||||
];
|
||||
|
||||
for (input, exp) in cases {
|
||||
assert_eq!(super::rle_rows(input.as_slice()), exp);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rle_rows_opt() {
|
||||
let cases = vec![
|
||||
(vec![Some(0.0), Some(2.0), Some(1.0)], 3),
|
||||
(vec![Some(0.0), Some(0.0)], 1),
|
||||
];
|
||||
|
||||
for (input, exp) in cases {
|
||||
assert_eq!(super::rle_rows_opt(input.into_iter()), exp);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_arrow_array() {
|
||||
// Rows not reduced
|
||||
|
|
Loading…
Reference in New Issue