feat: implement raw column size on integer columns

pull/24376/head
Edd Robinson 2021-05-07 12:33:52 +01:00
parent 850db3f6c2
commit 301df03e72
2 changed files with 89 additions and 1 deletions

View File

@ -14,6 +14,7 @@
//! consumer of these encodings.
use std::cmp::Ordering;
use std::fmt::Debug;
use std::mem::size_of;
use arrow::{
array::{Array, PrimitiveArray},
@ -74,7 +75,21 @@ where
/// Returns an estimation of the total size in bytes used by this column
/// encoding.
///
/// The estimate adds the size of the `PrimitiveArray<T>` value itself to the
/// figure reported by `get_array_memory_size` on the wrapped array.
pub fn size(&self) -> usize {
// NOTE(review): the next two lines look like the removed/added pair of this
// diff hunk rather than two statements. They compute the same value and
// differ only in spelling `std::mem::size_of` in full versus using the
// imported `size_of` -- confirm only the second survives in the real file.
std::mem::size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()
size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()
}
/// Estimates how many bytes the column's values would occupy if they were
/// laid out contiguously and uncompressed.
///
/// Every counted row is charged 8 bytes, on top of the fixed footprint of a
/// `Vec<i64>` value. When `include_nulls` is `true`, NULL rows are charged
/// like any other row; when it is `false`, they are left out of the count.
pub fn size_raw(&self, include_nulls: bool) -> usize {
    // `Vec<i64>` stands in for the container regardless of `T`, so the fixed
    // part is only an approximation when `T` is not `i64`.
    let container = size_of::<Vec<i64>>();

    // NULL rows are discounted only when the column actually holds some and
    // the caller did not ask for them to be included.
    let discount_nulls = self.contains_null() && !include_nulls;

    let total_rows = self.num_rows() as usize;
    let counted_rows = if discount_nulls {
        total_rows - self.arr.null_count()
    } else {
        total_rows
    };

    container + (counted_rows * 8)
}
//
@ -607,6 +622,23 @@ mod test {
assert_eq!(v.size(), 344);
}
#[test]
fn size_raw() {
    // Each expectation is 24b for the `Vec` value plus 8b per counted row.

    // Four rows, two of them NULL.
    let unsigned = FixedNull::<UInt64Type>::from(&[None, None, Some(100), Some(2222)][..]);
    assert_eq!(unsigned.size_raw(true), 56); // 24 + (4 * 8)
    assert_eq!(unsigned.size_raw(false), 40); // 24 + (2 * 8)

    // Two rows, both NULL.
    let all_null = FixedNull::<Int64Type>::from(&[None, None][..]);
    assert_eq!(all_null.size_raw(true), 32); // 24 + (2 * 8)
    assert_eq!(all_null.size_raw(false), 24); // 24 + (0 * 8)

    // Three rows, two of them NULL.
    let floats = FixedNull::<Float64Type>::from(&[None, None, Some(22.3)][..]);
    assert_eq!(floats.size_raw(true), 48); // 24 + (3 * 8)
    assert_eq!(floats.size_raw(false), 32); // 24 + (1 * 8)
}
#[test]
fn first_row_id_eq_value() {
let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());

View File

@ -1,3 +1,5 @@
use std::mem::size_of;
use arrow::{self, array::Array};
use super::encoding::{scalar::Fixed, scalar::FixedNull};
@ -43,6 +45,33 @@ impl IntegerEncoding {
}
}
/// Estimates how many bytes the column's integer values would occupy if
/// they were laid out contiguously and uncompressed as native `i64`/`u64`
/// values.
///
/// `include_nulls` is forwarded to the nullable encodings, where `true`
/// makes each NULL value count as 8b. The remaining encodings do not
/// consult it.
pub fn size_raw(&self, include_nulls: bool) -> usize {
    match self {
        // Nullable encodings do their own accounting.
        Self::I64I64N(nullable) => nullable.size_raw(include_nulls),
        Self::U64U64N(nullable) => nullable.size_raw(include_nulls),

        Self::I64I64(_)
        | Self::I64I32(_)
        | Self::I64U32(_)
        | Self::I64I16(_)
        | Self::I64U16(_)
        | Self::I64I8(_)
        | Self::I64U8(_)
        | Self::U64U64(_)
        | Self::U64U32(_)
        | Self::U64U16(_)
        | Self::U64U8(_) => {
            // `i64` is used for the signed and unsigned variants alike; the
            // two logical types have the same width, so one expression
            // covers every arm.
            let value_bytes = size_of::<i64>() * self.num_rows() as usize;
            size_of::<Vec<i64>>() + value_bytes
        }
    }
}
/// The total number of rows in the column.
pub fn num_rows(&self) -> u32 {
match self {
@ -585,6 +614,8 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {
#[cfg(test)]
mod test {
use arrow::datatypes::Int64Type;
use super::*;
#[test]
@ -614,4 +645,29 @@ mod test {
//assert_eq!(IntegerEncoding::from(&case), exp);
}
}
#[test]
fn size_raw() {
    // Four rows stored as `u8`: (4 * 8) + 24.
    let narrow = IntegerEncoding::I64U8(Fixed::<u8>::from(&[2, 22, 12, 31][..]));
    assert_eq!(narrow.size_raw(true), 56);
    assert_eq!(narrow.size_raw(false), 56);

    // Four rows stored as `u64`: (4 * 8) + 24.
    let wide = IntegerEncoding::U64U64(Fixed::<u64>::from(&[2, 22, 12, 31][..]));
    assert_eq!(wide.size_raw(true), 56);
    assert_eq!(wide.size_raw(false), 56);

    // Nullable encoding that holds no NULL values: (4 * 8) + 24 either way.
    let no_nulls = IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(&[2, 22, 12, 31][..]));
    assert_eq!(no_nulls.size_raw(true), 56);
    assert_eq!(no_nulls.size_raw(false), 56);

    // Six rows, two of them NULL.
    let with_nulls = IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(
        &[Some(2), Some(22), Some(12), None, None, Some(31)][..],
    ));
    assert_eq!(with_nulls.size_raw(true), 72); // (6 * 8) + 24
    assert_eq!(with_nulls.size_raw(false), 56); // (4 * 8) + 24
}
}