feat: implement raw column size on integer columns
parent
850db3f6c2
commit
301df03e72
|
@ -14,6 +14,7 @@
|
||||||
//! consumer of these encodings.
|
//! consumer of these encodings.
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
use std::mem::size_of;
|
||||||
|
|
||||||
use arrow::{
|
use arrow::{
|
||||||
array::{Array, PrimitiveArray},
|
array::{Array, PrimitiveArray},
|
||||||
|
@ -74,7 +75,21 @@ where
|
||||||
/// Returns an estimate of the total size in bytes used by this column
|
/// Returns an estimate of the total size in bytes used by this column
|
||||||
/// encoding: the size of the `PrimitiveArray<T>` struct plus the value reported by `self.arr.get_array_memory_size()`.
|
/// encoding: the size of the `PrimitiveArray<T>` struct plus the value reported by `self.arr.get_array_memory_size()`.
|
||||||
pub fn size(&self) -> usize {
|
pub fn size(&self) -> usize {
|
||||||
std::mem::size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()
|
size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The estimated total size in bytes of the underlying values in the
|
||||||
|
/// column if they were stored contiguously and uncompressed: the size of a
|
||||||
|
/// `Vec<i64>` value plus 8 bytes per counted row. If `include_nulls` is `true`
|
||||||
|
/// each NULL value is also sized as 8 bytes (the logical width assumed for every `T`); if `false`, NULL rows are not counted.
|
||||||
|
pub fn size_raw(&self, include_nulls: bool) -> usize {
|
||||||
|
// NOTE: the container overhead is always taken from `Vec<i64>`, whatever
|
||||||
|
// `T` is. The figure is probably accurate, but not strictly correct when T is not i64.
|
||||||
|
let base_size = size_of::<Vec<i64>>();
|
||||||
|
if !self.contains_null() || include_nulls {
|
||||||
|
return base_size + (self.num_rows() as usize * 8);
|
||||||
|
}
|
||||||
|
base_size + ((self.num_rows() as usize - self.arr.null_count()) * 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -607,6 +622,23 @@ mod test {
|
||||||
assert_eq!(v.size(), 344);
|
assert_eq!(v.size(), 344);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn size_raw() {
|
||||||
|
let v = FixedNull::<UInt64Type>::from(vec![None, None, Some(100), Some(2222)].as_slice());
|
||||||
|
// values: 4 rows * 8 bytes = 32 bytes (2 non-NULL rows * 8 bytes = 16 bytes when NULLs are excluded)
|
||||||
|
// Vec<u64> value itself: 24 bytes (assumes a 64-bit target)
|
||||||
|
assert_eq!(v.size_raw(true), 56);
|
||||||
|
assert_eq!(v.size_raw(false), 40);
|
||||||
|
|
||||||
|
let v = FixedNull::<Int64Type>::from(vec![None, None].as_slice());
|
||||||
|
assert_eq!(v.size_raw(true), 40); // 24 + (2 * 8): both NULL rows are sized when NULLs are included
|
||||||
|
assert_eq!(v.size_raw(false), 24); // only the Vec value: no non-NULL rows
|
||||||
|
|
||||||
|
let v = FixedNull::<Float64Type>::from(vec![None, None, Some(22.3)].as_slice());
|
||||||
|
assert_eq!(v.size_raw(true), 48);
|
||||||
|
assert_eq!(v.size_raw(false), 32);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn first_row_id_eq_value() {
|
fn first_row_id_eq_value() {
|
||||||
let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());
|
let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
use std::mem::size_of;
|
||||||
|
|
||||||
use arrow::{self, array::Array};
|
use arrow::{self, array::Array};
|
||||||
|
|
||||||
use super::encoding::{scalar::Fixed, scalar::FixedNull};
|
use super::encoding::{scalar::Fixed, scalar::FixedNull};
|
||||||
|
@ -43,6 +45,33 @@ impl IntegerEncoding {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The estimated total size in bytes of the underlying integer values in
|
||||||
|
/// the column if they were stored contiguously and uncompressed (natively
|
||||||
|
/// as i64/u64). For the `I64I64N`/`U64U64N` variants `include_nulls` is forwarded to the inner encoding, where each NULL value is sized as 8 bytes if
|
||||||
|
/// `true`. The other variants do not consult `include_nulls`; every row is counted.
|
||||||
|
pub fn size_raw(&self, include_nulls: bool) -> usize {
|
||||||
|
match &self {
|
||||||
|
Self::I64I64(_)
|
||||||
|
| Self::I64I32(_)
|
||||||
|
| Self::I64U32(_)
|
||||||
|
| Self::I64I16(_)
|
||||||
|
| Self::I64U16(_)
|
||||||
|
| Self::I64I8(_)
|
||||||
|
| Self::I64U8(_)
|
||||||
|
| Self::U64U64(_)
|
||||||
|
| Self::U64U32(_)
|
||||||
|
| Self::U64U16(_)
|
||||||
|
| Self::U64U8(_) => {
|
||||||
|
// Strictly, each arm should size with its own logical type (i64 or u64), but
|
||||||
|
// both are 8 bytes wide, so using i64 for every arm is terser and still correct.
|
||||||
|
size_of::<Vec<i64>>() + (size_of::<i64>() * self.num_rows() as usize)
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::I64I64N(enc) => enc.size_raw(include_nulls),
|
||||||
|
Self::U64U64N(enc) => enc.size_raw(include_nulls),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// The total number of rows in the column.
|
/// The total number of rows in the column.
|
||||||
pub fn num_rows(&self) -> u32 {
|
pub fn num_rows(&self) -> u32 {
|
||||||
match self {
|
match self {
|
||||||
|
@ -585,6 +614,8 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use arrow::datatypes::Int64Type;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -614,4 +645,29 @@ mod test {
|
||||||
//assert_eq!(IntegerEncoding::from(&case), exp);
|
//assert_eq!(IntegerEncoding::from(&case), exp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn size_raw() {
|
||||||
|
let enc = IntegerEncoding::I64U8(Fixed::<u8>::from(&[2, 22, 12, 31][..]));
|
||||||
|
// (4 rows * 8 bytes) + 24 bytes for the Vec value; sized as i64 even though stored as u8
|
||||||
|
assert_eq!(enc.size_raw(true), 56);
|
||||||
|
assert_eq!(enc.size_raw(false), 56);
|
||||||
|
|
||||||
|
let enc = IntegerEncoding::U64U64(Fixed::<u64>::from(&[2, 22, 12, 31][..]));
|
||||||
|
// (4 rows * 8 bytes) + 24 bytes for the Vec value
|
||||||
|
assert_eq!(enc.size_raw(true), 56);
|
||||||
|
assert_eq!(enc.size_raw(false), 56);
|
||||||
|
|
||||||
|
let enc = IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(&[2, 22, 12, 31][..]));
|
||||||
|
// (4 rows * 8 bytes) + 24; no NULL values, so `include_nulls` makes no difference
|
||||||
|
assert_eq!(enc.size_raw(true), 56);
|
||||||
|
assert_eq!(enc.size_raw(false), 56);
|
||||||
|
|
||||||
|
let enc = IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(
|
||||||
|
&[Some(2), Some(22), Some(12), None, None, Some(31)][..],
|
||||||
|
));
|
||||||
|
// with NULLs: (6 * 8) + 24 = 72; without the 2 NULL rows: (4 * 8) + 24 = 56
|
||||||
|
assert_eq!(enc.size_raw(true), 72);
|
||||||
|
assert_eq!(enc.size_raw(false), 56);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue