feat: implement raw column size on integer columns

2021-05-07 12:33:52 +01:00 · 2021-05-07 12:33:52 +01:00 · 301df03e72
parent 850db3f6c2
commit 301df03e72
2 changed files with 89 additions and 1 deletions
--- a/read_buffer/src/column/encoding/scalar/fixed_null.rs
+++ b/read_buffer/src/column/encoding/scalar/fixed_null.rs
@ -14,6 +14,7 @@
 //! consumer of these encodings.
 use std::cmp::Ordering;
 use std::fmt::Debug;
+use std::mem::size_of;

 use arrow::{
    array::{Array, PrimitiveArray},
@ -74,7 +75,21 @@ where
    /// Returns an estimation of the total size in bytes used by this column
    /// encoding.
    pub fn size(&self) -> usize {
-        std::mem::size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()
+        size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size() // struct size + array memory
+    }
+
+    /// Estimates the size in bytes that the column's values would occupy if
+    /// they were laid out contiguously and uncompressed, sizing every value as
+    /// 8b (the logical width of all supported `T`). NULL values are counted as
+    /// 8b each when `include_nulls` is `true` and are otherwise left out.
+    pub fn size_raw(&self, include_nulls: bool) -> usize {
+        // The container is modelled as a `Vec<i64>` whatever `T` is; a `Vec`
+        // has the same size for every element type.
+        let mut sized_rows = self.num_rows() as usize;
+        if self.contains_null() && !include_nulls {
+            sized_rows -= self.arr.null_count();
+        }
+        size_of::<Vec<i64>>() + (sized_rows * 8)
    }

    //
@ -607,6 +622,23 @@ mod test {
        assert_eq!(v.size(), 344);
    }

+    #[test]
+    fn size_raw() {
+        let v = FixedNull::<UInt64Type>::from(vec![None, None, Some(100), Some(2222)].as_slice());
+        // values   = 4 * 8 = 32b (2 * 8 = 16b when NULL values are excluded)
+        // Vec<u64> = 24b
+        assert_eq!(v.size_raw(true), 56);
+        assert_eq!(v.size_raw(false), 40);

+        let v = FixedNull::<Int64Type>::from(vec![None, None].as_slice());
+        assert_eq!(v.size_raw(true), 40); // 24b + (2 NULL values * 8b)
+        assert_eq!(v.size_raw(false), 24); // only the 24b `Vec` remains

+        let v = FixedNull::<Float64Type>::from(vec![None, None, Some(22.3)].as_slice());
+        assert_eq!(v.size_raw(true), 48); // 24b + (3 * 8b)
+        assert_eq!(v.size_raw(false), 32); // 24b + (1 * 8b)
+    }
+
    #[test]
    fn first_row_id_eq_value() {
        let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());
--- a/read_buffer/src/column/integer.rs
+++ b/read_buffer/src/column/integer.rs
@ -1,3 +1,5 @@
+use std::mem::size_of;
+
 use arrow::{self, array::Array};

 use super::encoding::{scalar::Fixed, scalar::FixedNull};
@ -43,6 +45,33 @@ impl IntegerEncoding {
        }
    }

+    /// Estimates the size in bytes that the column's integer values would
+    /// occupy if they were stored contiguously and uncompressed at their
+    /// native width (i64/u64). `include_nulls` is only consulted by the
+    /// `I64I64N`/`U64U64N` encodings, where `true` sizes each NULL value as 8b.
+    pub fn size_raw(&self, include_nulls: bool) -> usize {
+        match self {
+            Self::I64I64N(enc) => enc.size_raw(include_nulls),
+            Self::U64U64N(enc) => enc.size_raw(include_nulls),
+            Self::I64I64(_)
+            | Self::I64I32(_)
+            | Self::I64U32(_)
+            | Self::I64I16(_)
+            | Self::I64U16(_)
+            | Self::I64I8(_)
+            | Self::I64U8(_)
+            | Self::U64U64(_)
+            | Self::U64U32(_)
+            | Self::U64U16(_)
+            | Self::U64U8(_) => {
+                // i64 stands in for u64 in the unsigned arms as well; both
+                // are 8b wide so the estimate is the same.
+                let rows = self.num_rows() as usize;
+                size_of::<Vec<i64>>() + (rows * size_of::<i64>())
+            }
+        }
+    }
+
    /// The total number of rows in the column.
    pub fn num_rows(&self) -> u32 {
        match self {
@ -585,6 +614,8 @@ impl From<arrow::array::UInt64Array> for IntegerEncoding {

 #[cfg(test)]
 mod test {
+    use arrow::datatypes::Int64Type;
+
    use super::*;

    #[test]
@ -614,4 +645,29 @@ mod test {
            //assert_eq!(IntegerEncoding::from(&case), exp);
        }
    }
+
+    #[test]
+    fn size_raw() {
+        // encodings holding no NULL values report the same size in both
+        // modes: 24b + (4 * 8b)
+        let no_nulls = [
+            // values are sized as 8b whatever the physical width (u8 here)
+            IntegerEncoding::I64U8(Fixed::<u8>::from(&[2, 22, 12, 31][..])),
+            IntegerEncoding::U64U64(Fixed::<u64>::from(&[2, 22, 12, 31][..])),
+            // nullable encoding built from non-NULL values only
+            IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(&[2, 22, 12, 31][..])),
+        ];
+        for enc in &no_nulls {
+            assert_eq!(enc.size_raw(true), 56);
+            assert_eq!(enc.size_raw(false), 56);
+        }

+        let with_nulls = IntegerEncoding::I64I64N(FixedNull::<Int64Type>::from(
+            &[Some(2), Some(22), Some(12), None, None, Some(31)][..],
+        ));
+        // NULL values included: 24b + (6 * 8b)
+        assert_eq!(with_nulls.size_raw(true), 72);
+        // NULL values excluded: 24b + (4 * 8b)
+        assert_eq!(with_nulls.size_raw(false), 56);
+    }
 }