diff --git a/Cargo.lock b/Cargo.lock index 6c92c9ccb3..63e340c752 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -819,6 +819,7 @@ dependencies = [ "datafusion 2.0.0-SNAPSHOT (git+https://github.com/alamb/arrow.git?rev=46f18c2602072e083809e0846b810e0cc3c59fdd)", "delorean_table", "env_logger", + "heapsize", "human_format", "log", "parquet 2.0.0-SNAPSHOT (git+https://github.com/alamb/arrow.git?rev=46f18c2602072e083809e0846b810e0cc3c59fdd)", @@ -1356,6 +1357,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "heapsize" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1679e6ea370dee694f91f1dc469bf94cf8f52051d147aec3e1f9497c6fc22461" +dependencies = [ + "winapi 0.3.8", +] + [[package]] name = "heck" version = "0.3.1" diff --git a/delorean_mem_qe/Cargo.toml b/delorean_mem_qe/Cargo.toml index c2086ac66f..742df5ba19 100644 --- a/delorean_mem_qe/Cargo.toml +++ b/delorean_mem_qe/Cargo.toml @@ -12,9 +12,7 @@ delorean_table = { path = "../delorean_table" } arrow = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" } parquet = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" } datafusion = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" } -#arrow = { path = "/Users/alamb/Software/arrow/rust/arrow" } -#parquet = { path = "/Users/alamb/Software/arrow/rust/parquet" } -#datafusion = { path = "/Users/alamb/Software/arrow/rust/datafusion" } +heapsize = "0.4.2" snafu = "0.6.8" croaring = "0.4.5" crossbeam = "0.7.3" diff --git a/delorean_mem_qe/src/bin/main.rs b/delorean_mem_qe/src/bin/main.rs index b172338da8..90acdd167f 100644 --- a/delorean_mem_qe/src/bin/main.rs +++ b/delorean_mem_qe/src/bin/main.rs @@ -134,6 +134,7 @@ fn build_store( let mut segment = Segment::new(rb.num_rows(), schema); convert_record_batch(rb, &mut segment)?; + println!("{}", &segment); store.add_segment(segment); } Ok(None) => { diff --git a/delorean_mem_qe/src/column.rs b/delorean_mem_qe/src/column.rs index fcef27c1b8..e0b5df55dc 100644 --- a/delorean_mem_qe/src/column.rs +++ b/delorean_mem_qe/src/column.rs @@ -873,6 +873,23 @@ impl Column { } } +impl std::fmt::Display for Column { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Column::String(c) => { + write!(f, "{}", c)?; + } + Column::Float(c) => { + write!(f, "{}", c)?; + } + Column::Integer(c) => { + write!(f, "{}", c)?; + } + } + Ok(()) + } +} + impl AggregatableByRange for &Column { fn aggregate_by_id_range( &self, @@ -964,6 +981,12 @@ impl String { } } +impl std::fmt::Display for String { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Meta: {}, Data: {}", self.meta, self.data) + } +} + #[derive(Debug)] pub struct Float { meta: metadata::F64, @@ -1015,6 +1038,12 @@ impl Float { } } +impl std::fmt::Display for Float { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Meta: {}, Data: {}", self.meta, self.data) + } +} + impl From<&[f64]> for Float { fn from(values: &[f64]) -> Self { let len = values.len(); @@ -1034,6 +1063,32 @@ impl From<&[f64]> for Float { } } +// use arrow::array::Array; +// impl From> for Float { +// fn from(arr: arrow::array::PrimitiveArray) -> Self { +// let len = arr.len(); +// let mut min = std::f64::MAX; +// let mut max = std::f64::MIN; + +// // calculate min/max for meta data +// // TODO(edd): can use compute kernels for this. +// for i in 0..arr.len() { +// if arr.is_null(i) { +// continue; +// } + +// let v = arr.value(i); +// min = min.min(v); +// max = max.max(v); +// } + +// Self { +// meta: metadata::F64::new((min, max), len), +// data: Box::new(encoding::PlainArrow { arr }), +// } +// } +// } + #[derive(Debug)] pub struct Integer { meta: metadata::I64, @@ -1080,6 +1135,12 @@ impl Integer { } } +impl std::fmt::Display for Integer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Meta: {}, Data: {}", self.meta, self.data) + } +} + impl From<&[i64]> for Integer { fn from(values: &[i64]) -> Self { let len = values.len(); @@ -1100,11 +1161,12 @@ impl From<&[i64]> for Integer { } pub mod metadata { + use std::mem::size_of; + #[derive(Debug, Default)] pub struct Str { range: (Option, Option), num_rows: usize, - // sparse_index: BTreeMap, } impl Str { @@ -1145,8 +1207,20 @@ pub mod metadata { } pub fn size(&self) -> usize { - // TODO!!!! - 0 //self.range.0.len() + self.range.1.len() + std::mem::size_of::() + // size of types for num_rows and range + let base_size = size_of::() + (2 * size_of::>()); + match &self.range { + (None, None) => base_size, + (Some(min), None) => base_size + min.len(), + (None, Some(max)) => base_size + max.len(), + (Some(min), Some(max)) => base_size + min.len() + max.len(), + } + } + } + + impl std::fmt::Display for Str { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Range: ({:?})", self.range) } } @@ -1184,7 +1258,13 @@ pub mod metadata { } pub fn size(&self) -> usize { - std::mem::size_of::<(f64, f64)>() + std::mem::size_of::() + size_of::() + (size_of::<(f64, f64)>()) + } + } + + impl std::fmt::Display for F64 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Range: ({:?})", self.range) } } @@ -1219,7 +1299,13 @@ pub mod metadata { } pub fn size(&self) -> usize { - std::mem::size_of::<(i64, i64)>() + std::mem::size_of::() + size_of::() + (size_of::<(i64, i64)>()) + } + } + + impl std::fmt::Display for I64 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Range: ({:?})", self.range) } } } diff --git a/delorean_mem_qe/src/encoding.rs b/delorean_mem_qe/src/encoding.rs index e19d533306..54b90b0a72 100644 --- a/delorean_mem_qe/src/encoding.rs +++ b/delorean_mem_qe/src/encoding.rs @@ -1,11 +1,11 @@ use std::collections::{BTreeMap, BTreeSet}; use std::iter; +use std::mem::size_of; use arrow::array::{Array, PrimitiveArray}; use arrow::datatypes::ArrowNumericType; -use arrow::datatypes::*; -pub trait Encoding: Send + Sync { +pub trait NumericEncoding: Send + Sync + std::fmt::Display + std::fmt::Debug { type Item; fn size(&self) -> usize; @@ -15,39 +15,15 @@ pub trait Encoding: Send + Sync { fn all_encoded_values(&self) -> Vec; fn scan_from(&self, row_id: usize) -> &[Self::Item]; - fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize; - - // TODO(edd): clean up the API for getting row ids that match predicates. - // - // Ideally you should be able to provide a collection of predicates to - // match on. - // - // A simpler approach would be to provide a method that matches on a single - // predicate and then call that multiple times, unioning or intersecting the - // resulting row sets. - fn row_id_eq_value(&self, v: Self::Item) -> Option; - fn row_ids_single_cmp_roaring( - &self, - wanted: &Self::Item, - order: std::cmp::Ordering, - ) -> croaring::Bitmap; - fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap; -} - -pub trait NumericEncoding: Send + Sync { - type Item; - - fn size(&self) -> usize; - fn value(&self, row_id: usize) -> Self::Item; - fn values(&self, row_ids: &[usize]) -> Vec; - fn encoded_values(&self, row_ids: &[usize]) -> Vec; - fn all_encoded_values(&self) -> Vec; - fn scan_from(&self, row_id: usize) -> &[Self::Item]; fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item; fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item; + fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize; + fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64; fn row_id_eq_value(&self, v: Self::Item) -> Option; + fn row_id_ge_value(&self, v: Self::Item) -> Option; + fn row_ids_single_cmp_roaring( &self, wanted: &Self::Item, @@ -56,39 +32,37 @@ pub trait NumericEncoding: Send + Sync { fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap; } -impl std::fmt::Debug for dyn NumericEncoding { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", "todo") - } -} - -impl std::fmt::Debug for dyn NumericEncoding { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", "todo") - } -} - +#[derive(Debug)] pub struct PlainArrow where - // T: ArrowNumericType + std::ops::Add, T: ArrowNumericType, - // T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign, + T::Native: Default + + PartialEq + + PartialOrd + + Copy + + std::fmt::Debug + + std::ops::Add, { arr: PrimitiveArray, - // _phantom: T, } -impl PlainArrow +impl NumericEncoding for PlainArrow where - // T: ArrowNumericType + std::ops::Add, - T: ArrowNumericType, - // T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign, + T: ArrowNumericType + std::fmt::Debug, + T::Native: Default + + PartialEq + + PartialOrd + + Copy + + std::fmt::Debug + + std::ops::Add, { - pub fn size(&self) -> usize { + type Item = Option; + + fn size(&self) -> usize { self.arr.len() } - pub fn value(&self, row_id: usize) -> Option { + fn value(&self, row_id: usize) -> Option { if self.arr.is_null(row_id) { return None; } @@ -109,12 +83,12 @@ where } /// Well this is terribly slow - pub fn encoded_values(&self, row_ids: &[usize]) -> Vec> { + fn encoded_values(&self, row_ids: &[usize]) -> Vec> { self.values(row_ids) } /// TODO(edd): there must be a more efficient way. - pub fn all_encoded_values(&self) -> Vec> { + fn all_encoded_values(&self) -> Vec> { let mut out = Vec::with_capacity(self.arr.len()); for i in 0..self.arr.len() { if self.arr.is_null(i) { @@ -127,11 +101,10 @@ where out } - pub fn scan_from(&self, row_id: usize) -> &[Option] { - // todo - - &[] - + // TODO(edd): problem here is returning a slice because we need to own the + // backing vector. + fn scan_from(&self, row_id: usize) -> &[Option] { + unimplemented!("need to figure out returning a slice"); // let mut out = Vec::with_capacity(self.arr.len() - row_id); // for i in row_id..self.arr.len() { // if self.arr.is_null(i) { @@ -144,18 +117,22 @@ where // out.as_slice() } - pub fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Option { - // let mut res = T::Native::default(); - - // // HMMMMM - materialising which has a memory cost. - // let vec = row_ids.to_vec(); - // for v in vec { - // res += self.arr.value(v as usize); - // } - None // todo + fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Option { + // TODO(edd): this is expensive - may pay to expose method to do this + // where you accept an array. + let mut res = T::Native::default(); + let vec = row_ids.to_vec(); + for row_id in vec { + let i = row_id as usize; + if self.arr.is_null(i) { + return None; + } + res = res + self.arr.value(i); + } + Some(res) } - pub fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Option { + fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Option { // if the column contains a null value between the range then the result // will be None. for i in from_row_id..to_row_id { @@ -165,57 +142,35 @@ where } // Otherwise sum all the values between in the range. - // let mut res = f64::from(self.arr.value(from_row_id)); - // for i in from_row_id + 1..to_row_id { - // res = res + self.arr.value(i); - // } - // Some(res) - None - } - - pub fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { - // TODO - count values that are not null in the row range. - 0 // todo - } -} - -impl NumericEncoding for PlainArrow { - type Item = Option; - - fn size(&self) -> usize { - self.size() - } - - fn value(&self, row_id: usize) -> Self::Item { - self.value(row_id) - } - - fn values(&self, row_ids: &[usize]) -> Vec { - self.values(row_ids) - } - - fn encoded_values(&self, row_ids: &[usize]) -> Vec { - self.encoded_values(row_ids) - } - - fn all_encoded_values(&self) -> Vec { - self.all_encoded_values() - } - - fn scan_from(&self, row_id: usize) -> &[Self::Item] { - self.scan_from(row_id) - } - - fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item { - self.sum_by_ids(row_ids) - } - - fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item { - self.sum_by_id_range(from_row_id, to_row_id) + let mut res = T::Native::default(); + for i in from_row_id..to_row_id { + res = res + self.arr.value(i); + } + Some(res) } fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { - self.count_by_id_range(from_row_id, to_row_id) + // TODO - count values that are not null in the row range. + let mut count = 0; + for i in from_row_id..to_row_id { + if self.arr.is_null(i) { + continue; + } + count += 1; + } + count + } + + fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 { + todo!() + } + + fn row_id_eq_value(&self, v: Self::Item) -> Option { + todo!() + } + + fn row_id_ge_value(&self, v: Self::Item) -> Option { + todo!() } fn row_ids_single_cmp_roaring( @@ -223,15 +178,26 @@ impl NumericEncoding for PlainArrow { wanted: &Self::Item, order: std::cmp::Ordering, ) -> croaring::Bitmap { - self.row_ids_single_cmp_roaring(wanted, order) + todo!() } fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap { - self.row_ids_gte_lt_roaring(from, to) + todo!() } +} - fn row_id_eq_value(&self, v: Self::Item) -> Option { - self.row_id_eq_value(v) +impl std::fmt::Display for PlainArrow +where + T: ArrowNumericType + std::fmt::Debug, + T::Native: Default + + PartialEq + + PartialOrd + + Copy + + std::fmt::Debug + + std::ops::Add, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[PlainArrow] size: {}", self.size()) } } @@ -242,31 +208,60 @@ pub struct PlainFixed { // total_order can be used as a hint to stop scanning the column early when // applying a comparison predicate to the column. total_order: bool, + + size: usize, } -impl PlainFixed +impl std::fmt::Display for PlainFixed where - T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign, + T: Default + + PartialEq + + PartialOrd + + Copy + + std::fmt::Debug + + std::fmt::Display + + Sync + + Send + + std::ops::AddAssign, { - pub fn size(&self) -> usize { - self.values.len() * std::mem::size_of::() + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[PlainFixed] size: {}", self.size(),) + } +} + +impl NumericEncoding for PlainFixed +where + T: Default + + PartialEq + + PartialOrd + + Copy + + std::fmt::Debug + + std::fmt::Display + + Sync + + Send + + std::ops::AddAssign, +{ + type Item = T; + + fn size(&self) -> usize { + self.size } - pub fn row_id_eq_value(&self, v: T) -> Option { + fn row_id_eq_value(&self, v: T) -> Option { self.values.iter().position(|x| *x == v) } - pub fn row_id_ge_value(&self, v: T) -> Option { + fn row_id_ge_value(&self, v: T) -> Option { self.values.iter().position(|x| *x >= v) } // get value at row_id. Panics if out of bounds. - pub fn value(&self, row_id: usize) -> T { + fn value(&self, row_id: usize) -> T { self.values[row_id] } /// Return the decoded values for the provided logical row ids. - pub fn values(&self, row_ids: &[usize]) -> Vec { + fn values(&self, row_ids: &[usize]) -> Vec { let mut out = Vec::with_capacity(row_ids.len()); for chunks in row_ids.chunks_exact(4) { out.push(self.values[chunks[3]]); @@ -286,24 +281,24 @@ where /// Return the raw encoded values for the provided logical row ids. For Plain /// encoding this is just the decoded values. - pub fn encoded_values(&self, row_ids: &[usize]) -> Vec { + fn encoded_values(&self, row_ids: &[usize]) -> Vec { self.values(row_ids) } /// Return all encoded values. For this encoding this is just the decoded /// values - pub fn all_encoded_values(&self) -> Vec { + fn all_encoded_values(&self) -> Vec { self.values.clone() } - pub fn scan_from(&self, row_id: usize) -> &[T] { + fn scan_from(&self, row_id: usize) -> &[T] { &self.values[row_id..] } /// returns a set of row ids that match a single ordering on a desired value /// /// This supports `value = x` , `value < x` or `value > x`. - pub fn row_ids_single_cmp_roaring( + fn row_ids_single_cmp_roaring( &self, wanted: &T, order: std::cmp::Ordering, @@ -343,7 +338,7 @@ where /// returns a set of row ids that match the half open interval `[from, to)`. /// /// The main use-case for this is time range filtering. - pub fn row_ids_gte_lt_roaring(&self, from: &T, to: &T) -> croaring::Bitmap { + fn row_ids_gte_lt_roaring(&self, from: &T, to: &T) -> croaring::Bitmap { let mut bm = croaring::Bitmap::create(); let mut found = false; //self.values[0]; @@ -376,7 +371,7 @@ where bm } - pub fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> T { + fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> T { let mut res = T::default(); for v in self.values[from_row_id..to_row_id].iter() { res += *v; @@ -384,12 +379,12 @@ where res } - pub fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { + fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { to_row_id - from_row_id } // TODO(edd): make faster - pub fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> T { + fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> T { let mut res = T::default(); // println!( // "cardinality is {:?} out of {:?}", @@ -449,7 +444,7 @@ where res } - pub fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 { + fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 { row_ids.cardinality() } } @@ -460,6 +455,10 @@ impl From<&[i64]> for PlainFixed { values: v.to_vec(), // buf: Vec::with_capacity(v.len()), total_order: false, + size: size_of::>() + + (size_of::() * v.len()) + + size_of::() + + size_of::(), } } } @@ -470,122 +469,14 @@ impl From<&[f64]> for PlainFixed { values: v.to_vec(), // buf: Vec::with_capacity(v.len()), total_order: false, + size: size_of::>() + + (size_of::() * v.len()) + + size_of::() + + size_of::(), } } } -impl NumericEncoding for PlainFixed { - type Item = f64; - - fn size(&self) -> usize { - self.size() - } - - fn value(&self, row_id: usize) -> Self::Item { - self.value(row_id) - } - - fn values(&self, row_ids: &[usize]) -> Vec { - self.values(row_ids) - } - - fn encoded_values(&self, row_ids: &[usize]) -> Vec { - self.encoded_values(row_ids) - } - - fn all_encoded_values(&self) -> Vec { - self.all_encoded_values() - } - - fn scan_from(&self, row_id: usize) -> &[Self::Item] { - self.scan_from(row_id) - } - - fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item { - self.sum_by_ids(row_ids) - } - - fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item { - self.sum_by_id_range(from_row_id, to_row_id) - } - - fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { - self.count_by_id_range(from_row_id, to_row_id) - } - - fn row_ids_single_cmp_roaring( - &self, - wanted: &Self::Item, - order: std::cmp::Ordering, - ) -> croaring::Bitmap { - self.row_ids_single_cmp_roaring(wanted, order) - } - - fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap { - self.row_ids_gte_lt_roaring(from, to) - } - - fn row_id_eq_value(&self, v: Self::Item) -> Option { - self.row_id_eq_value(v) - } -} - -impl NumericEncoding for PlainFixed { - type Item = i64; - - fn size(&self) -> usize { - self.size() - } - - fn value(&self, row_id: usize) -> Self::Item { - self.value(row_id) - } - - fn values(&self, row_ids: &[usize]) -> Vec { - self.values(row_ids) - } - - fn encoded_values(&self, row_ids: &[usize]) -> Vec { - self.encoded_values(row_ids) - } - - fn all_encoded_values(&self) -> Vec { - self.all_encoded_values() - } - - fn scan_from(&self, row_id: usize) -> &[Self::Item] { - self.scan_from(row_id) - } - - fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item { - self.sum_by_ids(row_ids) - } - - fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item { - self.sum_by_id_range(from_row_id, to_row_id) - } - - fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize { - self.count_by_id_range(from_row_id, to_row_id) - } - - fn row_ids_single_cmp_roaring( - &self, - wanted: &Self::Item, - order: std::cmp::Ordering, - ) -> croaring::Bitmap { - self.row_ids_single_cmp_roaring(wanted, order) - } - - fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap { - self.row_ids_gte_lt_roaring(from, to) - } - - fn row_id_eq_value(&self, v: Self::Item) -> Option { - self.row_id_eq_value(v) - } -} - #[derive(Debug, Default)] pub struct DictionaryRLE { // stores the mapping between an entry and its assigned index. @@ -602,7 +493,6 @@ pub struct DictionaryRLE { // stores tuples where each pair refers to a dictionary entry and the number // of times the entry repeats. run_lengths: Vec<(usize, u64)>, - run_length_size: usize, total: u64, } @@ -615,7 +505,6 @@ impl DictionaryRLE { index_entry: BTreeMap::new(), map_size: 0, run_lengths: Vec::new(), - run_length_size: 0, total: 0, } } @@ -627,7 +516,6 @@ impl DictionaryRLE { index_entry: BTreeMap::new(), map_size: 0, run_lengths: Vec::new(), - run_length_size: 0, total: 0, }; @@ -663,7 +551,6 @@ impl DictionaryRLE { } else { // start a new run-length self.run_lengths.push((*idx, additional)); - self.run_length_size += std::mem::size_of::<(usize, u64)>(); } self.index_row_ids .get_mut(&(*idx as u32)) @@ -690,7 +577,6 @@ impl DictionaryRLE { .get_mut(&(idx as u32)) .unwrap() .add_range(self.total..self.total + additional); - self.run_length_size += std::mem::size_of::<(usize, u64)>(); } } } @@ -927,8 +813,28 @@ impl DictionaryRLE { } pub fn size(&self) -> usize { - // mapping and reverse mapping then the rles - 2 * self.map_size + self.run_length_size + // entry_index: BTreeMap, usize>, + + // // stores the mapping between an index and its entry. + // index_entry: BTreeMap>, + + (self.index_entry.len() * size_of::>>()) + + (self.index_row_ids.len() * size_of::>()) + + size_of::() + + (self.run_lengths.len() * size_of::>()) + + size_of::() + } +} + +impl std::fmt::Display for DictionaryRLE { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "[DictionaryRLE] size: {}, dict entries: {}, runs: {} ", + self.size(), + self.index_entry.len(), + self.run_lengths.len() + ) } } @@ -959,6 +865,7 @@ impl std::convert::From<&delorean_table::Packer> for #[cfg(test)] mod test { + use super::NumericEncoding; #[test] fn plain_arrow() { diff --git a/delorean_mem_qe/src/segment.rs b/delorean_mem_qe/src/segment.rs index 9a15b03430..f4315c8234 100644 --- a/delorean_mem_qe/src/segment.rs +++ b/delorean_mem_qe/src/segment.rs @@ -934,6 +934,21 @@ impl Segment { } } +impl std::fmt::Display for Segment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!( + f, + "Rows: {}\nSize: {} Columns: ", + self.num_rows(), + self.size() + )?; + for (c, name) in self.columns.iter().zip(self.column_names().iter()) { + writeln!(f, "{} {}", name, c)?; + } + Ok(()) + } +} + /// Meta data for a segment. This data is mainly used to determine if a segment /// may contain a value that can answer a query. #[derive(Debug)]