diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index 5a0551c485..9b498dbc47 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -153,6 +153,18 @@ impl Column { } } + // Returns statistics about the physical layout of columns + pub(crate) fn storage_stats(&self) -> Statistics { + match &self { + Self::String(_, data) => data.storage_stats(), + Self::Float(_, data) => data.storage_stats(), + Self::Integer(_, data) => data.storage_stats(), + Self::Unsigned(_, data) => data.storage_stats(), + Self::Bool(_, data) => data.storage_stats(), + Self::ByteArray(_, data) => data.storage_stats(), + } + } + pub fn properties(&self) -> &ColumnProperties { match &self { Self::String(meta, _) => &meta.properties, @@ -1309,6 +1321,15 @@ impl Iterator for RowIDsIterator<'_> { } } +// Statistics about the composition of a column +pub(crate) struct Statistics { + enc_type: &'static str, + log_data_type: &'static str, + values: u32, + nulls: u32, + bytes: usize, +} + #[cfg(test)] mod test { use super::*; diff --git a/read_buffer/src/column/boolean.rs b/read_buffer/src/column/boolean.rs index a6da69f111..064025ea42 100644 --- a/read_buffer/src/column/boolean.rs +++ b/read_buffer/src/column/boolean.rs @@ -1,5 +1,5 @@ -use super::cmp; use super::encoding::bool::Bool; +use super::{cmp, Statistics}; use crate::column::{RowIDs, Value, Values}; /// Encodings for boolean values. @@ -22,6 +22,17 @@ impl BooleanEncoding { } } + // Returns statistics about the physical layout of columns + pub(crate) fn storage_stats(&self) -> Statistics { + Statistics { + enc_type: self.name(), + log_data_type: "bool", + values: self.num_rows(), + nulls: self.null_count(), + bytes: self.size(), + } + } + /// Determines if the column contains a NULL value. pub fn contains_null(&self) -> bool { match self { @@ -29,6 +40,13 @@ impl BooleanEncoding { } } + /// The total number of NULL values in the column. 
+ pub fn null_count(&self) -> u32 { + match self { + Self::BooleanNull(enc) => enc.null_count(), + } + } + /// Determines if the column contains a non-null value. pub fn has_any_non_null_value(&self) -> bool { match self { @@ -106,6 +124,13 @@ impl BooleanEncoding { Self::BooleanNull(c) => c.count(row_ids), } } + + /// The name of this encoding. + pub fn name(&self) -> &'static str { + match &self { + Self::BooleanNull(_) => "None", + } + } } impl std::fmt::Display for BooleanEncoding { diff --git a/read_buffer/src/column/encoding/bool.rs b/read_buffer/src/column/encoding/bool.rs index b9d6b58669..0752535d84 100644 --- a/read_buffer/src/column/encoding/bool.rs +++ b/read_buffer/src/column/encoding/bool.rs @@ -36,6 +36,10 @@ impl Bool { self.arr.null_count() > 0 } + pub fn null_count(&self) -> u32 { + self.arr.null_count() as u32 + } + /// Returns an estimation of the total size in bytes used by this column /// encoding. pub fn size(&self) -> usize { diff --git a/read_buffer/src/row_group.rs b/read_buffer/src/row_group.rs index 60f18c8f8c..693870aad4 100644 --- a/read_buffer/src/row_group.rs +++ b/read_buffer/src/row_group.rs @@ -12,7 +12,7 @@ use hashbrown::{hash_map, HashMap}; use itertools::Itertools; use snafu::{ResultExt, Snafu}; -use crate::column::{cmp::Operator, Column, RowIDs, RowIDsOption}; +use crate::column::{self, cmp::Operator, Column, RowIDs, RowIDsOption}; use crate::schema; use crate::schema::{AggregateType, LogicalDataType, ResultSchema}; use crate::value::{ @@ -1059,6 +1059,10 @@ impl RowGroup { dst } + + pub(crate) fn column_storage_statistics(&self) -> Vec<column::Statistics> { + self.columns.iter().map(|c| c.storage_stats()).collect() + } } impl std::fmt::Display for &RowGroup { diff --git a/read_buffer/src/table.rs b/read_buffer/src/table.rs index 713a430e2a..fa9bc3c216 100644 --- a/read_buffer/src/table.rs +++ b/read_buffer/src/table.rs @@ -12,9 +12,12 @@ use arrow::record_batch::RecordBatch; use data_types::{chunk::ChunkColumnSummary, 
partition_metadata::TableSummary}; use internal_types::selection::Selection; -use crate::row_group::{self, ColumnName, Predicate, RowGroup}; use crate::schema::{AggregateType, ColumnType, LogicalDataType, ResultSchema}; use crate::value::{OwnedValue, Scalar, Value}; +use crate::{ + column, + row_group::{self, ColumnName, Predicate, RowGroup}, +}; #[derive(Debug, Snafu)] pub enum Error { @@ -513,6 +516,16 @@ impl Table { .iter() .any(|row_group| row_group.satisfies_predicate(predicate)) } + + pub(crate) fn column_storage_statistics(&self) -> Vec<column::Statistics> { + let table_data = self.table_data.read(); + table_data + .data + .iter() + .map(|rg| rg.column_storage_statistics()) + .flatten() + .collect() + } } // TODO(edd): reduce owned strings here by, e.g., using references as keys.