feat: expose all column stats

pull/24376/head
Edd Robinson 2021-05-11 11:09:12 +01:00
parent 88ed58aa8a
commit c4987028fb
5 changed files with 70 additions and 3 deletions

View File

@ -153,6 +153,18 @@ impl Column {
}
}
/// Returns statistics about the physical layout of this column.
///
/// The call is forwarded to the `storage_stats` method of the encoded data
/// held in the second field of whichever variant this column is; the first
/// field of the variant is ignored.
pub(crate) fn storage_stats(&self) -> Statistics {
    match self {
        Self::String(_, enc) => enc.storage_stats(),
        Self::Float(_, enc) => enc.storage_stats(),
        Self::Integer(_, enc) => enc.storage_stats(),
        Self::Unsigned(_, enc) => enc.storage_stats(),
        Self::Bool(_, enc) => enc.storage_stats(),
        Self::ByteArray(_, enc) => enc.storage_stats(),
    }
}
pub fn properties(&self) -> &ColumnProperties {
match &self {
Self::String(meta, _) => &meta.properties,
@ -1309,6 +1321,15 @@ impl Iterator for RowIDsIterator<'_> {
}
}
/// Statistics about the composition of a column.
///
/// One value is produced per column by the `storage_stats` methods on the
/// column encodings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Statistics {
    /// Name of the encoding used to store the column (the boolean encoding
    /// fills this from its `name()` method).
    enc_type: &'static str,
    /// Name of the column's logical data type, e.g. `"bool"`.
    log_data_type: &'static str,
    /// Row count as reported by the encoding (the boolean encoding fills
    /// this from `num_rows()`).
    values: u32,
    /// Number of NULL values as reported by the encoding's `null_count()`.
    nulls: u32,
    /// Size in bytes as reported by the encoding's `size()`.
    bytes: usize,
}
#[cfg(test)]
mod test {
use super::*;

View File

@ -1,5 +1,5 @@
use super::cmp;
use super::encoding::bool::Bool;
use super::{cmp, Statistics};
use crate::column::{RowIDs, Value, Values};
/// Encodings for boolean values.
@ -22,6 +22,17 @@ impl BooleanEncoding {
}
}
/// Returns statistics about the physical layout of this boolean column.
///
/// The result carries the encoding's name, the fixed logical type `"bool"`,
/// and the row count, NULL count and size reported by this encoding.
pub(crate) fn storage_stats(&self) -> Statistics {
    let enc_type = self.name();
    let values = self.num_rows();
    let nulls = self.null_count();
    let bytes = self.size();

    Statistics {
        enc_type,
        log_data_type: "bool",
        values,
        nulls,
        bytes,
    }
}
/// Determines if the column contains a NULL value.
pub fn contains_null(&self) -> bool {
match self {
@ -29,6 +40,13 @@ impl BooleanEncoding {
}
}
/// The number of NULL values in the column, as reported by the underlying
/// encoding.
pub fn null_count(&self) -> u32 {
    match self {
        Self::BooleanNull(enc) => enc.null_count(),
    }
}
/// Determines if the column contains a non-null value.
pub fn has_any_non_null_value(&self) -> bool {
match self {
@ -106,6 +124,13 @@ impl BooleanEncoding {
Self::BooleanNull(c) => c.count(row_ids),
}
}
/// The name of this encoding, as a static string.
///
/// The only variant, `BooleanNull`, is reported as `"None"`.
pub fn name(&self) -> &'static str {
    match self {
        Self::BooleanNull(_) => "None",
    }
}
}
impl std::fmt::Display for BooleanEncoding {

View File

@ -36,6 +36,10 @@ impl Bool {
self.arr.null_count() > 0
}
/// Returns the number of NULL values held by the underlying array.
pub fn null_count(&self) -> u32 {
    let nulls = self.arr.null_count();
    // NOTE(review): `as` silently truncates if the count exceeds `u32::MAX`;
    // presumably row counts here always fit in a `u32` — confirm.
    nulls as u32
}
/// Returns an estimation of the total size in bytes used by this column
/// encoding.
pub fn size(&self) -> usize {

View File

@ -12,7 +12,7 @@ use hashbrown::{hash_map, HashMap};
use itertools::Itertools;
use snafu::{ResultExt, Snafu};
use crate::column::{cmp::Operator, Column, RowIDs, RowIDsOption};
use crate::column::{self, cmp::Operator, Column, RowIDs, RowIDsOption};
use crate::schema;
use crate::schema::{AggregateType, LogicalDataType, ResultSchema};
use crate::value::{
@ -1059,6 +1059,10 @@ impl RowGroup {
dst
}
/// Returns the storage statistics of every column in this row group.
///
/// The result has one entry per element of `self.columns`, in the same
/// order.
pub(crate) fn column_storage_statistics(&self) -> Vec<column::Statistics> {
    let per_column = self.columns.iter().map(|col| col.storage_stats());
    per_column.collect()
}
}
impl std::fmt::Display for &RowGroup {

View File

@ -12,9 +12,12 @@ use arrow::record_batch::RecordBatch;
use data_types::{chunk::ChunkColumnSummary, partition_metadata::TableSummary};
use internal_types::selection::Selection;
use crate::row_group::{self, ColumnName, Predicate, RowGroup};
use crate::schema::{AggregateType, ColumnType, LogicalDataType, ResultSchema};
use crate::value::{OwnedValue, Scalar, Value};
use crate::{
column,
row_group::{self, ColumnName, Predicate, RowGroup},
};
#[derive(Debug, Snafu)]
pub enum Error {
@ -513,6 +516,16 @@ impl Table {
.iter()
.any(|row_group| row_group.satisfies_predicate(predicate))
}
/// Returns the storage statistics of every column in every row group of
/// this table, concatenated into a single flat list.
///
/// Entries follow the iteration order of `table_data.data`; within one row
/// group they follow the order returned by that row group's
/// `column_storage_statistics`.
pub(crate) fn column_storage_statistics(&self) -> Vec<column::Statistics> {
    // The value returned by `read()` is kept alive until the end of the
    // function (presumably a read-lock guard — confirm against the field's
    // type).
    let table_data = self.table_data.read();
    table_data
        .data
        .iter()
        .flat_map(|rg| rg.column_storage_statistics())
        .collect()
}
}
// TODO(edd): reduce owned strings here by, e.g., using references as keys.