refactor: Have MUB chunk snapshots hold a TableSummaryAndTimes rather than only ColumnSummaries

This way the first/last write times are available on MUB chunk snapshots
without needing to deconstruct and reconstruct TableSummaries, as the
previous code did.
pull/24376/head
Carol (Nichols || Goulding) 2021-07-19 09:25:21 -04:00
parent 956647d966
commit 799406be72
1 changed file with 9 additions and 12 deletions

View File

@ -2,7 +2,7 @@ use super::MBChunk;
use arrow::record_batch::RecordBatch;
use data_types::{
error::ErrorLogger,
partition_metadata::{ColumnSummary, Statistics, TableSummary},
partition_metadata::{Statistics, TableSummaryAndTimes},
timestamp::TimestampRange,
};
use internal_types::{
@ -31,7 +31,7 @@ pub struct ChunkSnapshot {
schema: Arc<Schema>,
batch: RecordBatch,
table_name: Arc<str>,
stats: Vec<ColumnSummary>,
summary: TableSummaryAndTimes,
}
impl ChunkSnapshot {
@ -52,7 +52,7 @@ impl ChunkSnapshot {
schema: Arc::new(schema),
batch,
table_name: Arc::clone(&chunk.table_name),
stats: summary.columns,
summary,
}
}
@ -99,19 +99,16 @@ impl ChunkSnapshot {
})
}
/// Returns a vec of the summary statistics of the tables in this chunk
pub fn table_summary(&self) -> TableSummary {
TableSummary {
name: self.table_name.to_string(),
columns: self.stats.clone(),
}
/// Returns a table summary for this chunk
pub fn table_summary(&self) -> &TableSummaryAndTimes {
&self.summary
}
/// Return the approximate memory size of the chunk, in bytes including the
/// dictionary, tables, statistics and their rows.
pub fn size(&self) -> usize {
let columns = self.column_sizes().map(|(_, size)| size).sum::<usize>();
let stats = self.stats.iter().map(|c| c.size()).sum::<usize>();
let stats = self.summary.columns.iter().map(|c| c.size()).sum::<usize>();
columns + stats + std::mem::size_of::<Self>()
}
@ -123,7 +120,7 @@ impl ChunkSnapshot {
self.batch
.columns()
.iter()
.zip(self.stats.iter())
.zip(self.summary.columns.iter())
.map(move |(array, summary)| {
let size = array.get_array_memory_size() + array.get_buffer_memory_size();
(summary.name.as_str(), size)
@ -143,7 +140,7 @@ impl ChunkSnapshot {
self.schema
.find_index_of(TIME_COLUMN_NAME)
.and_then(|idx| match &self.stats[idx].stats {
.and_then(|idx| match &self.summary.columns[idx].stats {
Statistics::I64(stats) => Some(
!TimestampRange::new(stats.min? as _, stats.max? as _)
.disjoint(timestamp_range),