Merge pull request #816 from influxdata/er/feat/read_buffer/size

feat: implement `size` on Read Buffer
pull/24376/head
kodiakhq[bot] 2021-02-16 22:56:07 +00:00 committed by GitHub
commit 7de5d35bab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 303 additions and 155 deletions

View File

@ -56,7 +56,6 @@ pub struct Chunk {
// Tie data and meta-data together so that they can be wrapped in RWLock.
struct TableData {
size: u64, // size in bytes of the chunk
rows: u64, // Total number of rows across all tables
// Total number of row groups across all tables in the chunk.
@ -72,7 +71,6 @@ impl Chunk {
Self {
id,
chunk_data: RwLock::new(TableData {
size: table.size(),
rows: table.rows(),
row_groups: table.row_groups(),
data: vec![(table.name().to_owned(), table)].into_iter().collect(),
@ -85,9 +83,18 @@ impl Chunk {
self.id
}
/// The total size in bytes of all row groups in all tables in this chunk.
/// The total estimated size in bytes of this `Chunk` and all contained
/// data.
pub fn size(&self) -> u64 {
self.chunk_data.read().unwrap().size
let base_size = std::mem::size_of::<Self>();
let table_data = self.chunk_data.read().unwrap();
base_size as u64
+ table_data
.data
.iter()
.map(|(k, table)| k.len() as u64 + table.size())
.sum::<u64>()
}
/// The total number of rows in all row groups in all tables in this chunk.
@ -127,7 +134,6 @@ impl Chunk {
let mut chunk_data = self.chunk_data.write().unwrap();
// update the meta-data for this chunk with contents of row group.
chunk_data.size += row_group.size();
chunk_data.rows += row_group.rows() as u64;
chunk_data.row_groups += 1;
@ -153,7 +159,6 @@ impl Chunk {
// Remove table and update chunk meta-data if table exists.
if let Some(table) = chunk_data.data.remove(name) {
chunk_data.size -= table.size();
chunk_data.rows -= table.rows();
chunk_data.row_groups -= table.row_groups();
}
@ -327,6 +332,8 @@ mod test {
assert_eq!(chunk.rows(), 6);
assert_eq!(chunk.row_groups(), 1);
assert_eq!(chunk.tables(), 1);
let chunk_size = chunk.size();
assert!(chunk_size > 0);
// Add a row group to the same table in the Chunk.
let columns = vec![("time", ColumnType::create_time(&[-2_i64, 2, 8]))]
@ -334,7 +341,9 @@ mod test {
.map(|(k, v)| (k.to_owned(), v))
.collect();
let rg = RowGroup::new(3, columns);
let rg_size = rg.size();
chunk.upsert_table("table_1", rg);
assert_eq!(chunk.size(), chunk_size + rg_size);
assert_eq!(chunk.rows(), 9);
assert_eq!(chunk.row_groups(), 2);
@ -353,16 +362,19 @@ mod test {
assert_eq!(chunk.tables(), 2);
// Drop table_1
let chunk_size = chunk.size();
chunk.drop_table("table_1");
assert_eq!(chunk.rows(), 2);
assert_eq!(chunk.row_groups(), 1);
assert_eq!(chunk.tables(), 1);
assert!(chunk.size() < chunk_size);
// Drop table_2 - empty table
chunk.drop_table("table_2");
assert_eq!(chunk.rows(), 0);
assert_eq!(chunk.row_groups(), 0);
assert_eq!(chunk.tables(), 0);
assert_eq!(chunk.size(), 64); // base size of `Chunk`
// Drop table_2 - no-op
chunk.drop_table("table_2");

View File

@ -5,7 +5,7 @@ pub mod float;
pub mod integer;
pub mod string;
use std::collections::BTreeSet;
use std::{collections::BTreeSet, mem::size_of};
use croaring::Bitmap;
use either::Either;
@ -48,14 +48,61 @@ impl Column {
//
// Meta information about the column
//
/// The estimated size in bytes of the column.
pub fn size(&self) -> u64 {
    // `MetaData` is generic, so each variant can carry a differently sized
    // (min, max) range; compute the meta-data footprint per variant, where
    // the concrete `T` is known.
    let props = size_of::<ColumnProperties>();
    match &self {
        Column::String(meta, data) => {
            // Fixed-size meta plus the heap bytes of the (min, max) strings.
            let mut meta_size = (size_of::<Option<(String, String)>>() + props) as u64;
            if let Some((min, max)) = &meta.range {
                meta_size += (min.len() + max.len()) as u64;
            }
            meta_size + data.size()
        }
        Column::Float(_, data) => {
            (size_of::<Option<(f64, f64)>>() + props) as u64 + data.size()
        }
        Column::Integer(_, data) => {
            (size_of::<Option<(i64, i64)>>() + props) as u64 + data.size()
        }
        Column::Unsigned(_, data) => {
            (size_of::<Option<(u64, u64)>>() + props) as u64 + data.size()
        }
        Column::Bool(_, data) => {
            (size_of::<Option<(bool, bool)>>() + props) as u64 + data.size()
        }
        Column::ByteArray(meta, data) => {
            // Fixed-size meta plus the heap bytes of the (min, max) arrays.
            let mut meta_size =
                (size_of::<Option<(Vec<u8>, Vec<u8>)>>() + props) as u64;
            if let Some((min, max)) = &meta.range {
                meta_size += (min.len() + max.len()) as u64;
            }
            meta_size + data.size()
        }
    }
}
pub fn num_rows(&self) -> u32 {
match &self {
Column::String(meta, _) => meta.rows,
Column::Float(meta, _) => meta.rows,
Column::Integer(meta, _) => meta.rows,
Column::Unsigned(meta, _) => meta.rows,
Column::Bool(meta, _) => meta.rows,
Column::ByteArray(meta, _) => meta.rows,
Column::String(_, data) => data.num_rows(),
Column::Float(_, data) => data.num_rows(),
Column::Integer(_, data) => data.num_rows(),
Column::Unsigned(_, data) => data.num_rows(),
Column::Bool(_, data) => data.num_rows(),
Column::ByteArray(_, data) => data.num_rows(),
}
}
@ -71,10 +118,6 @@ impl Column {
}
}
pub fn size(&self) -> u64 {
0
}
/// Returns the (min, max) values stored in this column
pub fn column_range(&self) -> Option<(OwnedValue, OwnedValue)> {
match &self {
@ -652,12 +695,6 @@ pub struct MetaData<T>
where
T: PartialOrd + std::fmt::Debug,
{
// The total size of the column in bytes.
size: u64,
// The total number of rows in the column.
rows: u32,
// The minimum and maximum value for this column.
range: Option<(T, T)>,
@ -735,9 +772,7 @@ impl From<arrow::array::StringArray> for Column {
fn from(arr: arrow::array::StringArray) -> Self {
let data = StringEncoding::from(arr);
let meta = MetaData {
rows: data.num_rows(),
range: data.column_range(),
size: data.size(),
properties: ColumnProperties {
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
},
@ -751,9 +786,7 @@ impl From<&[Option<&str>]> for Column {
fn from(arr: &[Option<&str>]) -> Self {
let data = StringEncoding::from(arr);
let meta = MetaData {
rows: data.num_rows(),
range: data.column_range(),
size: data.size(),
properties: ColumnProperties {
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
},
@ -778,9 +811,7 @@ impl From<&[&str]> for Column {
fn from(arr: &[&str]) -> Self {
let data = StringEncoding::from(arr);
let meta = MetaData {
rows: data.num_rows(),
range: data.column_range(),
size: data.size(),
properties: ColumnProperties {
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
},
@ -804,8 +835,6 @@ impl From<&[u64]> for Column {
let data = IntegerEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range: Some((min, max)),
properties: ColumnProperties::default(),
};
@ -857,8 +886,6 @@ impl From<arrow::array::UInt64Array> for Column {
let data = IntegerEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range,
properties: ColumnProperties::default(),
};
@ -881,8 +908,6 @@ impl From<&[i64]> for Column {
let data = IntegerEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range: Some((min, max)),
properties: ColumnProperties::default(),
};
@ -934,8 +959,6 @@ impl From<arrow::array::Int64Array> for Column {
let data = IntegerEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range,
properties: ColumnProperties::default(),
};
@ -957,8 +980,6 @@ impl From<&[f64]> for Column {
let data = FloatEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range: Some((min, max)),
properties: ColumnProperties::default(),
};
@ -1010,8 +1031,6 @@ impl From<arrow::array::Float64Array> for Column {
let data = fixed_null::FixedNull::<arrow::datatypes::Float64Type>::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range,
..MetaData::default()
};
@ -1059,8 +1078,6 @@ impl From<arrow::array::BooleanArray> for Column {
let data = BooleanEncoding::from(arr);
let meta = MetaData {
size: data.size(),
rows: data.num_rows(),
range,
..MetaData::default()
};
@ -1132,6 +1149,16 @@ impl RowIDs {
panic!("cannot unwrap RowIDs to Vector");
}
/// An estimation of the size in bytes needed to store `self`.
pub fn size(&self) -> usize {
match self {
RowIDs::Bitmap(bm) => std::mem::size_of::<Bitmap>() + bm.get_serialized_size_in_bytes(),
RowIDs::Vector(v) => {
std::mem::size_of::<Vec<u32>>() + (std::mem::size_of::<u32>() * v.len())
}
}
}
// Converts the RowIDs to a Vec<u32>. This is expensive and should only be
// used for testing.
pub fn to_vec(&self) -> Vec<u32> {
@ -1243,8 +1270,6 @@ mod test {
assert_eq!(
meta,
super::MetaData::<String> {
size: 317,
rows: 4,
range: Some(("hello".to_string(), "world".to_string())),
properties: ColumnProperties {
has_pre_computed_row_ids: true
@ -1271,8 +1296,6 @@ mod test {
assert_eq!(
meta,
super::MetaData::<String> {
size: 301,
rows: 2,
range: Some(("hello".to_string(), "world".to_string())),
properties: ColumnProperties {
has_pre_computed_row_ids: true
@ -1336,8 +1359,6 @@ mod test {
let input = &[0, -12, u16::MAX as i64, 5];
let col = Column::from(&input[..]);
if let Column::Integer(meta, IntegerEncoding::I64I32(_)) = col {
assert_eq!(meta.size, 40); // 4 i32s (16b) and a vec (24b)
assert_eq!(meta.rows, 4);
assert_eq!(meta.range, Some((-12, u16::MAX as i64)));
} else {
panic!("invalid variant");
@ -1368,8 +1389,6 @@ mod test {
let input = &[13, 12, u16::MAX as u64, 5];
let col = Column::from(&input[..]);
if let Column::Unsigned(meta, IntegerEncoding::U64U16(_)) = col {
assert_eq!(meta.size, 32); // 4 u16s (8b) and a vec (24b)
assert_eq!(meta.rows, 4);
assert_eq!(meta.range, Some((5, u16::MAX as u64)));
} else {
panic!("invalid variant");

View File

@ -36,11 +36,10 @@ impl Bool {
self.arr.null_count() > 0
}
/// Returns the total size in bytes of the encoded data. Note, this method
/// is really an "accurate" estimation. It doesn't include for example the
/// size of the `Plain` struct receiver.
/// Returns an estimation of the total size in bytes used by this column
/// encoding.
pub fn size(&self) -> u64 {
0
(std::mem::size_of::<BooleanArray>() + self.arr.get_array_memory_size()) as u64
}
//
@ -342,6 +341,12 @@ mod test {
v.iter().map(|x| Some(*x)).collect()
}
#[test]
fn size() {
    // Four boolean values, two of them NULL. `size()` is the `BooleanArray`
    // struct size plus arrow's reported array memory, so the expected 240
    // bytes is tied to the current arrow buffer layout — it will need
    // updating if arrow changes its allocation strategy.
    let v = Bool::from(vec![None, None, Some(true), Some(false)].as_slice());
    assert_eq!(v.size(), 240);
}
#[test]
fn first_row_id_eq_value() {
let v = Bool::from(vec![true, true].as_slice());

View File

@ -86,11 +86,19 @@ impl RLE {
let index_entry_size = size_of::<Vec<String>>() // container size
+ (size_of::<String>() * self.index_entries.len()) // elements size
+ decoded_keys_size; // heal allocated strings size
+ decoded_keys_size; // heap allocated strings size
// The total size (an upper bound estimate) of all the bitmaps
// in the column.
let row_ids_bitmaps_size = self
.index_row_ids
.values()
.map(|row_ids| row_ids.size())
.sum::<usize>();
// TODO(edd): The Bitmaps on-heap size!!
let index_row_ids_size = size_of::<BTreeMap<u32, Bitmap>>()
+ ((size_of::<u32>() + size_of::<Bitmap>()) * self.index_row_ids.len());
+ (size_of::<u32>() * self.index_row_ids.len())
+ row_ids_bitmaps_size;
let run_lengths_size = size_of::<Vec<(u32, u32)>>() + // container size
(size_of::<(u32, u32)>() * self.run_lengths.len()); // each run-length size
@ -994,16 +1002,24 @@ mod test {
enc.push_none();
enc.push_none();
// keys - 14 bytes.
// entry_index is 24 + ((24+4) * 3) + 14 == 122
// index_entry is 24 + (24*4) + 14 == 134
// index_row_ids is 24 + (4 + 0?? * 4) == 40 ??????
// run lengths is 24 + (8*5) == 64
// 360
// `entry_index` is 24 + ((24+4) * 3) + 14 == 122
//
// Note: there are 4 index entries to account for NULL entry.
// `index_entry` is 24 + (24*4) + 14 == 134
//
// bitmaps for east, north, south and NULL entries.
// `index_row_ids` is 24 + (4 * 4) + (108b for bitmaps) == 148
//
// `run lengths` is 24 + (8*5) == 64
//
// `contains_null` - 1 byte
// `num_rows` - 4 bytes
//
// 473
// TODO(edd): there some mystery bytes in the bitmap implementation.
// need to figure out how to measure these
assert_eq!(enc.size(), 397);
assert_eq!(enc.size(), 473);
}
#[test]

View File

@ -66,11 +66,10 @@ where
self.arr.null_count() > 0
}
/// Returns the total size in bytes of the encoded data. Note, this method
/// is really an "accurate" estimation. It doesn't include for example the
/// size of the `Plain` struct receiver.
/// Returns an estimation of the total size in bytes used by this column
/// encoding.
pub fn size(&self) -> u64 {
0
(std::mem::size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()) as u64
}
//
@ -598,6 +597,12 @@ mod test {
v.iter().map(|x| Some(*x)).collect()
}
#[test]
fn size() {
    // Four u64 values, two of them NULL. `size()` is the `PrimitiveArray`
    // struct size plus arrow's reported array memory, so the expected 240
    // bytes is tied to the current arrow buffer layout — it will need
    // updating if arrow changes its allocation strategy.
    let v = FixedNull::<UInt64Type>::from(vec![None, None, Some(100), Some(2222)].as_slice());
    assert_eq!(v.size(), 240);
}
#[test]
fn first_row_id_eq_value() {
let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());

View File

@ -79,9 +79,6 @@ struct PartitionData {
// identified by a partition key
partitions: BTreeMap<String, Partition>,
// The current total size of the database.
size: u64,
// Total number of rows in the database.
rows: u64,
}
@ -108,7 +105,6 @@ impl Database {
// Take lock on partitions and update.
let mut partition_data = self.data.write().unwrap();
partition_data.size += row_group.size();
partition_data.rows += row_group.rows() as u64;
// create a new chunk if one doesn't exist, or add the table data to
@ -141,7 +137,6 @@ impl Database {
.remove(partition_key)
.context(PartitionNotFound { key: partition_key })?;
partition_data.size -= partition.size();
partition_data.rows -= partition.rows();
Ok(())
}
@ -156,7 +151,6 @@ impl Database {
.context(PartitionNotFound { key: partition_key })?;
partition.drop_chunk(chunk_id).map(|chunk| {
partition_data.size -= chunk.size();
partition_data.rows -= chunk.rows();
// don't return chunk from `drop_chunk`
})
@ -185,8 +179,17 @@ impl Database {
.unwrap_or_default()
}
/// Returns the total estimated size in bytes of the database.
pub fn size(&self) -> u64 {
self.data.read().unwrap().size
let base_size = std::mem::size_of::<Self>();
let partition_data = self.data.read().unwrap();
base_size as u64
+ partition_data
.partitions
.iter()
.map(|(name, partition)| name.len() as u64 + partition.size())
.sum::<u64>()
}
pub fn rows(&self) -> u64 {
@ -502,7 +505,7 @@ impl fmt::Debug for Database {
let partition_data = self.data.read().unwrap();
f.debug_struct("Database")
.field("partitions", &partition_data.partitions.keys())
.field("size", &partition_data.size)
.field("size", &self.size())
.finish()
}
}
@ -513,9 +516,6 @@ struct ChunkData {
// identified by a chunk id.
chunks: BTreeMap<u32, Chunk>,
// The current total size of the partition.
size: u64,
// The current number of row groups in this partition.
row_groups: usize,
@ -537,7 +537,6 @@ impl Partition {
Self {
key: partition_key.to_owned(),
data: RwLock::new(ChunkData {
size: chunk.size(),
row_groups: chunk.row_groups(),
rows: chunk.rows(),
chunks: vec![(chunk.id(), chunk)].into_iter().collect(),
@ -555,7 +554,6 @@ impl Partition {
fn upsert_chunk(&mut self, chunk_id: u32, table_name: String, row_group: RowGroup) {
let mut chunk_data = self.data.write().unwrap();
chunk_data.size += row_group.size();
chunk_data.row_groups += 1;
chunk_data.rows += row_group.rows() as u64;
@ -581,7 +579,6 @@ impl Partition {
.remove(&chunk_id)
.context(ChunkNotFound { id: chunk_id })?;
chunk_data.size -= chunk.size();
chunk_data.rows -= chunk.rows();
chunk_data.row_groups -= chunk.row_groups();
Ok(chunk)
@ -624,8 +621,18 @@ impl Partition {
self.data.read().unwrap().rows
}
/// The total estimated size in bytes of the `Partition` and all contained
/// data.
pub fn size(&self) -> u64 {
self.data.read().unwrap().size
let base_size = std::mem::size_of::<Self>() + self.key.len();
let chunk_data = self.data.read().unwrap();
base_size as u64
+ chunk_data
.chunks
.values()
.map(|chunk| std::mem::size_of::<u32>() as u64 + chunk.size())
.sum::<u64>()
}
}

View File

@ -69,13 +69,13 @@ impl RowGroup {
let mut time_column = None;
for (name, ct) in columns {
meta.size += ct.size();
match ct {
ColumnType::Tag(c) => {
assert_eq!(c.num_rows(), rows);
meta.add_column(
&name,
c.size(),
schema::ColumnType::Tag(name.clone()),
c.logical_datatype(),
c.column_range().unwrap(),
@ -89,6 +89,7 @@ impl RowGroup {
meta.add_column(
&name,
c.size(),
schema::ColumnType::Field(name.clone()),
c.logical_datatype(),
c.column_range().unwrap(),
@ -110,6 +111,7 @@ impl RowGroup {
meta.add_column(
&name,
c.size(),
schema::ColumnType::Timestamp(name.clone()),
c.logical_datatype(),
c.column_range().unwrap(),
@ -133,9 +135,15 @@ impl RowGroup {
}
}
/// The total size in bytes of the read group
/// The total estimated size in bytes of the read group
pub fn size(&self) -> u64 {
self.meta.size
let base_size = std::mem::size_of::<Self>()
+ self
.all_columns_by_name
.iter()
.map(|(key, value)| key.len() + std::mem::size_of::<usize>())
.sum::<usize>();
base_size as u64 + self.meta.size()
}
/// The number of rows in the `RowGroup` (all columns have the same number
@ -1377,6 +1385,12 @@ pub struct ColumnMeta {
pub range: (OwnedValue, OwnedValue),
}
impl ColumnMeta {
    /// The estimated size in bytes of the column meta data, including any
    /// heap allocations owned by the (min, max) range values.
    pub fn size(&self) -> usize {
        let (min, max) = &self.range;
        std::mem::size_of::<Self>() + min.size() + max.size()
    }
}
// column metadata is equivalent for two columns if their logical type and
// semantic type are equivalent.
impl PartialEq for ColumnMeta {
@ -1387,10 +1401,10 @@ impl PartialEq for ColumnMeta {
#[derive(Default, Debug)]
pub struct MetaData {
// The total size of the table in bytes.
pub size: u64,
// The total size in bytes of all column data in the `RowGroup`.
pub columns_size: u64,
// The total number of rows in the table.
// The total number of rows in the `RowGroup`.
pub rows: u32,
// The distinct set of columns for this `RowGroup` (all of these columns
@ -1401,8 +1415,7 @@ pub struct MetaData {
// possibly match based on the range of values a column has.
pub columns: BTreeMap<String, ColumnMeta>,
// The total time range of this table spanning all of the `RowGroup`s within
// the table.
// The total time range of this `RowGroup`.
//
// This can be used to skip the table entirely if the time range for a query
// falls outside of this range.
@ -1410,6 +1423,21 @@ pub struct MetaData {
}
impl MetaData {
/// Returns the estimated size in bytes of the meta data and all column data
/// associated with a `RowGroup`.
pub fn size(&self) -> u64 {
    // The struct itself, plus the per-column meta data (key string bytes
    // and the `ColumnMeta` contents).
    let mut total = std::mem::size_of::<Self>();
    for (name, meta) in &self.columns {
        total += name.len() + meta.size();
    }
    // `columns_size` already aggregates the size of all column data in the
    // row group.
    total as u64 + self.columns_size
}
// helper function to determine if the provided binary expression could be
// satisfied in the `RowGroup`, If this function returns `false` then there
// no rows in the `RowGroup` would ever match the expression.
@ -1451,6 +1479,7 @@ impl MetaData {
pub fn add_column(
&mut self,
name: &str,
column_size: u64,
col_type: schema::ColumnType,
logical_data_type: LogicalDataType,
range: (OwnedValue, OwnedValue),
@ -1463,6 +1492,7 @@ impl MetaData {
range,
},
);
self.columns_size += column_size;
}
// Extract schema information for a set of columns.
@ -1932,6 +1962,33 @@ mod test {
Predicate::new(vec![expr])
}
#[test]
fn size() {
    // A row group with a 4-row tag column and a time column reports a
    // non-zero estimated size.
    let mut columns = BTreeMap::new();
    let rc = ColumnType::Tag(Column::from(&[Some("west"), Some("west"), None, None][..]));
    columns.insert("region".to_string(), rc);
    let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..]));
    columns.insert("time".to_string(), tc);
    let row_group = RowGroup::new(4, columns);
    let rg_size = row_group.size();
    assert!(rg_size > 0);
    // A second row group whose tag column holds more/longer string values
    // should be estimated as larger than the first.
    let mut columns = BTreeMap::new();
    let track = ColumnType::Tag(Column::from(
        &[Some("Thinking"), Some("of"), Some("a"), Some("place")][..],
    ));
    columns.insert("track".to_string(), track);
    let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..]));
    columns.insert("time".to_string(), tc);
    let row_group = RowGroup::new(4, columns);
    assert!(row_group.size() > rg_size);
}
#[test]
fn row_ids_from_predicates() {
let mut columns = BTreeMap::new();

View File

@ -72,7 +72,7 @@ impl Table {
Self {
name: name.into(),
table_data: RwLock::new(RowGroupData {
meta: Arc::new(MetaData::new(rg.metadata())),
meta: Arc::new(MetaData::new(&rg)),
data: vec![Arc::new(rg)],
}),
}
@ -85,7 +85,7 @@ impl Table {
// `meta` can't be modified whilst protected by an Arc so create a new one.
row_groups.meta = Arc::new(MetaData::update_with(
MetaData::clone(&row_groups.meta), // clone meta-data not Arc
rg.metadata(),
&rg,
));
// Add the new row group data to the table.
@ -126,7 +126,9 @@ impl Table {
/// The total size of the table in bytes.
pub fn size(&self) -> u64 {
self.table_data.read().unwrap().meta.size
let base_size = std::mem::size_of::<Self>() + self.name.len();
// meta.size accounts for all the row group data.
base_size as u64 + self.table_data.read().unwrap().meta.size()
}
// Returns the total number of row groups in this table.
@ -570,24 +572,39 @@ struct MetaData {
}
impl MetaData {
pub fn new(meta: &row_group::MetaData) -> Self {
pub fn new(rg: &row_group::RowGroup) -> Self {
Self {
size: meta.size,
rows: meta.rows as u64,
columns: meta.columns.clone(),
column_names: meta.columns.keys().cloned().collect(),
time_range: Some(meta.time_range),
size: rg.size(),
rows: rg.rows() as u64,
columns: rg.metadata().columns.clone(),
column_names: rg.metadata().columns.keys().cloned().collect(),
time_range: Some(rg.metadata().time_range),
}
}
/// Returns the estimated size in bytes of the `MetaData` struct and all of
/// the row group data associated with a `Table`.
pub fn size(&self) -> u64 {
    let mut total = std::mem::size_of::<Self>();
    // Per-column meta data: key string bytes plus `ColumnMeta` contents.
    for (name, meta) in &self.columns {
        total += name.len() + meta.size();
    }
    // Heap bytes of the cached column name strings.
    for name in &self.column_names {
        total += name.len();
    }
    // `self.size` accounts for all of the table's row group data.
    total as u64 + self.size
}
/// Create a new `MetaData` by consuming `this` and incorporating `other`.
pub fn update_with(mut this: Self, other: &row_group::MetaData) -> Self {
pub fn update_with(mut this: Self, rg: &row_group::RowGroup) -> Self {
let other = rg.metadata();
// The incoming row group must have exactly the same schema as the
// existing row groups in the table.
assert_eq!(&this.columns, &other.columns);
// update size, rows, column ranges, time range
this.size += other.size;
this.size += rg.size();
this.rows += other.rows as u64;
// The incoming row group must have exactly the same schema as the
@ -687,9 +704,9 @@ impl From<&Vec<Arc<RowGroup>>> for MetaData {
panic!("row groups required for meta data construction");
}
let mut meta = Self::new(row_groups[0].metadata());
let mut meta = Self::new(&row_groups[0]);
for row_group in row_groups.iter().skip(1) {
meta = Self::update_with(meta, row_group.metadata());
meta = Self::update_with(meta, &row_group);
}
meta
@ -909,8 +926,6 @@ impl std::fmt::Display for DisplayReadAggregateResults<'_> {
mod test {
use super::*;
use row_group::ColumnMeta;
use crate::column::Column;
use crate::row_group::{BinaryExpr, ColumnType, ReadAggregateResult};
use crate::schema;
@ -919,67 +934,47 @@ mod test {
#[test]
fn meta_data_update_with() {
let rg_meta = row_group::MetaData {
size: 100,
rows: 2000,
columns: vec![(
"region".to_owned(),
ColumnMeta {
typ: schema::ColumnType::Tag("region".to_owned()),
logical_data_type: schema::LogicalDataType::String,
range: (
OwnedValue::String("north".to_owned()),
OwnedValue::String("south".to_owned()),
),
},
)]
.into_iter()
.collect::<BTreeMap<_, _>>(),
time_range: (10, 3000),
};
let mut columns = BTreeMap::new();
columns.insert(
"time".to_string(),
ColumnType::create_time(&[100, 200, 300]),
);
columns.insert(
"region".to_string(),
ColumnType::create_tag(&["west", "west", "north"]),
);
let rg = RowGroup::new(3, columns);
let mut meta = MetaData::new(&rg_meta);
assert_eq!(meta.rows, 2000);
assert_eq!(meta.size, 100);
assert_eq!(meta.time_range, Some((10, 3000)));
let mut meta = MetaData::new(&rg);
assert_eq!(meta.rows, 3);
let meta_size = meta.size;
assert!(meta_size > 0);
assert_eq!(meta.time_range, Some((100, 300)));
assert_eq!(
meta.columns.get("region").unwrap().range,
(
OwnedValue::String("north".to_owned()),
OwnedValue::String("south".to_owned())
OwnedValue::String("west".to_owned())
)
);
meta = MetaData::update_with(
meta,
&row_group::MetaData {
size: 300,
rows: 1500,
columns: vec![(
"region".to_owned(),
ColumnMeta {
typ: schema::ColumnType::Tag("region".to_owned()),
logical_data_type: schema::LogicalDataType::String,
range: (
OwnedValue::String("east".to_owned()),
OwnedValue::String("north".to_owned()),
),
},
)]
.into_iter()
.collect::<BTreeMap<_, _>>(),
time_range: (10, 3500),
},
let mut columns = BTreeMap::new();
columns.insert("time".to_string(), ColumnType::create_time(&[10, 400]));
columns.insert(
"region".to_string(),
ColumnType::create_tag(&["east", "south"]),
);
let rg = RowGroup::new(2, columns);
assert_eq!(meta.rows, 3500);
assert_eq!(meta.size, 400);
assert_eq!(meta.time_range, Some((10, 3500)));
meta = MetaData::update_with(meta, &rg);
assert_eq!(meta.rows, 5);
assert!(meta.size > meta_size);
assert_eq!(meta.time_range, Some((10, 400)));
assert_eq!(
meta.columns.get("region").unwrap().range,
(
OwnedValue::String("east".to_owned()),
OwnedValue::String("south".to_owned())
OwnedValue::String("west".to_owned())
)
);
}

View File

@ -1,5 +1,5 @@
use std::sync::Arc;
use std::{collections::BTreeSet, convert::TryFrom};
use std::{mem::size_of, sync::Arc};
use arrow_deps::arrow;
@ -486,6 +486,18 @@ pub enum OwnedValue {
Scalar(Scalar),
}
impl OwnedValue {
    /// The size in bytes of this value, including any heap-allocated
    /// contents.
    pub fn size(&self) -> usize {
        // Only the string and byte-array variants own heap data; every
        // other variant is stored entirely inline in the enum.
        let heap_bytes = match self {
            Self::String(s) => s.len(),
            Self::ByteArray(b) => b.len(),
            _ => 0,
        };
        size_of::<Self>() + heap_bytes
    }
}
impl PartialEq<Value<'_>> for OwnedValue {
fn eq(&self, other: &Value<'_>) -> bool {
match (&self, other) {
@ -797,3 +809,23 @@ impl EncodedValues {
}
}
}
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn size() {
        // Variants with no heap data report just the enum's inline size.
        assert_eq!(OwnedValue::Null.size(), 32);
        assert_eq!(OwnedValue::Scalar(Scalar::I64(22)).size(), 32);
        // String and byte-array variants add their heap payload length.
        assert_eq!(OwnedValue::String("11morebytes".to_owned()).size(), 43); // 32 + 11
        assert_eq!(OwnedValue::ByteArray(vec![2, 44, 252]).size(), 35); // 32 + 3
    }
}