diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index 01a8571d55..9068e35107 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -56,7 +56,6 @@ pub struct Chunk { // Tie data and meta-data together so that they can be wrapped in RWLock. struct TableData { - size: u64, // size in bytes of the chunk rows: u64, // Total number of rows across all tables // Total number of row groups across all tables in the chunk. @@ -72,7 +71,6 @@ impl Chunk { Self { id, chunk_data: RwLock::new(TableData { - size: table.size(), rows: table.rows(), row_groups: table.row_groups(), data: vec![(table.name().to_owned(), table)].into_iter().collect(), @@ -85,9 +83,18 @@ impl Chunk { self.id } - /// The total size in bytes of all row groups in all tables in this chunk. + /// The total estimated size in bytes of this `Chunk` and all contained + /// data. pub fn size(&self) -> u64 { - self.chunk_data.read().unwrap().size + let base_size = std::mem::size_of::(); + + let table_data = self.chunk_data.read().unwrap(); + base_size as u64 + + table_data + .data + .iter() + .map(|(k, table)| k.len() as u64 + table.size()) + .sum::() } /// The total number of rows in all row groups in all tables in this chunk. @@ -127,7 +134,6 @@ impl Chunk { let mut chunk_data = self.chunk_data.write().unwrap(); // update the meta-data for this chunk with contents of row group. - chunk_data.size += row_group.size(); chunk_data.rows += row_group.rows() as u64; chunk_data.row_groups += 1; @@ -153,7 +159,6 @@ impl Chunk { // Remove table and update chunk meta-data if table exists. if let Some(table) = chunk_data.data.remove(name) { - chunk_data.size -= table.size(); chunk_data.rows -= table.rows(); chunk_data.row_groups -= table.row_groups(); } @@ -327,6 +332,8 @@ mod test { assert_eq!(chunk.rows(), 6); assert_eq!(chunk.row_groups(), 1); assert_eq!(chunk.tables(), 1); + let chunk_size = chunk.size(); + assert!(chunk_size > 0); // Add a row group to the same table in the Chunk. let columns = vec![("time", ColumnType::create_time(&[-2_i64, 2, 8]))] @@ -334,7 +341,9 @@ mod test { .map(|(k, v)| (k.to_owned(), v)) .collect(); let rg = RowGroup::new(3, columns); + let rg_size = rg.size(); chunk.upsert_table("table_1", rg); + assert_eq!(chunk.size(), chunk_size + rg_size); assert_eq!(chunk.rows(), 9); assert_eq!(chunk.row_groups(), 2); @@ -353,16 +362,19 @@ mod test { assert_eq!(chunk.tables(), 2); // Drop table_1 + let chunk_size = chunk.size(); chunk.drop_table("table_1"); assert_eq!(chunk.rows(), 2); assert_eq!(chunk.row_groups(), 1); assert_eq!(chunk.tables(), 1); + assert!(chunk.size() < chunk_size); // Drop table_2 - empty table chunk.drop_table("table_2"); assert_eq!(chunk.rows(), 0); assert_eq!(chunk.row_groups(), 0); assert_eq!(chunk.tables(), 0); + assert_eq!(chunk.size(), 64); // base size of `Chunk` // Drop table_2 - no-op chunk.drop_table("table_2"); diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index 8e13c63759..67c8a61a20 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -5,7 +5,7 @@ pub mod float; pub mod integer; pub mod string; -use std::collections::BTreeSet; +use std::{collections::BTreeSet, mem::size_of}; use croaring::Bitmap; use either::Either; @@ -48,14 +48,61 @@ impl Column { // // Meta information about the column // + + /// The estimated size in bytes of the column. + pub fn size(&self) -> u64 { + // Since `MetaData` is generic each value in the range can have a + // different size, so just do the calculations here where we know each + // `T`. + match &self { + Column::String(meta, data) => { + let mut meta_size = + (size_of::>() + size_of::()) as u64; + if let Some((min, max)) = &meta.range { + meta_size += (min.len() + max.len()) as u64; + }; + meta_size + data.size() + } + Column::Float(meta, data) => { + let meta_size = + (size_of::>() + size_of::()) as u64; + meta_size + data.size() + } + Column::Integer(meta, data) => { + let meta_size = + (size_of::>() + size_of::()) as u64; + meta_size + data.size() + } + Column::Unsigned(meta, data) => { + let meta_size = + (size_of::>() + size_of::()) as u64; + meta_size + data.size() + } + Column::Bool(meta, data) => { + let meta_size = + (size_of::>() + size_of::()) as u64; + meta_size + data.size() + } + Column::ByteArray(meta, data) => { + let mut meta_size = (size_of::, Vec)>>() + + size_of::()) as u64; + if let Some((min, max)) = &meta.range { + meta_size += (min.len() + max.len()) as u64; + }; + + meta_size + data.size() + } + } + } + pub fn num_rows(&self) -> u32 { match &self { - Column::String(meta, _) => meta.rows, - Column::Float(meta, _) => meta.rows, - Column::Integer(meta, _) => meta.rows, - Column::Unsigned(meta, _) => meta.rows, - Column::Bool(meta, _) => meta.rows, - Column::ByteArray(meta, _) => meta.rows, + Column::String(_, data) => data.num_rows(), + Column::Float(_, data) => data.num_rows(), + Column::Integer(_, data) => data.num_rows(), + Column::Unsigned(_, data) => data.num_rows(), + Column::Bool(_, data) => data.num_rows(), + Column::ByteArray(_, data) => data.num_rows(), } } @@ -71,10 +118,6 @@ impl Column { } } - pub fn size(&self) -> u64 { - 0 - } - /// Returns the (min, max) values stored in this column pub fn column_range(&self) -> Option<(OwnedValue, OwnedValue)> { match &self { @@ -652,12 +695,6 @@ pub struct MetaData where T: PartialOrd + std::fmt::Debug, { - // The total size of the column in bytes. - size: u64, - - // The total number of rows in the column. - rows: u32, - // The minimum and maximum value for this column. range: Option<(T, T)>, @@ -735,9 +772,7 @@ impl From for Column { fn from(arr: arrow::array::StringArray) -> Self { let data = StringEncoding::from(arr); let meta = MetaData { - rows: data.num_rows(), range: data.column_range(), - size: data.size(), properties: ColumnProperties { has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(), }, @@ -751,9 +786,7 @@ impl From<&[Option<&str>]> for Column { fn from(arr: &[Option<&str>]) -> Self { let data = StringEncoding::from(arr); let meta = MetaData { - rows: data.num_rows(), range: data.column_range(), - size: data.size(), properties: ColumnProperties { has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(), }, @@ -778,9 +811,7 @@ impl From<&[&str]> for Column { fn from(arr: &[&str]) -> Self { let data = StringEncoding::from(arr); let meta = MetaData { - rows: data.num_rows(), range: data.column_range(), - size: data.size(), properties: ColumnProperties { has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(), }, @@ -804,8 +835,6 @@ impl From<&[u64]> for Column { let data = IntegerEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range: Some((min, max)), properties: ColumnProperties::default(), }; @@ -857,8 +886,6 @@ impl From for Column { let data = IntegerEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range, properties: ColumnProperties::default(), }; @@ -881,8 +908,6 @@ impl From<&[i64]> for Column { let data = IntegerEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range: Some((min, max)), properties: ColumnProperties::default(), }; @@ -934,8 +959,6 @@ impl From for Column { let data = IntegerEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range, properties: ColumnProperties::default(), }; @@ -957,8 +980,6 @@ impl From<&[f64]> for Column { let data = FloatEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range: Some((min, max)), properties: ColumnProperties::default(), }; @@ -1010,8 +1031,6 @@ impl From for Column { let data = fixed_null::FixedNull::::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range, ..MetaData::default() }; @@ -1059,8 +1078,6 @@ impl From for Column { let data = BooleanEncoding::from(arr); let meta = MetaData { - size: data.size(), - rows: data.num_rows(), range, ..MetaData::default() }; @@ -1132,6 +1149,16 @@ impl RowIDs { panic!("cannot unwrap RowIDs to Vector"); } + /// An estimation of the size in bytes needed to store `self`. + pub fn size(&self) -> usize { + match self { + RowIDs::Bitmap(bm) => std::mem::size_of::() + bm.get_serialized_size_in_bytes(), + RowIDs::Vector(v) => { + std::mem::size_of::>() + (std::mem::size_of::() * v.len()) + } + } + } + // Converts the RowIDs to a Vec. This is expensive and should only be // used for testing. pub fn to_vec(&self) -> Vec { @@ -1243,8 +1270,6 @@ mod test { assert_eq!( meta, super::MetaData:: { - size: 317, - rows: 4, range: Some(("hello".to_string(), "world".to_string())), properties: ColumnProperties { has_pre_computed_row_ids: true @@ -1271,8 +1296,6 @@ mod test { assert_eq!( meta, super::MetaData:: { - size: 301, - rows: 2, range: Some(("hello".to_string(), "world".to_string())), properties: ColumnProperties { has_pre_computed_row_ids: true @@ -1336,8 +1359,6 @@ mod test { let input = &[0, -12, u16::MAX as i64, 5]; let col = Column::from(&input[..]); if let Column::Integer(meta, IntegerEncoding::I64I32(_)) = col { - assert_eq!(meta.size, 40); // 4 i32s (16b) and a vec (24b) - assert_eq!(meta.rows, 4); assert_eq!(meta.range, Some((-12, u16::MAX as i64))); } else { panic!("invalid variant"); @@ -1368,8 +1389,6 @@ mod test { let input = &[13, 12, u16::MAX as u64, 5]; let col = Column::from(&input[..]); if let Column::Unsigned(meta, IntegerEncoding::U64U16(_)) = col { - assert_eq!(meta.size, 32); // 4 u16s (8b) and a vec (24b) - assert_eq!(meta.rows, 4); assert_eq!(meta.range, Some((5, u16::MAX as u64))); } else { panic!("invalid variant"); diff --git a/read_buffer/src/column/encoding/bool.rs b/read_buffer/src/column/encoding/bool.rs index 704e535d09..515f59769d 100644 --- a/read_buffer/src/column/encoding/bool.rs +++ b/read_buffer/src/column/encoding/bool.rs @@ -36,11 +36,10 @@ impl Bool { self.arr.null_count() > 0 } - /// Returns the total size in bytes of the encoded data. Note, this method - /// is really an "accurate" estimation. It doesn't include for example the - /// size of the `Plain` struct receiver. + /// Returns an estimation of the total size in bytes used by this column + /// encoding. pub fn size(&self) -> u64 { - 0 + (std::mem::size_of::() + self.arr.get_array_memory_size()) as u64 } // @@ -342,6 +341,12 @@ mod test { v.iter().map(|x| Some(*x)).collect() } + #[test] + fn size() { + let v = Bool::from(vec![None, None, Some(true), Some(false)].as_slice()); + assert_eq!(v.size(), 240); + } + #[test] fn first_row_id_eq_value() { let v = Bool::from(vec![true, true].as_slice()); diff --git a/read_buffer/src/column/encoding/dictionary/rle.rs b/read_buffer/src/column/encoding/dictionary/rle.rs index 82eb360e93..8358737e5b 100644 --- a/read_buffer/src/column/encoding/dictionary/rle.rs +++ b/read_buffer/src/column/encoding/dictionary/rle.rs @@ -86,11 +86,19 @@ impl RLE { let index_entry_size = size_of::>() // container size + (size_of::() * self.index_entries.len()) // elements size - + decoded_keys_size; // heal allocated strings size + + decoded_keys_size; // heap allocated strings size + + // The total size (an upper bound estimate) of all the bitmaps + // in the column. + let row_ids_bitmaps_size = self + .index_row_ids + .values() + .map(|row_ids| row_ids.size()) + .sum::(); - // TODO(edd): The Bitmaps on-heap size!! let index_row_ids_size = size_of::>() - + ((size_of::() + size_of::()) * self.index_row_ids.len()); + + (size_of::() * self.index_row_ids.len()) + + row_ids_bitmaps_size; let run_lengths_size = size_of::>() + // container size (size_of::<(u32, u32)>() * self.run_lengths.len()); // each run-length size @@ -994,16 +1002,24 @@ mod test { enc.push_none(); enc.push_none(); - // keys - 14 bytes. - // entry_index is 24 + ((24+4) * 3) + 14 == 122 - // index_entry is 24 + (24*4) + 14 == 134 - // index_row_ids is 24 + (4 + 0?? * 4) == 40 ?????? - // run lengths is 24 + (8*5) == 64 - // 360 + // `entry_index` is 24 + ((24+4) * 3) + 14 == 122 + // + // Note: there are 4 index entries to account for NULL entry. + // `index_entry` is 24 + (24*4) + 14 == 134 + // + // bitmaps for east, north, south and NULL entries. + // `index_row_ids` is 24 + (4 * 4) + (108b for bitmaps) == 148 + // + // `run lengths` is 24 + (8*5) == 64 + // + // `contains_null` - 1 byte + // `num_rows` - 4 bytes + // + // 473 // TODO(edd): there some mystery bytes in the bitmap implementation. // need to figure out how to measure these - assert_eq!(enc.size(), 397); + assert_eq!(enc.size(), 473); } #[test] diff --git a/read_buffer/src/column/encoding/fixed_null.rs b/read_buffer/src/column/encoding/fixed_null.rs index e1ce7dda29..e6848a0633 100644 --- a/read_buffer/src/column/encoding/fixed_null.rs +++ b/read_buffer/src/column/encoding/fixed_null.rs @@ -66,11 +66,10 @@ where self.arr.null_count() > 0 } - /// Returns the total size in bytes of the encoded data. Note, this method - /// is really an "accurate" estimation. It doesn't include for example the - /// size of the `Plain` struct receiver. + /// Returns an estimation of the total size in bytes used by this column + /// encoding. pub fn size(&self) -> u64 { - 0 + (std::mem::size_of::>() + self.arr.get_array_memory_size()) as u64 } // @@ -598,6 +597,12 @@ mod test { v.iter().map(|x| Some(*x)).collect() } + #[test] + fn size() { + let v = FixedNull::::from(vec![None, None, Some(100), Some(2222)].as_slice()); + assert_eq!(v.size(), 240); + } + #[test] fn first_row_id_eq_value() { let v = super::FixedNull::::from(vec![22, 33, 18].as_slice()); diff --git a/read_buffer/src/lib.rs b/read_buffer/src/lib.rs index af90ac445d..24bcf80218 100644 --- a/read_buffer/src/lib.rs +++ b/read_buffer/src/lib.rs @@ -79,9 +79,6 @@ struct PartitionData { // identified by a partition key partitions: BTreeMap, - // The current total size of the database. - size: u64, - // Total number of rows in the database. rows: u64, } @@ -108,7 +105,6 @@ impl Database { // Take lock on partitions and update. let mut partition_data = self.data.write().unwrap(); - partition_data.size += row_group.size(); partition_data.rows += row_group.rows() as u64; // create a new chunk if one doesn't exist, or add the table data to @@ -141,7 +137,6 @@ impl Database { .remove(partition_key) .context(PartitionNotFound { key: partition_key })?; - partition_data.size -= partition.size(); partition_data.rows -= partition.rows(); Ok(()) } @@ -156,7 +151,6 @@ impl Database { .context(PartitionNotFound { key: partition_key })?; partition.drop_chunk(chunk_id).map(|chunk| { - partition_data.size -= chunk.size(); partition_data.rows -= chunk.rows(); // don't return chunk from `drop_chunk` }) @@ -185,8 +179,17 @@ impl Database { .unwrap_or_default() } + /// Returns the total estimated size in bytes of the database. pub fn size(&self) -> u64 { - self.data.read().unwrap().size + let base_size = std::mem::size_of::(); + + let partition_data = self.data.read().unwrap(); + base_size as u64 + + partition_data + .partitions + .iter() + .map(|(name, partition)| name.len() as u64 + partition.size()) + .sum::() } pub fn rows(&self) -> u64 { @@ -502,7 +505,7 @@ impl fmt::Debug for Database { let partition_data = self.data.read().unwrap(); f.debug_struct("Database") .field("partitions", &partition_data.partitions.keys()) - .field("size", &partition_data.size) + .field("size", &self.size()) .finish() } } @@ -513,9 +516,6 @@ struct ChunkData { // identified by a chunk id. chunks: BTreeMap, - // The current total size of the partition. - size: u64, - // The current number of row groups in this partition. row_groups: usize, @@ -537,7 +537,6 @@ impl Partition { Self { key: partition_key.to_owned(), data: RwLock::new(ChunkData { - size: chunk.size(), row_groups: chunk.row_groups(), rows: chunk.rows(), chunks: vec![(chunk.id(), chunk)].into_iter().collect(), @@ -555,7 +554,6 @@ impl Partition { fn upsert_chunk(&mut self, chunk_id: u32, table_name: String, row_group: RowGroup) { let mut chunk_data = self.data.write().unwrap(); - chunk_data.size += row_group.size(); chunk_data.row_groups += 1; chunk_data.rows += row_group.rows() as u64; @@ -581,7 +579,6 @@ impl Partition { .remove(&chunk_id) .context(ChunkNotFound { id: chunk_id })?; - chunk_data.size -= chunk.size(); chunk_data.rows -= chunk.rows(); chunk_data.row_groups -= chunk.row_groups(); Ok(chunk) @@ -624,8 +621,18 @@ impl Partition { self.data.read().unwrap().rows } + /// The total estimated size in bytes of the `Partition` and all contained + /// data. pub fn size(&self) -> u64 { - self.data.read().unwrap().size + let base_size = std::mem::size_of::() + self.key.len(); + + let chunk_data = self.data.read().unwrap(); + base_size as u64 + + chunk_data + .chunks + .values() + .map(|chunk| std::mem::size_of::() as u64 + chunk.size()) + .sum::() } } diff --git a/read_buffer/src/row_group.rs b/read_buffer/src/row_group.rs index 1e498b4382..a454261ec6 100644 --- a/read_buffer/src/row_group.rs +++ b/read_buffer/src/row_group.rs @@ -69,13 +69,13 @@ impl RowGroup { let mut time_column = None; for (name, ct) in columns { - meta.size += ct.size(); match ct { ColumnType::Tag(c) => { assert_eq!(c.num_rows(), rows); meta.add_column( &name, + c.size(), schema::ColumnType::Tag(name.clone()), c.logical_datatype(), c.column_range().unwrap(), @@ -89,6 +89,7 @@ impl RowGroup { meta.add_column( &name, + c.size(), schema::ColumnType::Field(name.clone()), c.logical_datatype(), c.column_range().unwrap(), @@ -110,6 +111,7 @@ impl RowGroup { meta.add_column( &name, + c.size(), schema::ColumnType::Timestamp(name.clone()), c.logical_datatype(), c.column_range().unwrap(), @@ -133,9 +135,15 @@ impl RowGroup { } } - /// The total size in bytes of the read group + /// The total estimated size in bytes of the read group pub fn size(&self) -> u64 { - self.meta.size + let base_size = std::mem::size_of::() + + self + .all_columns_by_name + .iter() + .map(|(key, value)| key.len() + std::mem::size_of::()) + .sum::(); + base_size as u64 + self.meta.size() } /// The number of rows in the `RowGroup` (all columns have the same number @@ -1377,6 +1385,12 @@ pub struct ColumnMeta { pub range: (OwnedValue, OwnedValue), } +impl ColumnMeta { + pub fn size(&self) -> usize { + std::mem::size_of::() + self.range.0.size() + self.range.1.size() + } +} + // column metadata is equivalent for two columns if their logical type and // semantic type are equivalent. impl PartialEq for ColumnMeta { @@ -1387,10 +1401,10 @@ impl PartialEq for ColumnMeta { #[derive(Default, Debug)] pub struct MetaData { - // The total size of the table in bytes. - pub size: u64, + // The total size in bytes of all column data in the `RowGroup`. + pub columns_size: u64, - // The total number of rows in the table. + // The total number of rows in the `RowGroup`. pub rows: u32, // The distinct set of columns for this `RowGroup` (all of these columns @@ -1401,8 +1415,7 @@ pub struct MetaData { // possibly match based on the range of values a column has. pub columns: BTreeMap, - // The total time range of this table spanning all of the `RowGroup`s within - // the table. + // The total time range of this `RowGroup`. // // This can be used to skip the table entirely if the time range for a query // falls outside of this range. @@ -1410,6 +1423,21 @@ pub struct MetaData { } impl MetaData { + /// Returns the estimated size in bytes of the meta data and all column data + /// associated with a `RowGroup`. + pub fn size(&self) -> u64 { + let base_size = std::mem::size_of::(); + + (base_size + // account for contents of meta data + + self + .columns + .iter() + .map(|(k, v)| k.len() + v.size()) + .sum::()) as u64 + + self.columns_size + } + // helper function to determine if the provided binary expression could be // satisfied in the `RowGroup`, If this function returns `false` then there // no rows in the `RowGroup` would ever match the expression. @@ -1451,6 +1479,7 @@ impl MetaData { pub fn add_column( &mut self, name: &str, + column_size: u64, col_type: schema::ColumnType, logical_data_type: LogicalDataType, range: (OwnedValue, OwnedValue), @@ -1463,6 +1492,7 @@ impl MetaData { range, }, ); + self.columns_size += column_size; } // Extract schema information for a set of columns. @@ -1932,6 +1962,33 @@ mod test { Predicate::new(vec![expr]) } + #[test] + fn size() { + let mut columns = BTreeMap::new(); + let rc = ColumnType::Tag(Column::from(&[Some("west"), Some("west"), None, None][..])); + columns.insert("region".to_string(), rc); + let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..])); + columns.insert("time".to_string(), tc); + + let row_group = RowGroup::new(4, columns); + + let rg_size = row_group.size(); + assert!(rg_size > 0); + + let mut columns = BTreeMap::new(); + + let track = ColumnType::Tag(Column::from( + &[Some("Thinking"), Some("of"), Some("a"), Some("place")][..], + )); + columns.insert("track".to_string(), track); + let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..])); + columns.insert("time".to_string(), tc); + + let row_group = RowGroup::new(4, columns); + + assert!(row_group.size() > rg_size); + } + #[test] fn row_ids_from_predicates() { let mut columns = BTreeMap::new(); diff --git a/read_buffer/src/table.rs b/read_buffer/src/table.rs index 8399a9657d..a1c95982bd 100644 --- a/read_buffer/src/table.rs +++ b/read_buffer/src/table.rs @@ -72,7 +72,7 @@ impl Table { Self { name: name.into(), table_data: RwLock::new(RowGroupData { - meta: Arc::new(MetaData::new(rg.metadata())), + meta: Arc::new(MetaData::new(&rg)), data: vec![Arc::new(rg)], }), } @@ -85,7 +85,7 @@ impl Table { // `meta` can't be modified whilst protected by an Arc so create a new one. row_groups.meta = Arc::new(MetaData::update_with( MetaData::clone(&row_groups.meta), // clone meta-data not Arc - rg.metadata(), + &rg, )); // Add the new row group data to the table. @@ -126,7 +126,9 @@ impl Table { /// The total size of the table in bytes. pub fn size(&self) -> u64 { - self.table_data.read().unwrap().meta.size + let base_size = std::mem::size_of::() + self.name.len(); + // meta.size accounts for all the row group data. + base_size as u64 + self.table_data.read().unwrap().meta.size() } // Returns the total number of row groups in this table. @@ -570,24 +572,39 @@ struct MetaData { } impl MetaData { - pub fn new(meta: &row_group::MetaData) -> Self { + pub fn new(rg: &row_group::RowGroup) -> Self { Self { - size: meta.size, - rows: meta.rows as u64, - columns: meta.columns.clone(), - column_names: meta.columns.keys().cloned().collect(), - time_range: Some(meta.time_range), + size: rg.size(), + rows: rg.rows() as u64, + columns: rg.metadata().columns.clone(), + column_names: rg.metadata().columns.keys().cloned().collect(), + time_range: Some(rg.metadata().time_range), } } + /// Returns the estimated size in bytes of the `MetaData` struct and all of + /// the row group data associated with a `Table`. + pub fn size(&self) -> u64 { + let base_size = std::mem::size_of::(); + let columns_meta_size = self + .columns + .iter() + .map(|(k, v)| k.len() + v.size()) + .sum::(); + + let column_names_size = self.column_names.iter().map(|c| c.len()).sum::(); + (base_size + columns_meta_size + column_names_size) as u64 + self.size + } + /// Create a new `MetaData` by consuming `this` and incorporating `other`. - pub fn update_with(mut this: Self, other: &row_group::MetaData) -> Self { + pub fn update_with(mut this: Self, rg: &row_group::RowGroup) -> Self { + let other = rg.metadata(); // The incoming row group must have exactly the same schema as the // existing row groups in the table. assert_eq!(&this.columns, &other.columns); // update size, rows, column ranges, time range - this.size += other.size; + this.size += rg.size(); this.rows += other.rows as u64; // The incoming row group must have exactly the same schema as the @@ -687,9 +704,9 @@ impl From<&Vec>> for MetaData { panic!("row groups required for meta data construction"); } - let mut meta = Self::new(row_groups[0].metadata()); + let mut meta = Self::new(&row_groups[0]); for row_group in row_groups.iter().skip(1) { - meta = Self::update_with(meta, row_group.metadata()); + meta = Self::update_with(meta, &row_group); } meta @@ -909,8 +926,6 @@ impl std::fmt::Display for DisplayReadAggregateResults<'_> { mod test { use super::*; - use row_group::ColumnMeta; - use crate::column::Column; use crate::row_group::{BinaryExpr, ColumnType, ReadAggregateResult}; use crate::schema; @@ -919,67 +934,47 @@ mod test { #[test] fn meta_data_update_with() { - let rg_meta = row_group::MetaData { - size: 100, - rows: 2000, - columns: vec![( - "region".to_owned(), - ColumnMeta { - typ: schema::ColumnType::Tag("region".to_owned()), - logical_data_type: schema::LogicalDataType::String, - range: ( - OwnedValue::String("north".to_owned()), - OwnedValue::String("south".to_owned()), - ), - }, - )] - .into_iter() - .collect::>(), - time_range: (10, 3000), - }; + let mut columns = BTreeMap::new(); + columns.insert( + "time".to_string(), + ColumnType::create_time(&[100, 200, 300]), + ); + columns.insert( + "region".to_string(), + ColumnType::create_tag(&["west", "west", "north"]), + ); + let rg = RowGroup::new(3, columns); - let mut meta = MetaData::new(&rg_meta); - assert_eq!(meta.rows, 2000); - assert_eq!(meta.size, 100); - assert_eq!(meta.time_range, Some((10, 3000))); + let mut meta = MetaData::new(&rg); + assert_eq!(meta.rows, 3); + let meta_size = meta.size; + assert!(meta_size > 0); + assert_eq!(meta.time_range, Some((100, 300))); assert_eq!( meta.columns.get("region").unwrap().range, ( OwnedValue::String("north".to_owned()), - OwnedValue::String("south".to_owned()) + OwnedValue::String("west".to_owned()) ) ); - meta = MetaData::update_with( - meta, - &row_group::MetaData { - size: 300, - rows: 1500, - columns: vec![( - "region".to_owned(), - ColumnMeta { - typ: schema::ColumnType::Tag("region".to_owned()), - logical_data_type: schema::LogicalDataType::String, - range: ( - OwnedValue::String("east".to_owned()), - OwnedValue::String("north".to_owned()), - ), - }, - )] - .into_iter() - .collect::>(), - time_range: (10, 3500), - }, + let mut columns = BTreeMap::new(); + columns.insert("time".to_string(), ColumnType::create_time(&[10, 400])); + columns.insert( + "region".to_string(), + ColumnType::create_tag(&["east", "south"]), ); + let rg = RowGroup::new(2, columns); - assert_eq!(meta.rows, 3500); - assert_eq!(meta.size, 400); - assert_eq!(meta.time_range, Some((10, 3500))); + meta = MetaData::update_with(meta, &rg); + assert_eq!(meta.rows, 5); + assert!(meta.size > meta_size); + assert_eq!(meta.time_range, Some((10, 400))); assert_eq!( meta.columns.get("region").unwrap().range, ( OwnedValue::String("east".to_owned()), - OwnedValue::String("south".to_owned()) + OwnedValue::String("west".to_owned()) ) ); } diff --git a/read_buffer/src/value.rs b/read_buffer/src/value.rs index 00e93036a2..8526a9412c 100644 --- a/read_buffer/src/value.rs +++ b/read_buffer/src/value.rs @@ -1,5 +1,5 @@ -use std::sync::Arc; use std::{collections::BTreeSet, convert::TryFrom}; +use std::{mem::size_of, sync::Arc}; use arrow_deps::arrow; @@ -486,6 +486,18 @@ pub enum OwnedValue { Scalar(Scalar), } +impl OwnedValue { + /// The size in bytes of this value. + pub fn size(&self) -> usize { + let self_size = size_of::(); + match self { + Self::String(s) => s.len() + self_size, + Self::ByteArray(arr) => arr.len() + self_size, + _ => self_size, + } + } +} + impl PartialEq> for OwnedValue { fn eq(&self, other: &Value<'_>) -> bool { match (&self, other) { @@ -797,3 +809,23 @@ impl EncodedValues { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn size() { + let v1 = OwnedValue::Null; + assert_eq!(v1.size(), 32); + + let v1 = OwnedValue::Scalar(Scalar::I64(22)); + assert_eq!(v1.size(), 32); + + let v1 = OwnedValue::String("11morebytes".to_owned()); + assert_eq!(v1.size(), 43); + + let v1 = OwnedValue::ByteArray(vec![2, 44, 252]); + assert_eq!(v1.size(), 35); + } +}