Merge pull request #816 from influxdata/er/feat/read_buffer/size
feat: implement `size` on Read Bufferpull/24376/head
commit
7de5d35bab
|
@ -56,7 +56,6 @@ pub struct Chunk {
|
|||
|
||||
// Tie data and meta-data together so that they can be wrapped in RWLock.
|
||||
struct TableData {
|
||||
size: u64, // size in bytes of the chunk
|
||||
rows: u64, // Total number of rows across all tables
|
||||
|
||||
// Total number of row groups across all tables in the chunk.
|
||||
|
@ -72,7 +71,6 @@ impl Chunk {
|
|||
Self {
|
||||
id,
|
||||
chunk_data: RwLock::new(TableData {
|
||||
size: table.size(),
|
||||
rows: table.rows(),
|
||||
row_groups: table.row_groups(),
|
||||
data: vec![(table.name().to_owned(), table)].into_iter().collect(),
|
||||
|
@ -85,9 +83,18 @@ impl Chunk {
|
|||
self.id
|
||||
}
|
||||
|
||||
/// The total size in bytes of all row groups in all tables in this chunk.
|
||||
/// The total estimated size in bytes of this `Chunk` and all contained
|
||||
/// data.
|
||||
pub fn size(&self) -> u64 {
|
||||
self.chunk_data.read().unwrap().size
|
||||
let base_size = std::mem::size_of::<Self>();
|
||||
|
||||
let table_data = self.chunk_data.read().unwrap();
|
||||
base_size as u64
|
||||
+ table_data
|
||||
.data
|
||||
.iter()
|
||||
.map(|(k, table)| k.len() as u64 + table.size())
|
||||
.sum::<u64>()
|
||||
}
|
||||
|
||||
/// The total number of rows in all row groups in all tables in this chunk.
|
||||
|
@ -127,7 +134,6 @@ impl Chunk {
|
|||
let mut chunk_data = self.chunk_data.write().unwrap();
|
||||
|
||||
// update the meta-data for this chunk with contents of row group.
|
||||
chunk_data.size += row_group.size();
|
||||
chunk_data.rows += row_group.rows() as u64;
|
||||
chunk_data.row_groups += 1;
|
||||
|
||||
|
@ -153,7 +159,6 @@ impl Chunk {
|
|||
|
||||
// Remove table and update chunk meta-data if table exists.
|
||||
if let Some(table) = chunk_data.data.remove(name) {
|
||||
chunk_data.size -= table.size();
|
||||
chunk_data.rows -= table.rows();
|
||||
chunk_data.row_groups -= table.row_groups();
|
||||
}
|
||||
|
@ -327,6 +332,8 @@ mod test {
|
|||
assert_eq!(chunk.rows(), 6);
|
||||
assert_eq!(chunk.row_groups(), 1);
|
||||
assert_eq!(chunk.tables(), 1);
|
||||
let chunk_size = chunk.size();
|
||||
assert!(chunk_size > 0);
|
||||
|
||||
// Add a row group to the same table in the Chunk.
|
||||
let columns = vec![("time", ColumnType::create_time(&[-2_i64, 2, 8]))]
|
||||
|
@ -334,7 +341,9 @@ mod test {
|
|||
.map(|(k, v)| (k.to_owned(), v))
|
||||
.collect();
|
||||
let rg = RowGroup::new(3, columns);
|
||||
let rg_size = rg.size();
|
||||
chunk.upsert_table("table_1", rg);
|
||||
assert_eq!(chunk.size(), chunk_size + rg_size);
|
||||
|
||||
assert_eq!(chunk.rows(), 9);
|
||||
assert_eq!(chunk.row_groups(), 2);
|
||||
|
@ -353,16 +362,19 @@ mod test {
|
|||
assert_eq!(chunk.tables(), 2);
|
||||
|
||||
// Drop table_1
|
||||
let chunk_size = chunk.size();
|
||||
chunk.drop_table("table_1");
|
||||
assert_eq!(chunk.rows(), 2);
|
||||
assert_eq!(chunk.row_groups(), 1);
|
||||
assert_eq!(chunk.tables(), 1);
|
||||
assert!(chunk.size() < chunk_size);
|
||||
|
||||
// Drop table_2 - empty table
|
||||
chunk.drop_table("table_2");
|
||||
assert_eq!(chunk.rows(), 0);
|
||||
assert_eq!(chunk.row_groups(), 0);
|
||||
assert_eq!(chunk.tables(), 0);
|
||||
assert_eq!(chunk.size(), 64); // base size of `Chunk`
|
||||
|
||||
// Drop table_2 - no-op
|
||||
chunk.drop_table("table_2");
|
||||
|
|
|
@ -5,7 +5,7 @@ pub mod float;
|
|||
pub mod integer;
|
||||
pub mod string;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::{collections::BTreeSet, mem::size_of};
|
||||
|
||||
use croaring::Bitmap;
|
||||
use either::Either;
|
||||
|
@ -48,14 +48,61 @@ impl Column {
|
|||
//
|
||||
// Meta information about the column
|
||||
//
|
||||
|
||||
/// The estimated size in bytes of the column.
|
||||
pub fn size(&self) -> u64 {
|
||||
// Since `MetaData` is generic each value in the range can have a
|
||||
// different size, so just do the calculations here where we know each
|
||||
// `T`.
|
||||
match &self {
|
||||
Column::String(meta, data) => {
|
||||
let mut meta_size =
|
||||
(size_of::<Option<(String, String)>>() + size_of::<ColumnProperties>()) as u64;
|
||||
if let Some((min, max)) = &meta.range {
|
||||
meta_size += (min.len() + max.len()) as u64;
|
||||
};
|
||||
meta_size + data.size()
|
||||
}
|
||||
Column::Float(meta, data) => {
|
||||
let meta_size =
|
||||
(size_of::<Option<(f64, f64)>>() + size_of::<ColumnProperties>()) as u64;
|
||||
meta_size + data.size()
|
||||
}
|
||||
Column::Integer(meta, data) => {
|
||||
let meta_size =
|
||||
(size_of::<Option<(i64, i64)>>() + size_of::<ColumnProperties>()) as u64;
|
||||
meta_size + data.size()
|
||||
}
|
||||
Column::Unsigned(meta, data) => {
|
||||
let meta_size =
|
||||
(size_of::<Option<(u64, u64)>>() + size_of::<ColumnProperties>()) as u64;
|
||||
meta_size + data.size()
|
||||
}
|
||||
Column::Bool(meta, data) => {
|
||||
let meta_size =
|
||||
(size_of::<Option<(bool, bool)>>() + size_of::<ColumnProperties>()) as u64;
|
||||
meta_size + data.size()
|
||||
}
|
||||
Column::ByteArray(meta, data) => {
|
||||
let mut meta_size = (size_of::<Option<(Vec<u8>, Vec<u8>)>>()
|
||||
+ size_of::<ColumnProperties>()) as u64;
|
||||
if let Some((min, max)) = &meta.range {
|
||||
meta_size += (min.len() + max.len()) as u64;
|
||||
};
|
||||
|
||||
meta_size + data.size()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn num_rows(&self) -> u32 {
|
||||
match &self {
|
||||
Column::String(meta, _) => meta.rows,
|
||||
Column::Float(meta, _) => meta.rows,
|
||||
Column::Integer(meta, _) => meta.rows,
|
||||
Column::Unsigned(meta, _) => meta.rows,
|
||||
Column::Bool(meta, _) => meta.rows,
|
||||
Column::ByteArray(meta, _) => meta.rows,
|
||||
Column::String(_, data) => data.num_rows(),
|
||||
Column::Float(_, data) => data.num_rows(),
|
||||
Column::Integer(_, data) => data.num_rows(),
|
||||
Column::Unsigned(_, data) => data.num_rows(),
|
||||
Column::Bool(_, data) => data.num_rows(),
|
||||
Column::ByteArray(_, data) => data.num_rows(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -71,10 +118,6 @@ impl Column {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
0
|
||||
}
|
||||
|
||||
/// Returns the (min, max) values stored in this column
|
||||
pub fn column_range(&self) -> Option<(OwnedValue, OwnedValue)> {
|
||||
match &self {
|
||||
|
@ -652,12 +695,6 @@ pub struct MetaData<T>
|
|||
where
|
||||
T: PartialOrd + std::fmt::Debug,
|
||||
{
|
||||
// The total size of the column in bytes.
|
||||
size: u64,
|
||||
|
||||
// The total number of rows in the column.
|
||||
rows: u32,
|
||||
|
||||
// The minimum and maximum value for this column.
|
||||
range: Option<(T, T)>,
|
||||
|
||||
|
@ -735,9 +772,7 @@ impl From<arrow::array::StringArray> for Column {
|
|||
fn from(arr: arrow::array::StringArray) -> Self {
|
||||
let data = StringEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
rows: data.num_rows(),
|
||||
range: data.column_range(),
|
||||
size: data.size(),
|
||||
properties: ColumnProperties {
|
||||
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
|
||||
},
|
||||
|
@ -751,9 +786,7 @@ impl From<&[Option<&str>]> for Column {
|
|||
fn from(arr: &[Option<&str>]) -> Self {
|
||||
let data = StringEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
rows: data.num_rows(),
|
||||
range: data.column_range(),
|
||||
size: data.size(),
|
||||
properties: ColumnProperties {
|
||||
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
|
||||
},
|
||||
|
@ -778,9 +811,7 @@ impl From<&[&str]> for Column {
|
|||
fn from(arr: &[&str]) -> Self {
|
||||
let data = StringEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
rows: data.num_rows(),
|
||||
range: data.column_range(),
|
||||
size: data.size(),
|
||||
properties: ColumnProperties {
|
||||
has_pre_computed_row_ids: data.has_pre_computed_row_id_sets(),
|
||||
},
|
||||
|
@ -804,8 +835,6 @@ impl From<&[u64]> for Column {
|
|||
|
||||
let data = IntegerEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range: Some((min, max)),
|
||||
properties: ColumnProperties::default(),
|
||||
};
|
||||
|
@ -857,8 +886,6 @@ impl From<arrow::array::UInt64Array> for Column {
|
|||
|
||||
let data = IntegerEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range,
|
||||
properties: ColumnProperties::default(),
|
||||
};
|
||||
|
@ -881,8 +908,6 @@ impl From<&[i64]> for Column {
|
|||
|
||||
let data = IntegerEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range: Some((min, max)),
|
||||
properties: ColumnProperties::default(),
|
||||
};
|
||||
|
@ -934,8 +959,6 @@ impl From<arrow::array::Int64Array> for Column {
|
|||
|
||||
let data = IntegerEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range,
|
||||
properties: ColumnProperties::default(),
|
||||
};
|
||||
|
@ -957,8 +980,6 @@ impl From<&[f64]> for Column {
|
|||
|
||||
let data = FloatEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range: Some((min, max)),
|
||||
properties: ColumnProperties::default(),
|
||||
};
|
||||
|
@ -1010,8 +1031,6 @@ impl From<arrow::array::Float64Array> for Column {
|
|||
|
||||
let data = fixed_null::FixedNull::<arrow::datatypes::Float64Type>::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range,
|
||||
..MetaData::default()
|
||||
};
|
||||
|
@ -1059,8 +1078,6 @@ impl From<arrow::array::BooleanArray> for Column {
|
|||
|
||||
let data = BooleanEncoding::from(arr);
|
||||
let meta = MetaData {
|
||||
size: data.size(),
|
||||
rows: data.num_rows(),
|
||||
range,
|
||||
..MetaData::default()
|
||||
};
|
||||
|
@ -1132,6 +1149,16 @@ impl RowIDs {
|
|||
panic!("cannot unwrap RowIDs to Vector");
|
||||
}
|
||||
|
||||
/// An estimation of the size in bytes needed to store `self`.
|
||||
pub fn size(&self) -> usize {
|
||||
match self {
|
||||
RowIDs::Bitmap(bm) => std::mem::size_of::<Bitmap>() + bm.get_serialized_size_in_bytes(),
|
||||
RowIDs::Vector(v) => {
|
||||
std::mem::size_of::<Vec<u32>>() + (std::mem::size_of::<u32>() * v.len())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Converts the RowIDs to a Vec<u32>. This is expensive and should only be
|
||||
// used for testing.
|
||||
pub fn to_vec(&self) -> Vec<u32> {
|
||||
|
@ -1243,8 +1270,6 @@ mod test {
|
|||
assert_eq!(
|
||||
meta,
|
||||
super::MetaData::<String> {
|
||||
size: 317,
|
||||
rows: 4,
|
||||
range: Some(("hello".to_string(), "world".to_string())),
|
||||
properties: ColumnProperties {
|
||||
has_pre_computed_row_ids: true
|
||||
|
@ -1271,8 +1296,6 @@ mod test {
|
|||
assert_eq!(
|
||||
meta,
|
||||
super::MetaData::<String> {
|
||||
size: 301,
|
||||
rows: 2,
|
||||
range: Some(("hello".to_string(), "world".to_string())),
|
||||
properties: ColumnProperties {
|
||||
has_pre_computed_row_ids: true
|
||||
|
@ -1336,8 +1359,6 @@ mod test {
|
|||
let input = &[0, -12, u16::MAX as i64, 5];
|
||||
let col = Column::from(&input[..]);
|
||||
if let Column::Integer(meta, IntegerEncoding::I64I32(_)) = col {
|
||||
assert_eq!(meta.size, 40); // 4 i32s (16b) and a vec (24b)
|
||||
assert_eq!(meta.rows, 4);
|
||||
assert_eq!(meta.range, Some((-12, u16::MAX as i64)));
|
||||
} else {
|
||||
panic!("invalid variant");
|
||||
|
@ -1368,8 +1389,6 @@ mod test {
|
|||
let input = &[13, 12, u16::MAX as u64, 5];
|
||||
let col = Column::from(&input[..]);
|
||||
if let Column::Unsigned(meta, IntegerEncoding::U64U16(_)) = col {
|
||||
assert_eq!(meta.size, 32); // 4 u16s (8b) and a vec (24b)
|
||||
assert_eq!(meta.rows, 4);
|
||||
assert_eq!(meta.range, Some((5, u16::MAX as u64)));
|
||||
} else {
|
||||
panic!("invalid variant");
|
||||
|
|
|
@ -36,11 +36,10 @@ impl Bool {
|
|||
self.arr.null_count() > 0
|
||||
}
|
||||
|
||||
/// Returns the total size in bytes of the encoded data. Note, this method
|
||||
/// is really an "accurate" estimation. It doesn't include for example the
|
||||
/// size of the `Plain` struct receiver.
|
||||
/// Returns an estimation of the total size in bytes used by this column
|
||||
/// encoding.
|
||||
pub fn size(&self) -> u64 {
|
||||
0
|
||||
(std::mem::size_of::<BooleanArray>() + self.arr.get_array_memory_size()) as u64
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -342,6 +341,12 @@ mod test {
|
|||
v.iter().map(|x| Some(*x)).collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn size() {
|
||||
let v = Bool::from(vec![None, None, Some(true), Some(false)].as_slice());
|
||||
assert_eq!(v.size(), 240);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn first_row_id_eq_value() {
|
||||
let v = Bool::from(vec![true, true].as_slice());
|
||||
|
|
|
@ -86,11 +86,19 @@ impl RLE {
|
|||
|
||||
let index_entry_size = size_of::<Vec<String>>() // container size
|
||||
+ (size_of::<String>() * self.index_entries.len()) // elements size
|
||||
+ decoded_keys_size; // heal allocated strings size
|
||||
+ decoded_keys_size; // heap allocated strings size
|
||||
|
||||
// The total size (an upper bound estimate) of all the bitmaps
|
||||
// in the column.
|
||||
let row_ids_bitmaps_size = self
|
||||
.index_row_ids
|
||||
.values()
|
||||
.map(|row_ids| row_ids.size())
|
||||
.sum::<usize>();
|
||||
|
||||
// TODO(edd): The Bitmaps on-heap size!!
|
||||
let index_row_ids_size = size_of::<BTreeMap<u32, Bitmap>>()
|
||||
+ ((size_of::<u32>() + size_of::<Bitmap>()) * self.index_row_ids.len());
|
||||
+ (size_of::<u32>() * self.index_row_ids.len())
|
||||
+ row_ids_bitmaps_size;
|
||||
|
||||
let run_lengths_size = size_of::<Vec<(u32, u32)>>() + // container size
|
||||
(size_of::<(u32, u32)>() * self.run_lengths.len()); // each run-length size
|
||||
|
@ -994,16 +1002,24 @@ mod test {
|
|||
enc.push_none();
|
||||
enc.push_none();
|
||||
|
||||
// keys - 14 bytes.
|
||||
// entry_index is 24 + ((24+4) * 3) + 14 == 122
|
||||
// index_entry is 24 + (24*4) + 14 == 134
|
||||
// index_row_ids is 24 + (4 + 0?? * 4) == 40 ??????
|
||||
// run lengths is 24 + (8*5) == 64
|
||||
// 360
|
||||
// `entry_index` is 24 + ((24+4) * 3) + 14 == 122
|
||||
//
|
||||
// Note: there are 4 index entries to account for NULL entry.
|
||||
// `index_entry` is 24 + (24*4) + 14 == 134
|
||||
//
|
||||
// bitmaps for east, north, south and NULL entries.
|
||||
// `index_row_ids` is 24 + (4 * 4) + (108b for bitmaps) == 148
|
||||
//
|
||||
// `run lengths` is 24 + (8*5) == 64
|
||||
//
|
||||
// `contains_null` - 1 byte
|
||||
// `num_rows` - 4 bytes
|
||||
//
|
||||
// 473
|
||||
|
||||
// TODO(edd): there some mystery bytes in the bitmap implementation.
|
||||
// need to figure out how to measure these
|
||||
assert_eq!(enc.size(), 397);
|
||||
assert_eq!(enc.size(), 473);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -66,11 +66,10 @@ where
|
|||
self.arr.null_count() > 0
|
||||
}
|
||||
|
||||
/// Returns the total size in bytes of the encoded data. Note, this method
|
||||
/// is really an "accurate" estimation. It doesn't include for example the
|
||||
/// size of the `Plain` struct receiver.
|
||||
/// Returns an estimation of the total size in bytes used by this column
|
||||
/// encoding.
|
||||
pub fn size(&self) -> u64 {
|
||||
0
|
||||
(std::mem::size_of::<PrimitiveArray<T>>() + self.arr.get_array_memory_size()) as u64
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -598,6 +597,12 @@ mod test {
|
|||
v.iter().map(|x| Some(*x)).collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn size() {
|
||||
let v = FixedNull::<UInt64Type>::from(vec![None, None, Some(100), Some(2222)].as_slice());
|
||||
assert_eq!(v.size(), 240);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn first_row_id_eq_value() {
|
||||
let v = super::FixedNull::<Int64Type>::from(vec![22, 33, 18].as_slice());
|
||||
|
|
|
@ -79,9 +79,6 @@ struct PartitionData {
|
|||
// identified by a partition key
|
||||
partitions: BTreeMap<String, Partition>,
|
||||
|
||||
// The current total size of the database.
|
||||
size: u64,
|
||||
|
||||
// Total number of rows in the database.
|
||||
rows: u64,
|
||||
}
|
||||
|
@ -108,7 +105,6 @@ impl Database {
|
|||
|
||||
// Take lock on partitions and update.
|
||||
let mut partition_data = self.data.write().unwrap();
|
||||
partition_data.size += row_group.size();
|
||||
partition_data.rows += row_group.rows() as u64;
|
||||
|
||||
// create a new chunk if one doesn't exist, or add the table data to
|
||||
|
@ -141,7 +137,6 @@ impl Database {
|
|||
.remove(partition_key)
|
||||
.context(PartitionNotFound { key: partition_key })?;
|
||||
|
||||
partition_data.size -= partition.size();
|
||||
partition_data.rows -= partition.rows();
|
||||
Ok(())
|
||||
}
|
||||
|
@ -156,7 +151,6 @@ impl Database {
|
|||
.context(PartitionNotFound { key: partition_key })?;
|
||||
|
||||
partition.drop_chunk(chunk_id).map(|chunk| {
|
||||
partition_data.size -= chunk.size();
|
||||
partition_data.rows -= chunk.rows();
|
||||
// don't return chunk from `drop_chunk`
|
||||
})
|
||||
|
@ -185,8 +179,17 @@ impl Database {
|
|||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns the total estimated size in bytes of the database.
|
||||
pub fn size(&self) -> u64 {
|
||||
self.data.read().unwrap().size
|
||||
let base_size = std::mem::size_of::<Self>();
|
||||
|
||||
let partition_data = self.data.read().unwrap();
|
||||
base_size as u64
|
||||
+ partition_data
|
||||
.partitions
|
||||
.iter()
|
||||
.map(|(name, partition)| name.len() as u64 + partition.size())
|
||||
.sum::<u64>()
|
||||
}
|
||||
|
||||
pub fn rows(&self) -> u64 {
|
||||
|
@ -502,7 +505,7 @@ impl fmt::Debug for Database {
|
|||
let partition_data = self.data.read().unwrap();
|
||||
f.debug_struct("Database")
|
||||
.field("partitions", &partition_data.partitions.keys())
|
||||
.field("size", &partition_data.size)
|
||||
.field("size", &self.size())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
@ -513,9 +516,6 @@ struct ChunkData {
|
|||
// identified by a chunk id.
|
||||
chunks: BTreeMap<u32, Chunk>,
|
||||
|
||||
// The current total size of the partition.
|
||||
size: u64,
|
||||
|
||||
// The current number of row groups in this partition.
|
||||
row_groups: usize,
|
||||
|
||||
|
@ -537,7 +537,6 @@ impl Partition {
|
|||
Self {
|
||||
key: partition_key.to_owned(),
|
||||
data: RwLock::new(ChunkData {
|
||||
size: chunk.size(),
|
||||
row_groups: chunk.row_groups(),
|
||||
rows: chunk.rows(),
|
||||
chunks: vec![(chunk.id(), chunk)].into_iter().collect(),
|
||||
|
@ -555,7 +554,6 @@ impl Partition {
|
|||
fn upsert_chunk(&mut self, chunk_id: u32, table_name: String, row_group: RowGroup) {
|
||||
let mut chunk_data = self.data.write().unwrap();
|
||||
|
||||
chunk_data.size += row_group.size();
|
||||
chunk_data.row_groups += 1;
|
||||
chunk_data.rows += row_group.rows() as u64;
|
||||
|
||||
|
@ -581,7 +579,6 @@ impl Partition {
|
|||
.remove(&chunk_id)
|
||||
.context(ChunkNotFound { id: chunk_id })?;
|
||||
|
||||
chunk_data.size -= chunk.size();
|
||||
chunk_data.rows -= chunk.rows();
|
||||
chunk_data.row_groups -= chunk.row_groups();
|
||||
Ok(chunk)
|
||||
|
@ -624,8 +621,18 @@ impl Partition {
|
|||
self.data.read().unwrap().rows
|
||||
}
|
||||
|
||||
/// The total estimated size in bytes of the `Partition` and all contained
|
||||
/// data.
|
||||
pub fn size(&self) -> u64 {
|
||||
self.data.read().unwrap().size
|
||||
let base_size = std::mem::size_of::<Self>() + self.key.len();
|
||||
|
||||
let chunk_data = self.data.read().unwrap();
|
||||
base_size as u64
|
||||
+ chunk_data
|
||||
.chunks
|
||||
.values()
|
||||
.map(|chunk| std::mem::size_of::<u32>() as u64 + chunk.size())
|
||||
.sum::<u64>()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -69,13 +69,13 @@ impl RowGroup {
|
|||
let mut time_column = None;
|
||||
|
||||
for (name, ct) in columns {
|
||||
meta.size += ct.size();
|
||||
match ct {
|
||||
ColumnType::Tag(c) => {
|
||||
assert_eq!(c.num_rows(), rows);
|
||||
|
||||
meta.add_column(
|
||||
&name,
|
||||
c.size(),
|
||||
schema::ColumnType::Tag(name.clone()),
|
||||
c.logical_datatype(),
|
||||
c.column_range().unwrap(),
|
||||
|
@ -89,6 +89,7 @@ impl RowGroup {
|
|||
|
||||
meta.add_column(
|
||||
&name,
|
||||
c.size(),
|
||||
schema::ColumnType::Field(name.clone()),
|
||||
c.logical_datatype(),
|
||||
c.column_range().unwrap(),
|
||||
|
@ -110,6 +111,7 @@ impl RowGroup {
|
|||
|
||||
meta.add_column(
|
||||
&name,
|
||||
c.size(),
|
||||
schema::ColumnType::Timestamp(name.clone()),
|
||||
c.logical_datatype(),
|
||||
c.column_range().unwrap(),
|
||||
|
@ -133,9 +135,15 @@ impl RowGroup {
|
|||
}
|
||||
}
|
||||
|
||||
/// The total size in bytes of the read group
|
||||
/// The total estimated size in bytes of the read group
|
||||
pub fn size(&self) -> u64 {
|
||||
self.meta.size
|
||||
let base_size = std::mem::size_of::<Self>()
|
||||
+ self
|
||||
.all_columns_by_name
|
||||
.iter()
|
||||
.map(|(key, value)| key.len() + std::mem::size_of::<usize>())
|
||||
.sum::<usize>();
|
||||
base_size as u64 + self.meta.size()
|
||||
}
|
||||
|
||||
/// The number of rows in the `RowGroup` (all columns have the same number
|
||||
|
@ -1377,6 +1385,12 @@ pub struct ColumnMeta {
|
|||
pub range: (OwnedValue, OwnedValue),
|
||||
}
|
||||
|
||||
impl ColumnMeta {
|
||||
pub fn size(&self) -> usize {
|
||||
std::mem::size_of::<Self>() + self.range.0.size() + self.range.1.size()
|
||||
}
|
||||
}
|
||||
|
||||
// column metadata is equivalent for two columns if their logical type and
|
||||
// semantic type are equivalent.
|
||||
impl PartialEq for ColumnMeta {
|
||||
|
@ -1387,10 +1401,10 @@ impl PartialEq for ColumnMeta {
|
|||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct MetaData {
|
||||
// The total size of the table in bytes.
|
||||
pub size: u64,
|
||||
// The total size in bytes of all column data in the `RowGroup`.
|
||||
pub columns_size: u64,
|
||||
|
||||
// The total number of rows in the table.
|
||||
// The total number of rows in the `RowGroup`.
|
||||
pub rows: u32,
|
||||
|
||||
// The distinct set of columns for this `RowGroup` (all of these columns
|
||||
|
@ -1401,8 +1415,7 @@ pub struct MetaData {
|
|||
// possibly match based on the range of values a column has.
|
||||
pub columns: BTreeMap<String, ColumnMeta>,
|
||||
|
||||
// The total time range of this table spanning all of the `RowGroup`s within
|
||||
// the table.
|
||||
// The total time range of this `RowGroup`.
|
||||
//
|
||||
// This can be used to skip the table entirely if the time range for a query
|
||||
// falls outside of this range.
|
||||
|
@ -1410,6 +1423,21 @@ pub struct MetaData {
|
|||
}
|
||||
|
||||
impl MetaData {
|
||||
/// Returns the estimated size in bytes of the meta data and all column data
|
||||
/// associated with a `RowGroup`.
|
||||
pub fn size(&self) -> u64 {
|
||||
let base_size = std::mem::size_of::<Self>();
|
||||
|
||||
(base_size
|
||||
// account for contents of meta data
|
||||
+ self
|
||||
.columns
|
||||
.iter()
|
||||
.map(|(k, v)| k.len() + v.size())
|
||||
.sum::<usize>()) as u64
|
||||
+ self.columns_size
|
||||
}
|
||||
|
||||
// helper function to determine if the provided binary expression could be
|
||||
// satisfied in the `RowGroup`, If this function returns `false` then there
|
||||
// no rows in the `RowGroup` would ever match the expression.
|
||||
|
@ -1451,6 +1479,7 @@ impl MetaData {
|
|||
pub fn add_column(
|
||||
&mut self,
|
||||
name: &str,
|
||||
column_size: u64,
|
||||
col_type: schema::ColumnType,
|
||||
logical_data_type: LogicalDataType,
|
||||
range: (OwnedValue, OwnedValue),
|
||||
|
@ -1463,6 +1492,7 @@ impl MetaData {
|
|||
range,
|
||||
},
|
||||
);
|
||||
self.columns_size += column_size;
|
||||
}
|
||||
|
||||
// Extract schema information for a set of columns.
|
||||
|
@ -1932,6 +1962,33 @@ mod test {
|
|||
Predicate::new(vec![expr])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn size() {
|
||||
let mut columns = BTreeMap::new();
|
||||
let rc = ColumnType::Tag(Column::from(&[Some("west"), Some("west"), None, None][..]));
|
||||
columns.insert("region".to_string(), rc);
|
||||
let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..]));
|
||||
columns.insert("time".to_string(), tc);
|
||||
|
||||
let row_group = RowGroup::new(4, columns);
|
||||
|
||||
let rg_size = row_group.size();
|
||||
assert!(rg_size > 0);
|
||||
|
||||
let mut columns = BTreeMap::new();
|
||||
|
||||
let track = ColumnType::Tag(Column::from(
|
||||
&[Some("Thinking"), Some("of"), Some("a"), Some("place")][..],
|
||||
));
|
||||
columns.insert("track".to_string(), track);
|
||||
let tc = ColumnType::Time(Column::from(&[100_i64, 200, 500, 600][..]));
|
||||
columns.insert("time".to_string(), tc);
|
||||
|
||||
let row_group = RowGroup::new(4, columns);
|
||||
|
||||
assert!(row_group.size() > rg_size);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn row_ids_from_predicates() {
|
||||
let mut columns = BTreeMap::new();
|
||||
|
|
|
@ -72,7 +72,7 @@ impl Table {
|
|||
Self {
|
||||
name: name.into(),
|
||||
table_data: RwLock::new(RowGroupData {
|
||||
meta: Arc::new(MetaData::new(rg.metadata())),
|
||||
meta: Arc::new(MetaData::new(&rg)),
|
||||
data: vec![Arc::new(rg)],
|
||||
}),
|
||||
}
|
||||
|
@ -85,7 +85,7 @@ impl Table {
|
|||
// `meta` can't be modified whilst protected by an Arc so create a new one.
|
||||
row_groups.meta = Arc::new(MetaData::update_with(
|
||||
MetaData::clone(&row_groups.meta), // clone meta-data not Arc
|
||||
rg.metadata(),
|
||||
&rg,
|
||||
));
|
||||
|
||||
// Add the new row group data to the table.
|
||||
|
@ -126,7 +126,9 @@ impl Table {
|
|||
|
||||
/// The total size of the table in bytes.
|
||||
pub fn size(&self) -> u64 {
|
||||
self.table_data.read().unwrap().meta.size
|
||||
let base_size = std::mem::size_of::<Self>() + self.name.len();
|
||||
// meta.size accounts for all the row group data.
|
||||
base_size as u64 + self.table_data.read().unwrap().meta.size()
|
||||
}
|
||||
|
||||
// Returns the total number of row groups in this table.
|
||||
|
@ -570,24 +572,39 @@ struct MetaData {
|
|||
}
|
||||
|
||||
impl MetaData {
|
||||
pub fn new(meta: &row_group::MetaData) -> Self {
|
||||
pub fn new(rg: &row_group::RowGroup) -> Self {
|
||||
Self {
|
||||
size: meta.size,
|
||||
rows: meta.rows as u64,
|
||||
columns: meta.columns.clone(),
|
||||
column_names: meta.columns.keys().cloned().collect(),
|
||||
time_range: Some(meta.time_range),
|
||||
size: rg.size(),
|
||||
rows: rg.rows() as u64,
|
||||
columns: rg.metadata().columns.clone(),
|
||||
column_names: rg.metadata().columns.keys().cloned().collect(),
|
||||
time_range: Some(rg.metadata().time_range),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the estimated size in bytes of the `MetaData` struct and all of
|
||||
/// the row group data associated with a `Table`.
|
||||
pub fn size(&self) -> u64 {
|
||||
let base_size = std::mem::size_of::<Self>();
|
||||
let columns_meta_size = self
|
||||
.columns
|
||||
.iter()
|
||||
.map(|(k, v)| k.len() + v.size())
|
||||
.sum::<usize>();
|
||||
|
||||
let column_names_size = self.column_names.iter().map(|c| c.len()).sum::<usize>();
|
||||
(base_size + columns_meta_size + column_names_size) as u64 + self.size
|
||||
}
|
||||
|
||||
/// Create a new `MetaData` by consuming `this` and incorporating `other`.
|
||||
pub fn update_with(mut this: Self, other: &row_group::MetaData) -> Self {
|
||||
pub fn update_with(mut this: Self, rg: &row_group::RowGroup) -> Self {
|
||||
let other = rg.metadata();
|
||||
// The incoming row group must have exactly the same schema as the
|
||||
// existing row groups in the table.
|
||||
assert_eq!(&this.columns, &other.columns);
|
||||
|
||||
// update size, rows, column ranges, time range
|
||||
this.size += other.size;
|
||||
this.size += rg.size();
|
||||
this.rows += other.rows as u64;
|
||||
|
||||
// The incoming row group must have exactly the same schema as the
|
||||
|
@ -687,9 +704,9 @@ impl From<&Vec<Arc<RowGroup>>> for MetaData {
|
|||
panic!("row groups required for meta data construction");
|
||||
}
|
||||
|
||||
let mut meta = Self::new(row_groups[0].metadata());
|
||||
let mut meta = Self::new(&row_groups[0]);
|
||||
for row_group in row_groups.iter().skip(1) {
|
||||
meta = Self::update_with(meta, row_group.metadata());
|
||||
meta = Self::update_with(meta, &row_group);
|
||||
}
|
||||
|
||||
meta
|
||||
|
@ -909,8 +926,6 @@ impl std::fmt::Display for DisplayReadAggregateResults<'_> {
|
|||
mod test {
|
||||
use super::*;
|
||||
|
||||
use row_group::ColumnMeta;
|
||||
|
||||
use crate::column::Column;
|
||||
use crate::row_group::{BinaryExpr, ColumnType, ReadAggregateResult};
|
||||
use crate::schema;
|
||||
|
@ -919,67 +934,47 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn meta_data_update_with() {
|
||||
let rg_meta = row_group::MetaData {
|
||||
size: 100,
|
||||
rows: 2000,
|
||||
columns: vec![(
|
||||
"region".to_owned(),
|
||||
ColumnMeta {
|
||||
typ: schema::ColumnType::Tag("region".to_owned()),
|
||||
logical_data_type: schema::LogicalDataType::String,
|
||||
range: (
|
||||
OwnedValue::String("north".to_owned()),
|
||||
OwnedValue::String("south".to_owned()),
|
||||
),
|
||||
},
|
||||
)]
|
||||
.into_iter()
|
||||
.collect::<BTreeMap<_, _>>(),
|
||||
time_range: (10, 3000),
|
||||
};
|
||||
let mut columns = BTreeMap::new();
|
||||
columns.insert(
|
||||
"time".to_string(),
|
||||
ColumnType::create_time(&[100, 200, 300]),
|
||||
);
|
||||
columns.insert(
|
||||
"region".to_string(),
|
||||
ColumnType::create_tag(&["west", "west", "north"]),
|
||||
);
|
||||
let rg = RowGroup::new(3, columns);
|
||||
|
||||
let mut meta = MetaData::new(&rg_meta);
|
||||
assert_eq!(meta.rows, 2000);
|
||||
assert_eq!(meta.size, 100);
|
||||
assert_eq!(meta.time_range, Some((10, 3000)));
|
||||
let mut meta = MetaData::new(&rg);
|
||||
assert_eq!(meta.rows, 3);
|
||||
let meta_size = meta.size;
|
||||
assert!(meta_size > 0);
|
||||
assert_eq!(meta.time_range, Some((100, 300)));
|
||||
assert_eq!(
|
||||
meta.columns.get("region").unwrap().range,
|
||||
(
|
||||
OwnedValue::String("north".to_owned()),
|
||||
OwnedValue::String("south".to_owned())
|
||||
OwnedValue::String("west".to_owned())
|
||||
)
|
||||
);
|
||||
|
||||
meta = MetaData::update_with(
|
||||
meta,
|
||||
&row_group::MetaData {
|
||||
size: 300,
|
||||
rows: 1500,
|
||||
columns: vec![(
|
||||
"region".to_owned(),
|
||||
ColumnMeta {
|
||||
typ: schema::ColumnType::Tag("region".to_owned()),
|
||||
logical_data_type: schema::LogicalDataType::String,
|
||||
range: (
|
||||
OwnedValue::String("east".to_owned()),
|
||||
OwnedValue::String("north".to_owned()),
|
||||
),
|
||||
},
|
||||
)]
|
||||
.into_iter()
|
||||
.collect::<BTreeMap<_, _>>(),
|
||||
time_range: (10, 3500),
|
||||
},
|
||||
let mut columns = BTreeMap::new();
|
||||
columns.insert("time".to_string(), ColumnType::create_time(&[10, 400]));
|
||||
columns.insert(
|
||||
"region".to_string(),
|
||||
ColumnType::create_tag(&["east", "south"]),
|
||||
);
|
||||
let rg = RowGroup::new(2, columns);
|
||||
|
||||
assert_eq!(meta.rows, 3500);
|
||||
assert_eq!(meta.size, 400);
|
||||
assert_eq!(meta.time_range, Some((10, 3500)));
|
||||
meta = MetaData::update_with(meta, &rg);
|
||||
assert_eq!(meta.rows, 5);
|
||||
assert!(meta.size > meta_size);
|
||||
assert_eq!(meta.time_range, Some((10, 400)));
|
||||
assert_eq!(
|
||||
meta.columns.get("region").unwrap().range,
|
||||
(
|
||||
OwnedValue::String("east".to_owned()),
|
||||
OwnedValue::String("south".to_owned())
|
||||
OwnedValue::String("west".to_owned())
|
||||
)
|
||||
);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use std::sync::Arc;
|
||||
use std::{collections::BTreeSet, convert::TryFrom};
|
||||
use std::{mem::size_of, sync::Arc};
|
||||
|
||||
use arrow_deps::arrow;
|
||||
|
||||
|
@ -486,6 +486,18 @@ pub enum OwnedValue {
|
|||
Scalar(Scalar),
|
||||
}
|
||||
|
||||
impl OwnedValue {
|
||||
/// The size in bytes of this value.
|
||||
pub fn size(&self) -> usize {
|
||||
let self_size = size_of::<Self>();
|
||||
match self {
|
||||
Self::String(s) => s.len() + self_size,
|
||||
Self::ByteArray(arr) => arr.len() + self_size,
|
||||
_ => self_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq<Value<'_>> for OwnedValue {
|
||||
fn eq(&self, other: &Value<'_>) -> bool {
|
||||
match (&self, other) {
|
||||
|
@ -797,3 +809,23 @@ impl EncodedValues {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn size() {
|
||||
let v1 = OwnedValue::Null;
|
||||
assert_eq!(v1.size(), 32);
|
||||
|
||||
let v1 = OwnedValue::Scalar(Scalar::I64(22));
|
||||
assert_eq!(v1.size(), 32);
|
||||
|
||||
let v1 = OwnedValue::String("11morebytes".to_owned());
|
||||
assert_eq!(v1.size(), 43);
|
||||
|
||||
let v1 = OwnedValue::ByteArray(vec![2, 44, 252]);
|
||||
assert_eq!(v1.size(), 35);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue