Merge pull request #2267 from influxdata/er/read_buffer/cap

feat: Add API to encodings to get allocated buffer sizes
pull/24376/head
kodiakhq[bot] 2021-08-12 15:59:52 +00:00 committed by GitHub
commit ac28f83cf0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 188 additions and 101 deletions

View File

@ -369,10 +369,10 @@ async fn sql_select_from_system_chunk_columns() {
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 252 |",
"| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 425 |",
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 240 |",
"| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 425 |",
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 255 |",
"| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 281 |",
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 243 |",
"| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 281 |",
"| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 51 |",
"| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |",
"| 1970-01-01T00 | 0 | o2 | reading | OpenMutableBuffer | 2 | 1 | 51 | 51 | 25 |",

View File

@ -659,10 +659,10 @@ mod test {
"# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer",
"# TYPE read_buffer_column_bytes gauge",
r#"read_buffer_column_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64"} 72"#,
r#"read_buffer_column_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 800"#,
r#"read_buffer_column_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 512"#,
r#"read_buffer_column_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 96"#,
r#"read_buffer_column_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#,
r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 500"#,
r#"read_buffer_column_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 384"#,
r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 506"#,
"# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer",
"# TYPE read_buffer_column_raw_bytes gauge",
r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#,

View File

@ -11,7 +11,7 @@ impl BooleanEncoding {
/// The total size in bytes of the stored columnar data.
pub fn size(&self) -> usize {
match self {
Self::BooleanNull(enc) => enc.size(),
Self::BooleanNull(enc) => enc.size(false),
}
}

View File

@ -1,6 +1,7 @@
//! An encoding for nullable bools, backed by an Arrow array.
use std::cmp::Ordering;
use std::fmt::Debug;
use std::mem::size_of;
use arrow::array::{Array, BooleanArray};
use cmp::Operator;
@ -19,7 +20,7 @@ impl std::fmt::Display for Bool {
"[Bool] rows: {:?}, nulls: {:?}, size: {}",
self.arr.len(),
self.arr.null_count(),
self.size()
self.size(false)
)
}
}
@ -42,8 +43,12 @@ impl Bool {
/// Returns an estimation of the total size in bytes used by this column
/// encoding.
pub fn size(&self) -> usize {
std::mem::size_of::<BooleanArray>() + self.arr.get_array_memory_size()
pub fn size(&self, buffers: bool) -> usize {
size_of::<Self>()
+ match buffers {
true => self.arr.get_array_memory_size(), // includes buffer capacities
false => self.arr.get_buffer_memory_size(),
}
}
/// The estimated total size in bytes of the underlying bool values in the
@ -360,7 +365,8 @@ mod test {
#[test]
fn size() {
let v = Bool::from(vec![None, None, Some(true), Some(false)].as_slice());
assert_eq!(v.size(), 400);
assert_eq!(v.size(false), 256);
assert_eq!(v.size(true), 400); // includes allocated buffers
}
#[test]

View File

@ -18,8 +18,10 @@ pub trait ScalarEncoding<L>: Debug + Display + Send + Sync {
/// A useful name for the encoding, likely used in instrumentation.
fn name(&self) -> &'static str;
/// The total size in bytes to store encoded data in memory.
fn size(&self) -> usize;
/// The total size in bytes to store encoded data in memory. If `buffers`
/// is true then the returned size should account for any allocated buffers
/// within the contained encoding structures.
fn size(&self, buffers: bool) -> usize;
/// The estimated total size in bytes of the underlying encoded values if
/// they were stored contiguously as a vector of `L`. `include_null` should

View File

@ -53,7 +53,7 @@ where
"[{}] rows: {:?}, size: {}",
self.name(),
self.num_rows(),
self.size()
self.size(false)
)
}
}
@ -252,9 +252,13 @@ where
self.values.len() as u32
}
/// Encoded data size including `Self` - an "accurate" estimation.
fn size(&self) -> usize {
size_of::<Self>() + (size_of::<P>() * self.values.len())
fn size(&self, buffers: bool) -> usize {
let values = size_of::<P>()
* match buffers {
true => self.values.capacity(),
false => self.values.len(),
};
size_of::<Self>() + values
}
fn size_raw(&self, _: bool) -> usize {
@ -425,6 +429,19 @@ mod test {
(Fixed::new(values, Arc::clone(&mock)), mock)
}
#[test]
fn size() {
let (v, _) = new_encoding(vec![22_i64, 1, 18]);
// Self is 32 bytes and there are 3 * 8b values
assert_eq!(v.size(false), 56);
// check pre-allocated sizing
let (mut v, _) = new_encoding(vec![]);
v.values.reserve_exact(40);
// Self is 32 bytes and there are 40 * 8b values allocated
assert_eq!(v.size(true), 352);
}
#[test]
fn value() {
let (v, transcoder) = new_encoding(vec![22, 1, 18]);

View File

@ -52,7 +52,7 @@ where
self.name(),
self.arr.len(),
self.arr.null_count(),
self.size()
self.size(false)
)
}
}
@ -260,8 +260,12 @@ where
self.arr.null_count() as u32
}
fn size(&self) -> usize {
size_of::<Self>() + self.arr.get_array_memory_size()
fn size(&self, buffers: bool) -> usize {
size_of::<Self>()
+ match buffers {
true => self.arr.get_array_memory_size(),
false => self.arr.get_buffer_memory_size(),
}
}
/// The estimated total size in bytes of the underlying values in the
@ -478,7 +482,8 @@ mod test {
#[test]
fn size() {
let (v, _) = new_encoding(vec![None, None, Some(100), Some(2222)]);
assert_eq!(v.size(), 408);
assert_eq!(v.size(false), 264);
assert_eq!(v.size(true), 408); // includes allocated buffers
}
#[test]

View File

@ -70,7 +70,7 @@ where
f,
"[{}] size: {:?} rows: {:?} nulls: {} runs: {} ",
self.name(),
self.size(),
self.size(false),
self.num_rows(),
self.null_count(),
self.run_lengths.len()
@ -343,8 +343,13 @@ where
ENCODING_NAME
}
fn size(&self) -> usize {
std::mem::size_of::<Self>() + (self.run_lengths.len() * size_of::<(u32, Option<P>)>())
fn size(&self, buffers: bool) -> usize {
let values = size_of::<(u32, Option<P>)>()
* match buffers {
true => self.run_lengths.capacity(),
false => self.run_lengths.len(),
};
std::mem::size_of::<Self>() + values
}
fn size_raw(&self, include_nulls: bool) -> usize {
@ -713,16 +718,26 @@ mod test {
fn size() {
let (mut enc, _) = new_encoding(vec![]);
// 40b Self + (0 rl * 24) = 32
assert_eq!(enc.size(), 40);
// 40b Self + (0 rl * 24) = 40
assert_eq!(enc.size(false), 40);
enc.push_none();
// 40b Self + (1 rl * 24) = 56
assert_eq!(enc.size(), 64);
// 40b Self + (1 rl * 24) = 64
assert_eq!(enc.size(false), 64);
enc.push_additional_some(1, 10);
// 40b Self + (2 rl * 24) = 80
assert_eq!(enc.size(), 88);
// 40b Self + (2 rl * 24) = 88
assert_eq!(enc.size(false), 88);
// check allocated buffer size
let (mut enc, _) = new_encoding(vec![]);
enc.run_lengths.reserve_exact(40);
// 40b Self + (40 rl * 24) = 1000b
assert_eq!(enc.size(true), 1000);
// 40b Self + (40 rl * 24) = 1000b - no new allocations
enc.push_additional_some(1, 10);
assert_eq!(enc.size(true), 1000);
}
#[test]

View File

@ -30,8 +30,8 @@ impl Encoding {
pub fn size(&self) -> usize {
match &self {
Self::RLE(enc) => enc.size(),
Self::Plain(enc) => enc.size(),
Self::RLE(enc) => enc.size(false),
Self::Plain(enc) => enc.size(false),
}
}

View File

@ -47,7 +47,7 @@ impl Default for Dictionary {
}
impl Dictionary {
/// Initialises an Dictionar encoding with a set of logical values.
/// Initialises a Dictionary encoding with a set of logical values.
/// Creating an encoding using `with_dictionary` ensures that the dictionary
/// is in the correct order, and will allow values to be inserted with any
/// value in the dictionary.
@ -61,22 +61,33 @@ impl Dictionary {
}
/// A reasonable estimation of the on-heap size this encoding takes up.
pub fn size(&self) -> usize {
// the total size of all decoded values in the column.
let decoded_keys_size = self
/// If `buffers` is true then all allocated buffers in the encoding are
/// accounted for.
pub fn size(&self, buffers: bool) -> usize {
let base_size = size_of::<Self>();
// Total size of all decoded values in the column.
let mut decoded_keys_size = self
.entries
.iter()
.map(|k| match k {
Some(v) => v.len(),
Some(v) => v.len(),
None => 0,
} + size_of::<Option<String>>())
.sum::<usize>();
let entries_size = size_of::<Vec<Option<String>>>() + decoded_keys_size;
let encoded_ids_size = size_of::<Vec<u32>>() + (size_of::<u32>() * self.encoded_data.len());
if buffers {
decoded_keys_size +=
(self.entries.capacity() - self.entries.len()) * size_of::<Option<String>>();
}
// + 1 for contains_null field
entries_size + encoded_ids_size + 1
let encoded_ids_size = size_of::<u32>()
* match buffers {
true => self.encoded_data.capacity(),
false => self.encoded_data.len(),
};
base_size + decoded_keys_size + encoded_ids_size
}
/// A reasonable estimation of the on-heap size of the underlying string
@ -837,7 +848,7 @@ impl std::fmt::Display for Dictionary {
f,
"[{}] size: {:?} rows: {:?} cardinality: {}",
ENCODING_NAME,
self.size(),
self.size(false),
self.num_rows(),
self.cardinality(),
)
@ -873,17 +884,13 @@ mod test {
enc.push_none();
enc.push_none();
// keys - 14 bytes.
// 3 string entries in dictionary
// entries is 24 + (24*4) + 14 == 134
// Self - 24+24+8 = 56 bytes (two vectors, a bool and padding)
// 4 string entries (inc NULL) in vec = 4 * 24 = 96
// 3 string entries with length 4+5+5 = 14
// 15 rows.
// encoded ids is 24 + (4 * 15) == 84
// 134 + 84 + 1 == 219
assert_eq!(enc.size(), 219);
// encoded ids is (4 * 15) == 60
// 56 + 96 + 14 + 60 = 226
assert_eq!(enc.size(false), 226);
// check dictionary
assert_eq!(
@ -899,6 +906,24 @@ mod test {
enc.encoded_data,
vec![1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 3, NULL_ID, NULL_ID, NULL_ID, NULL_ID]
);
// check for allocated size
let mut enc = Dictionary::default();
enc.encoded_data.reserve_exact(40);
enc.entries.reserve_exact(39); // account for already-allocated NULL element
enc.push_additional(Some("east".to_string()), 3);
enc.push_additional(Some("north".to_string()), 1);
enc.push_additional(Some("east".to_string()), 5);
enc.push_additional(Some("south".to_string()), 2);
enc.push_additional(None, 4);
// Self - 24+24+8 = 56 bytes (two vectors, a bool and padding)
// 40 string entries (inc NULL) in vec = 40 * 24 = 960
// 3 string entries with lengths 4+5+5 = 14
// 15 rows but 40 elements allocated
// encoded ids is (40 * 4) == 160
// 56 + 960 + 14 + 160 = 1190
assert_eq!(enc.size(true), 1190);
}
#[test]

View File

@ -3,8 +3,6 @@ use std::convert::From;
use std::iter;
use std::mem::size_of;
use croaring::Bitmap;
use arrow::array::{Array, StringArray};
use super::NULL_ID;
@ -75,13 +73,18 @@ impl RLE {
}
/// A reasonable estimation of the on-heap size this encoding takes up.
pub fn size(&self) -> usize {
// the total size of all decoded values in the column.
let decoded_keys_size = self.index_entries.iter().map(|k| k.len()).sum::<usize>();
/// If `buffers` is true then the size of all allocated buffers in the
/// encoding is accounted for.
pub fn size(&self, buffers: bool) -> usize {
let base_size = size_of::<Self>();
let index_entry_size = size_of::<Vec<String>>() // container size
+ (size_of::<String>() * self.index_entries.len()) // elements size
+ decoded_keys_size; // heap allocated strings size
let mut index_entries_size = size_of::<String>()
* match buffers {
true => self.index_entries.capacity(),
false => self.index_entries.len(),
};
// the total size of all decoded values in the column.
index_entries_size += self.index_entries.iter().map(|k| k.len()).sum::<usize>();
// The total size (an upper bound estimate) of all the bitmaps
// in the column.
@ -91,14 +94,16 @@ impl RLE {
.map(|row_ids| row_ids.size())
.sum::<usize>();
let index_row_ids_size = size_of::<BTreeMap<u32, Bitmap>>()
+ (size_of::<u32>() * self.index_row_ids.len())
+ row_ids_bitmaps_size;
let index_row_ids_size =
(size_of::<u32>() * self.index_row_ids.len()) + row_ids_bitmaps_size;
let run_lengths_size = size_of::<Vec<(u32, u32)>>() + // container size
(size_of::<(u32, u32)>() * self.run_lengths.len()); // each run-length size
let run_lengths_size = size_of::<(u32, u32)>()
* match buffers {
true => self.run_lengths.capacity(),
false => self.run_lengths.len(),
};
index_entry_size + index_row_ids_size + run_lengths_size + 1 + 4
base_size + index_entries_size + index_row_ids_size + run_lengths_size
}
/// A reasonable estimation of the on-heap size of the underlying string
@ -958,7 +963,7 @@ impl std::fmt::Display for RLE {
f,
"[{}] size: {:?} rows: {:?} cardinality: {}, nulls: {} runs: {} ",
ENCODING_NAME,
self.size(),
self.size(false),
self.num_rows,
self.cardinality(),
self.null_count(),
@ -1000,22 +1005,34 @@ mod test {
enc.push_none();
enc.push_none();
// Note: there are 4 index entries to account for NULL entry.
// `index_entry` is 24 + (24*4) + 14 == 134
// * Self: 24 + 24 + 24 + 1 + (padding 3b) + 4 = 80b
// * index entries: (4) is (24*4) + 14 == 110
// * index row ids: (bitmaps) is (4 * 4) + (108b for bitmaps) == 124
// * run lengths: (8*5) == 40
//
// bitmaps for east, north, south and NULL entries.
// `index_row_ids` is 24 + (4 * 4) + (108b for bitmaps) == 148
//
// `run lengths` is 24 + (8*5) == 64
//
// `contains_null` - 1 byte
// `num_rows` - 4 bytes
//
// 351
// 354
// assert_eq!(enc.size(false), 354);
// TODO(edd): there are some mystery bytes in the bitmap implementation.
// need to figure out how to measure these
assert_eq!(enc.size(), 351);
// check allocated size
let mut enc = RLE::default();
enc.index_entries.reserve_exact(39); // account for already-allocated NULL element
enc.run_lengths.reserve_exact(40);
enc.push_additional(Some("east".to_string()), 3);
enc.push_additional(Some("north".to_string()), 1);
enc.push_additional(Some("east".to_string()), 5);
enc.push_additional(Some("south".to_string()), 2);
enc.push_none();
enc.push_none();
enc.push_none();
enc.push_none();
// * Self: 24 + 24 + 24 + 1 + (padding 3b) + 4 = 80b
// * index entries: (40 * 24) + 14 == 974
// * index row ids: (bitmaps) is (4 * 4) + (108b for bitmaps) == 124
// * run lengths: (40 * 8) == 320
//
assert_eq!(enc.size(true), 1498);
}
#[test]

View File

@ -32,7 +32,7 @@ impl FloatEncoding {
/// The total size in bytes to store columnar data in memory.
pub fn size(&self) -> usize {
match self {
Self::F64(enc, _) => enc.size(),
Self::F64(enc, _) => enc.size(false),
}
}

View File

@ -27,8 +27,8 @@ impl IntegerEncoding {
/// The total size in bytes of the stored columnar data.
pub fn size(&self) -> usize {
match self {
Self::I64(enc, _) => enc.size(),
Self::U64(enc, _) => enc.size(),
Self::I64(enc, _) => enc.size(false),
Self::U64(enc, _) => enc.size(false),
}
}
@ -971,13 +971,13 @@ mod test {
// Input data containing NULL will be stored in an Arrow array encoding
let cases = vec![
(vec![None, Some(0_i64)], 400_usize), // u8 Arrow array
(vec![None, Some(-120_i64)], 400), // i8
(vec![None, Some(399_i64)], 400), // u16
(vec![None, Some(-399_i64)], 400), // i16
(vec![None, Some(u32::MAX as i64)], 400), // u32
(vec![None, Some(i32::MIN as i64)], 400), // i32
(vec![None, Some(u32::MAX as i64 + 1)], 400), //u64
(vec![None, Some(0_i64)], 256_usize), // u8 Arrow array
(vec![None, Some(-120_i64)], 256), // i8
(vec![None, Some(399_i64)], 256), // u16
(vec![None, Some(-399_i64)], 256), // i16
(vec![None, Some(u32::MAX as i64)], 256), // u32
(vec![None, Some(i32::MIN as i64)], 256), // i32
(vec![None, Some(u32::MAX as i64 + 1)], 256), //u64
];
for (case, name) in cases.iter().cloned() {
@ -1163,10 +1163,10 @@ mod test {
// Input data containing NULL will be stored in an Arrow array encoding
let cases = vec![
(vec![None, Some(0_u64)], 400_usize),
(vec![None, Some(399_u64)], 400),
(vec![None, Some(u32::MAX as u64)], 400),
(vec![None, Some(u64::MAX)], 400),
(vec![None, Some(0_u64)], 256_usize),
(vec![None, Some(399_u64)], 256),
(vec![None, Some(u32::MAX as u64)], 256),
(vec![None, Some(u64::MAX)], 256),
];
for (case, size) in cases.iter().cloned() {

View File

@ -30,8 +30,8 @@ impl StringEncoding {
/// The estimated total size in bytes of the in-memory columnar data.
pub fn size(&self) -> usize {
match self {
Self::RleDictionary(enc) => enc.size(),
Self::Dictionary(enc) => enc.size(),
Self::RleDictionary(enc) => enc.size(false),
Self::Dictionary(enc) => enc.size(false),
}
}

View File

@ -2531,7 +2531,7 @@ mod tests {
("svr_id", "1"),
])
.histogram()
.sample_sum_eq(3191.0)
.sample_sum_eq(3197.0)
.unwrap();
let rb = collect_read_filter(&rb_chunk).await;
@ -3400,7 +3400,7 @@ mod tests {
id: 2,
storage: ChunkStorage::ReadBufferAndObjectStore,
lifecycle_action,
memory_bytes: 3284, // size of RB and OS chunks
memory_bytes: 3140, // size of RB and OS chunks
object_store_bytes: 1577, // size of parquet file
row_count: 2,
time_of_last_access: None,
@ -3451,7 +3451,7 @@ mod tests {
}
assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 87);
assert_eq!(db.catalog.metrics().memory().read_buffer(), 2410);
assert_eq!(db.catalog.metrics().memory().read_buffer(), 2266);
assert_eq!(db.catalog.metrics().memory().object_store(), 874);
}