feat: improve bitset size estimation

pull/24376/head
Edd Robinson 2021-08-17 22:23:14 +01:00
parent 1daa30cc7d
commit b9f09fce49
5 changed files with 52 additions and 6 deletions

View File

@ -369,9 +369,9 @@ async fn sql_select_from_system_chunk_columns() {
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 375 |",
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 351 |",
"| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |",
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 363 |",
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 339 |",
"| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |",
"| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 110 |",
"| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |",

View File

@ -694,7 +694,7 @@ mod test {
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 906"#,
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 186"#,
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#,
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 730"#,
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 664"#,
"# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer",
"# TYPE read_buffer_column_raw_bytes gauge",
r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#,

View File

@ -1275,7 +1275,12 @@ impl RowIDs {
/// Returns an estimate of the total size in bytes of this `RowIDs`,
/// including the enum itself plus the heap memory of the backing store.
///
/// For the bitmap variant the estimate is derived from the Roaring
/// bitmap's container statistics (array + bitset + run containers),
/// which tracks the in-memory footprint of the containers rather than
/// the serialized size — the serialized form can over-estimate what is
/// actually resident in memory.
pub fn size(&self) -> usize {
    std::mem::size_of::<Self>()
        + match self {
            Self::Bitmap(bm) => {
                let stats = bm.statistics();
                // Sum the bytes used by every container kind. The
                // counters are u64; the cast to usize is lossless on
                // 64-bit targets.
                (stats.n_bytes_array_containers
                    + stats.n_bytes_bitset_containers
                    + stats.n_bytes_run_containers) as usize
            }
            // Vector storage: 4 bytes per row id.
            Self::Vector(v) => std::mem::size_of::<u32>() * v.len(),
        }
}
@ -1319,6 +1324,12 @@ impl RowIDs {
}
}
/// Attempts to shrink the in-memory representation of the row ids.
///
/// For the bitmap variant this calls the Roaring bitmap's
/// `run_optimize`, which converts suitable containers to run-length
/// encoding. The vector variant is left untouched.
pub fn optimise_storage(&mut self) {
    match self {
        Self::Bitmap(ids) => {
            // Return value (whether anything changed) is intentionally
            // ignored — this is a best-effort compaction.
            ids.run_optimize();
        }
        Self::Vector(_) => {}
    }
}
pub fn clear(&mut self) {
match self {
Self::Bitmap(ids) => ids.clear(),
@ -1420,6 +1431,41 @@ mod test {
assert_eq!(row_ids.to_vec(), vec![2, 3, 4]);
}
#[test]
fn row_ids_size() {
    // The three scenarios below mirror the example bitmaps in:
    //   "Roaring Bitmaps: Implementation of an Optimized Software
    //    Library", Daniel Lemire et al.

    // Case 1: the first 1000 multiples of 62 in the domain [0, 61938].
    // Roaring stores these as 1000 sorted 16-bit values in an array
    // container, i.e. 2,000 bytes, plus 32 bytes for the RowIDs enum.
    let ids = RowIDs::Bitmap((0..61939).step_by(62).collect());
    assert_eq!(ids.size(), 32 + 2000);

    // Case 2: three dense runs in the second 2^16 chunk:
    //   [2^16, 2^16 + 100), [2^16 + 101, 2^16 + 201) and
    //   [2^16 + 300, 2^16 + 400) — 300 values in 3 run lengths.
    //
    // NB: the paper quotes 3 * 16 bits for the runs, but each run
    // really costs 32 bits: 16 for the starting value and 16 for the
    // number of subsequent values.
    //
    // Total: 32 bytes for the RowIDs enum, 2 bytes for the container's
    // starting value (65536), and 3 runs of 4 bytes each.
    let mut ids = RowIDs::Bitmap((65536..65536 + 100).collect());
    ids.add_range(65536 + 101, 65536 + 201);
    ids.add_range(65536 + 300, 65536 + 400);
    ids.optimise_storage();
    assert_eq!(ids.size(), 32 + 2 + (3 * 4));

    // Case 3: every even number in the domain [2*2^16, 3*2^16].
    // Dense enough for a bitset container: 2^16 bits (8,192 bytes),
    // plus 32 bytes for the RowIDs enum.
    let ids = RowIDs::Bitmap(((2 * 65536)..(3 * 65536)).step_by(2).collect());
    assert_eq!(ids.size(), 32 + 8192);
}
#[test]
fn from_arrow_string_array_column_meta() {
let cases = vec![

View File

@ -1040,7 +1040,7 @@ mod test {
// * index row ids: (bitmaps) is (4 * 4) + (204b for bitmaps) == 220
// * run lengths: (40 * 8) == 320
//
assert_eq!(enc.size(true), 1594);
assert_eq!(enc.size(true), 1544);
}
#[test]

View File

@ -2551,7 +2551,7 @@ mod tests {
("svr_id", "1"),
])
.histogram()
.sample_sum_eq(3820.0)
.sample_sum_eq(3706.0)
.unwrap();
let rb = collect_read_filter(&rb_chunk).await;