feat: improve bitset size estimation
parent
1daa30cc7d
commit
b9f09fce49
|
@ -369,9 +369,9 @@ async fn sql_select_from_system_chunk_columns() {
|
|||
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
|
||||
"| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |",
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+",
|
||||
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 375 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 351 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 363 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 339 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |",
|
||||
"| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 110 |",
|
||||
"| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |",
|
||||
|
|
|
@ -694,7 +694,7 @@ mod test {
|
|||
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 906"#,
|
||||
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 186"#,
|
||||
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#,
|
||||
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 730"#,
|
||||
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 664"#,
|
||||
"# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer",
|
||||
"# TYPE read_buffer_column_raw_bytes gauge",
|
||||
r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#,
|
||||
|
|
|
@ -1275,7 +1275,12 @@ impl RowIDs {
|
|||
pub fn size(&self) -> usize {
|
||||
std::mem::size_of::<Self>()
|
||||
+ match self {
|
||||
Self::Bitmap(bm) => bm.get_serialized_size_in_bytes(),
|
||||
Self::Bitmap(bm) => {
|
||||
let stats = bm.statistics();
|
||||
(stats.n_bytes_array_containers
|
||||
+ stats.n_bytes_bitset_containers
|
||||
+ stats.n_bytes_run_containers) as usize
|
||||
}
|
||||
Self::Vector(v) => std::mem::size_of::<u32>() * v.len(),
|
||||
}
|
||||
}
|
||||
|
@ -1319,6 +1324,12 @@ impl RowIDs {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn optimise_storage(&mut self) {
|
||||
if let Self::Bitmap(ids) = self {
|
||||
ids.run_optimize();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
match self {
|
||||
Self::Bitmap(ids) => ids.clear(),
|
||||
|
@ -1420,6 +1431,41 @@ mod test {
|
|||
assert_eq!(row_ids.to_vec(), vec![2, 3, 4]);
|
||||
}
|
||||
|
||||
#[test]
fn row_ids_size() {
    // Size expectations follow the example roaring bitmaps in the paper:
    // "Roaring Bitmaps: Implementation of an Optimized Software Library",
    // Daniel Lemire et al.

    // Bytes needed for the RowIDs enum itself.
    const ENUM_BYTES: usize = 32;

    // The first 1000 multiples of 62 in the domain [0, 61938]. These are
    // stored as 1000 16-bit values in a sorted array container, requiring
    // 2,000 bytes.
    let array_ids = RowIDs::Bitmap((0..61939).step_by(62).collect());
    assert_eq!(array_ids.size(), ENUM_BYTES + 2000);

    // Runs of values in the domain [2^16, 2^16 + 100), then
    // [2^16 + 101, 2^16 + 201), then [2^16 + 300, 2^16 + 400): 300 values
    // altogether, stored as 3 run lengths (starting value plus number of
    // subsequent values).
    //
    // Note: the paper says 3 * 16 bits suffice for the runs, but each run
    // actually needs 32 bits — 16 for the value and 16 for the run length.
    //
    // So: ENUM_BYTES for the enum, 2 bytes for the starting value of the
    // container (65536), and 3 runs of 4 bytes each.
    let mut run_ids = RowIDs::Bitmap((65536..65536 + 100).collect());
    run_ids.add_range(65536 + 101, 65536 + 201);
    run_ids.add_range(65536 + 300, 65536 + 400);
    run_ids.optimise_storage();
    assert_eq!(run_ids.size(), ENUM_BYTES + 2 + (3 * 4));

    // Every even number in the domain [2*2^16, 3*2^16]. These are stored
    // in a bitset container: 2^16 bits, i.e. 8,192 bytes.
    let bitset_ids = RowIDs::Bitmap(((2 * 65536)..(3 * 65536)).step_by(2).collect());
    assert_eq!(bitset_ids.size(), ENUM_BYTES + 8192);
}
|
||||
|
||||
#[test]
|
||||
fn from_arrow_string_array_column_meta() {
|
||||
let cases = vec![
|
||||
|
|
|
@ -1040,7 +1040,7 @@ mod test {
|
|||
// * index row ids: (bitmaps) is (4 * 4) + (204b for bitmaps) == 220
|
||||
// * run lengths: (40 * 8) == 320
|
||||
//
|
||||
assert_eq!(enc.size(true), 1594);
|
||||
assert_eq!(enc.size(true), 1544);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -2551,7 +2551,7 @@ mod tests {
|
|||
("svr_id", "1"),
|
||||
])
|
||||
.histogram()
|
||||
.sample_sum_eq(3820.0)
|
||||
.sample_sum_eq(3706.0)
|
||||
.unwrap();
|
||||
|
||||
let rb = collect_read_filter(&rb_chunk).await;
|
||||
|
|
Loading…
Reference in New Issue