From 1daa30cc7d4b58c24fb1ec231b1485c5ae8dc8b3 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 17 Aug 2021 20:45:03 +0100 Subject: [PATCH 1/2] fix: include enum in sizing --- query_tests/src/sql.rs | 4 ++-- read_buffer/src/chunk.rs | 2 +- read_buffer/src/column.rs | 9 ++++----- read_buffer/src/column/encoding/string/rle.rs | 4 ++-- server/src/db.rs | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 0d91727cbd..24e935d324 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -369,9 +369,9 @@ async fn sql_select_from_system_chunk_columns() { "+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", "| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |", "+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", - "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 327 |", + "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 375 |", "| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |", - "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 315 |", + "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 363 |", "| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |", "| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 110 |", "| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |", diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index 3f347be6ca..6c5019eb99 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -694,7 +694,7 @@ mod test { 
r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 906"#, r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 186"#, r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#, - r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 586"#, + r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 730"#, "# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer", "# TYPE read_buffer_column_raw_bytes gauge", r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#, diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index a0430422bf..6ab905bfa6 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -1273,12 +1273,11 @@ impl RowIDs { /// An estimation of the size in bytes needed to store `self`. pub fn size(&self) -> usize { - match self { - Self::Bitmap(bm) => std::mem::size_of::<Bitmap>() + bm.get_serialized_size_in_bytes(), - Self::Vector(v) => { - std::mem::size_of::<Vec<u32>>() + (std::mem::size_of::<u32>() * v.len()) + std::mem::size_of::<Self>() + + match self { + Self::Bitmap(bm) => bm.get_serialized_size_in_bytes(), + Self::Vector(v) => std::mem::size_of::<u32>() * v.len(), } - } } /// Returns an iterator over the contents of the RowIDs. 
diff --git a/read_buffer/src/column/encoding/string/rle.rs b/read_buffer/src/column/encoding/string/rle.rs index f9f665ac0c..9900a3558b 100644 --- a/read_buffer/src/column/encoding/string/rle.rs +++ b/read_buffer/src/column/encoding/string/rle.rs @@ -1037,10 +1037,10 @@ mod test { // * Self: 24 + 24 + 24 + 1 + (padding 3b) + 4 = 80b // * index entries: (40 * 24) + 14 == 974 - // * index row ids: (bitmaps) is (4 * 4) + (108b for bitmaps) == 124 + // * index row ids: (bitmaps) is (4 * 4) + (204b for bitmaps) == 220 // * run lengths: (40 * 8) == 320 // - assert_eq!(enc.size(true), 1498); + assert_eq!(enc.size(true), 1594); } #[test] diff --git a/server/src/db.rs b/server/src/db.rs index 454b24df60..76715d9f04 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -2551,7 +2551,7 @@ mod tests { ("svr_id", "1"), ]) .histogram() - .sample_sum_eq(3628.0) + .sample_sum_eq(3820.0) .unwrap(); let rb = collect_read_filter(&rb_chunk).await; From b9f09fce4942f0156d30195bbb5cd4e245f75c92 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 17 Aug 2021 22:23:14 +0100 Subject: [PATCH 2/2] feat: improve bitset size estimation --- query_tests/src/sql.rs | 4 +- read_buffer/src/chunk.rs | 2 +- read_buffer/src/column.rs | 48 ++++++++++++++++++- read_buffer/src/column/encoding/string/rle.rs | 2 +- server/src/db.rs | 2 +- 5 files changed, 52 insertions(+), 6 deletions(-) diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 24e935d324..fb1e426be6 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -369,9 +369,9 @@ async fn sql_select_from_system_chunk_columns() { "+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", "| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |", 
"+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", - "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 375 |", + "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 351 |", "| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |", - "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 363 |", + "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 339 |", "| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 471 |", "| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 110 |", "| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |", diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index 6c5019eb99..175d07136e 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -694,7 +694,7 @@ mod test { r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 906"#, r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 186"#, r#"read_buffer_column_allocated_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#, - r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 730"#, + r#"read_buffer_column_allocated_bytes{db="mydb",encoding="RLE",log_data_type="string"} 664"#, "# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer", "# TYPE read_buffer_column_raw_bytes gauge", r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#, diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index 6ab905bfa6..07ad961949 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -1275,7 +1275,12 @@ impl RowIDs { pub fn size(&self) -> usize { 
std::mem::size_of::<Self>() + match self { - Self::Bitmap(bm) => bm.get_serialized_size_in_bytes(), + Self::Bitmap(bm) => { + let stats = bm.statistics(); + (stats.n_bytes_array_containers + + stats.n_bytes_bitset_containers + + stats.n_bytes_run_containers) as usize + } Self::Vector(v) => std::mem::size_of::<u32>() * v.len(), } } @@ -1319,6 +1324,12 @@ impl RowIDs { } } + pub fn optimise_storage(&mut self) { + if let Self::Bitmap(ids) = self { + ids.run_optimize(); + } + } + pub fn clear(&mut self) { match self { Self::Bitmap(ids) => ids.clear(), @@ -1420,6 +1431,41 @@ mod test { assert_eq!(row_ids.to_vec(), vec![2, 3, 4]); } + #[test] + fn row_ids_size() { + // See example roaring bitmaps in the paper: + // Roaring Bitmaps: Implementation of an Optimized Software Library + // Daniel Lemire et al. + + // The first 1000 multiples of 62 in the domain [0, 61938]. + // These are stored as 1000 16-bit values in a sorted array container + // requiring 2,000 bytes. We need 32 bytes for the RowIDs enum. + let row_ids = RowIDs::Bitmap((0..61939).step_by(62).collect()); + assert_eq!(row_ids.size(), 32 + 2000); + + // Runs of values in the domain [2^16, 2^16 + 100) then [2^16 + 101, 2^16 + 201) + // and then [2^16 + 300, 2^16 + 400). Altogether 300 values stored in + // 3 run lengths (starting value and number of subsequent values). + // + // Note: the paper says we need 3 * 16 bits to store the runs but + // actually you need 3 * 32 bits because you need 16 bits to store the + // value and 16 bits to store the run. + // + // So we have 32 bytes for the RowIDs enum, 2 bytes for the starting + // value of the container (65536) and then 3 runs of 4 bytes each. + let mut row_ids = RowIDs::Bitmap((65536..65536 + 100).collect()); + row_ids.add_range(65536 + 101, 65536 + 201); + row_ids.add_range(65536 + 300, 65536 + 400); + row_ids.optimise_storage(); + assert_eq!(row_ids.size(), 32 + 2 + (3 * 4)); + + // All even numbers in the domain [2*2^16, 3*2^16]. 
+ // These will be stored using a bitset container. We need 32 bytes for + // the RowIDs enum and 2^16 bits (8,192 bytes) for the container. + let row_ids = RowIDs::Bitmap(((2 * 65536)..(3 * 65536)).step_by(2).collect()); + assert_eq!(row_ids.size(), 32 + 8192); + } + #[test] fn from_arrow_string_array_column_meta() { let cases = vec![ diff --git a/read_buffer/src/column/encoding/string/rle.rs b/read_buffer/src/column/encoding/string/rle.rs index 9900a3558b..92d139483a 100644 --- a/read_buffer/src/column/encoding/string/rle.rs +++ b/read_buffer/src/column/encoding/string/rle.rs @@ -1040,7 +1040,7 @@ mod test { // * index row ids: (bitmaps) is (4 * 4) + (204b for bitmaps) == 220 // * run lengths: (40 * 8) == 320 // - assert_eq!(enc.size(true), 1594); + assert_eq!(enc.size(true), 1544); } #[test] diff --git a/server/src/db.rs b/server/src/db.rs index 76715d9f04..a5eac9669d 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -2551,7 +2551,7 @@ mod tests { ("svr_id", "1"), ]) .histogram() - .sample_sum_eq(3820.0) + .sample_sum_eq(3706.0) .unwrap(); let rb = collect_read_filter(&rb_chunk).await;