From 78d3749af50d392a861ebb6e1b82ce87ac3db60d Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 10:45:14 +0100 Subject: [PATCH 1/6] feat: size dictionary encoding by allocated space --- read_buffer/src/column/encoding/string.rs | 2 +- .../src/column/encoding/string/dictionary.rs | 65 +++++++++++++------ read_buffer/src/column/string.rs | 2 +- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/read_buffer/src/column/encoding/string.rs b/read_buffer/src/column/encoding/string.rs index 85975c9980..42574e669e 100644 --- a/read_buffer/src/column/encoding/string.rs +++ b/read_buffer/src/column/encoding/string.rs @@ -31,7 +31,7 @@ impl Encoding { pub fn size(&self) -> usize { match &self { Self::RLE(enc) => enc.size(), - Self::Plain(enc) => enc.size(), + Self::Plain(enc) => enc.size(false), } } diff --git a/read_buffer/src/column/encoding/string/dictionary.rs b/read_buffer/src/column/encoding/string/dictionary.rs index fdb29a9141..d5c418cdf9 100644 --- a/read_buffer/src/column/encoding/string/dictionary.rs +++ b/read_buffer/src/column/encoding/string/dictionary.rs @@ -47,7 +47,7 @@ impl Default for Dictionary { } impl Dictionary { - /// Initialises an Dictionar encoding with a set of logical values. + /// Initialises a Dictionary encoding with a set of logical values. /// Creating an encoding using `with_dictionary` ensures that the dictionary /// is in the correct order, and will allow values to be inserted with any /// value in the dictionary. @@ -61,22 +61,33 @@ impl Dictionary { } /// A reasonable estimation of the on-heap size this encoding takes up. - pub fn size(&self) -> usize { - // the total size of all decoded values in the column. - let decoded_keys_size = self + /// If `buffers` is true then all allocated buffers in the encoding are + /// accounted for. + pub fn size(&self, buffers: bool) -> usize { + let base_size = size_of::(); + + // Total size of all decoded values in the column. 
+ let mut decoded_keys_size = self .entries .iter() .map(|k| match k { - Some(v) => v.len(), + Some(v) => v.len(), None => 0, } + size_of::>()) .sum::(); - let entries_size = size_of::>>() + decoded_keys_size; - let encoded_ids_size = size_of::>() + (size_of::() * self.encoded_data.len()); + if buffers { + decoded_keys_size += + (self.entries.capacity() - self.entries.len()) * size_of::>(); + } - // + 1 for contains_null field - entries_size + encoded_ids_size + 1 + let encoded_ids_size = size_of::() + * match buffers { + true => self.encoded_data.capacity(), + false => self.encoded_data.len(), + }; + + base_size + decoded_keys_size + encoded_ids_size } /// A reasonable estimation of the on-heap size of the underlying string @@ -837,7 +848,7 @@ impl std::fmt::Display for Dictionary { f, "[{}] size: {:?} rows: {:?} cardinality: {}", ENCODING_NAME, - self.size(), + self.size(false), self.num_rows(), self.cardinality(), ) @@ -873,17 +884,13 @@ mod test { enc.push_none(); enc.push_none(); - // keys - 14 bytes. - - // 3 string entries in dictionary - // entries is 24 + (24*4) + 14 == 134 - + // Self - 24+24+8 = 56 bytes (two vectors, a bool and padding) + // 4 string entries (inc NULL) in vec = 4 * 24 = 96 + // 3 string entries with length 4+5+5 = 14 // 15 rows. 
- // encoded ids is 24 + (4 * 15) == 84 - - // 134 + 84 + 1 == 219 - - assert_eq!(enc.size(), 219); + // encoded ids is (4 * 15) == 60 + // 56 + 96 + 14 + 60 = 226 + assert_eq!(enc.size(false), 226); // check dictionary assert_eq!( @@ -899,6 +906,24 @@ mod test { enc.encoded_data, vec![1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 3, NULL_ID, NULL_ID, NULL_ID, NULL_ID] ); + + // check for allocated size + let mut enc = Dictionary::default(); + enc.encoded_data.reserve_exact(40); + enc.entries.reserve_exact(39); // account for already-allocated NULL element + enc.push_additional(Some("east".to_string()), 3); + enc.push_additional(Some("north".to_string()), 1); + enc.push_additional(Some("east".to_string()), 5); + enc.push_additional(Some("south".to_string()), 2); + enc.push_additional(None, 4); + + // Self - 24+24+8 = 56 bytes (two vectors, a bool and padding) + // 40 string entries (inc NULL) in vec = 40 * 24 = 960 + // 3 string entries with lengths 4+5+5 = 14 + // 15 rows but 40 elements allocated + // encoded ids is (40 * 4) == 160 + // 56 + 960 + 14 + 160 = 1190 + assert_eq!(enc.size(true), 1190); } #[test] diff --git a/read_buffer/src/column/string.rs b/read_buffer/src/column/string.rs index 87649b9cd5..709ccfe259 100644 --- a/read_buffer/src/column/string.rs +++ b/read_buffer/src/column/string.rs @@ -31,7 +31,7 @@ impl StringEncoding { pub fn size(&self) -> usize { match self { Self::RleDictionary(enc) => enc.size(), - Self::Dictionary(enc) => enc.size(), + Self::Dictionary(enc) => enc.size(false), } } From b4f8e854f63dad21ddbae286576b4a0679d331e5 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 10:46:31 +0100 Subject: [PATCH 2/6] feat: size rle string encoding by allocated buffers --- read_buffer/src/chunk.rs | 2 +- read_buffer/src/column/encoding/string.rs | 2 +- read_buffer/src/column/encoding/string/rle.rs | 75 ++++++++++++------- read_buffer/src/column/string.rs | 2 +- 4 files changed, 49 insertions(+), 32 deletions(-) diff --git 
a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index e3f2f22645..db3bf8bdf9 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -662,7 +662,7 @@ mod test { r#"read_buffer_column_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 800"#, r#"read_buffer_column_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 96"#, r#"read_buffer_column_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#, - r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 500"#, + r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 506"#, "# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer", "# TYPE read_buffer_column_raw_bytes gauge", r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64",null="false"} 96"#, diff --git a/read_buffer/src/column/encoding/string.rs b/read_buffer/src/column/encoding/string.rs index 42574e669e..c16fc40390 100644 --- a/read_buffer/src/column/encoding/string.rs +++ b/read_buffer/src/column/encoding/string.rs @@ -30,7 +30,7 @@ impl Encoding { pub fn size(&self) -> usize { match &self { - Self::RLE(enc) => enc.size(), + Self::RLE(enc) => enc.size(false), Self::Plain(enc) => enc.size(false), } } diff --git a/read_buffer/src/column/encoding/string/rle.rs b/read_buffer/src/column/encoding/string/rle.rs index edb471b551..9818a1203b 100644 --- a/read_buffer/src/column/encoding/string/rle.rs +++ b/read_buffer/src/column/encoding/string/rle.rs @@ -3,8 +3,6 @@ use std::convert::From; use std::iter; use std::mem::size_of; -use croaring::Bitmap; - use arrow::array::{Array, StringArray}; use super::NULL_ID; @@ -75,13 +73,18 @@ impl RLE { } /// A reasonable estimation of the on-heap size this encoding takes up. - pub fn size(&self) -> usize { - // the total size of all decoded values in the column. 
- let decoded_keys_size = self.index_entries.iter().map(|k| k.len()).sum::(); + /// If `buffers` is true then the size of all allocated buffers in the + /// encoding are accounted for. + pub fn size(&self, buffers: bool) -> usize { + let base_size = size_of::(); - let index_entry_size = size_of::>() // container size - + (size_of::() * self.index_entries.len()) // elements size - + decoded_keys_size; // heap allocated strings size + let mut index_entries_size = size_of::() + * match buffers { + true => self.index_entries.capacity(), + false => self.index_entries.len(), + }; + // the total size of all decoded values in the column. + index_entries_size += self.index_entries.iter().map(|k| k.len()).sum::(); // The total size (an upper bound estimate) of all the bitmaps // in the column. @@ -91,14 +94,16 @@ impl RLE { .map(|row_ids| row_ids.size()) .sum::(); - let index_row_ids_size = size_of::>() - + (size_of::() * self.index_row_ids.len()) - + row_ids_bitmaps_size; + let index_row_ids_size = + (size_of::() * self.index_row_ids.len()) + row_ids_bitmaps_size; - let run_lengths_size = size_of::>() + // container size - (size_of::<(u32, u32)>() * self.run_lengths.len()); // each run-length size + let run_lengths_size = size_of::<(u32, u32)>() + * match buffers { + true => self.run_lengths.capacity(), + false => self.run_lengths.len(), + }; - index_entry_size + index_row_ids_size + run_lengths_size + 1 + 4 + base_size + index_entries_size + index_row_ids_size + run_lengths_size } /// A reasonable estimation of the on-heap size of the underlying string @@ -958,7 +963,7 @@ impl std::fmt::Display for RLE { f, "[{}] size: {:?} rows: {:?} cardinality: {}, nulls: {} runs: {} ", ENCODING_NAME, - self.size(), + self.size(false), self.num_rows, self.cardinality(), self.null_count(), @@ -1000,22 +1005,34 @@ mod test { enc.push_none(); enc.push_none(); - // Note: there are 4 index entries to account for NULL entry. 
- // `index_entry` is 24 + (24*4) + 14 == 134 + // * Self: 24 + 24 + 24 + 1 + (padding 3b) + 4 = 80b + // * index entries: (4) are is (24*4) + 14 == 110 + // * index row ids: (bitmaps) is (4 * 4) + (108b for bitmaps) == 124 + // * run lengths: (8*5) == 40 // - // bitmaps for east, north, south and NULL entries. - // `index_row_ids` is 24 + (4 * 4) + (108b for bitmaps) == 148 - // - // `run lengths` is 24 + (8*5) == 64 - // - // `contains_null` - 1 byte - // `num_rows` - 4 bytes - // - // 351 + // 354 + // assert_eq!(enc.size(false), 354); - // TODO(edd): there some mystery bytes in the bitmap implementation. - // need to figure out how to measure these - assert_eq!(enc.size(), 351); + // check allocated size + let mut enc = RLE::default(); + enc.index_entries.reserve_exact(39); // account for already-allocated NULL element + enc.run_lengths.reserve_exact(40); + + enc.push_additional(Some("east".to_string()), 3); + enc.push_additional(Some("north".to_string()), 1); + enc.push_additional(Some("east".to_string()), 5); + enc.push_additional(Some("south".to_string()), 2); + enc.push_none(); + enc.push_none(); + enc.push_none(); + enc.push_none(); + + // * Self: 24 + 24 + 24 + 1 + (padding 3b) + 4 = 80b + // * index entries: (40 * 24) + 14 == 974 + // * index row ids: (bitmaps) is (4 * 4) + (108b for bitmaps) == 124 + // * run lengths: (40 * 8) == 320 + // + assert_eq!(enc.size(true), 1498); } #[test] diff --git a/read_buffer/src/column/string.rs b/read_buffer/src/column/string.rs index 709ccfe259..6987e9d538 100644 --- a/read_buffer/src/column/string.rs +++ b/read_buffer/src/column/string.rs @@ -30,7 +30,7 @@ impl StringEncoding { /// The estimated total size in bytes of the in-memory columnar data. 
pub fn size(&self) -> usize { match self { - Self::RleDictionary(enc) => enc.size(), + Self::RleDictionary(enc) => enc.size(false), Self::Dictionary(enc) => enc.size(false), } } From 11349fa30dce66f6c7b10c8fa4ef72b004d659df Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 10:14:57 +0100 Subject: [PATCH 3/6] feat: add allocated size to bool --- read_buffer/src/column/boolean.rs | 2 +- read_buffer/src/column/encoding/bool.rs | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/read_buffer/src/column/boolean.rs b/read_buffer/src/column/boolean.rs index c568022fc1..a356b18622 100644 --- a/read_buffer/src/column/boolean.rs +++ b/read_buffer/src/column/boolean.rs @@ -11,7 +11,7 @@ impl BooleanEncoding { /// The total size in bytes of the store columnar data. pub fn size(&self) -> usize { match self { - Self::BooleanNull(enc) => enc.size(), + Self::BooleanNull(enc) => enc.size(false), } } diff --git a/read_buffer/src/column/encoding/bool.rs b/read_buffer/src/column/encoding/bool.rs index bd844633d9..5dd6891a2a 100644 --- a/read_buffer/src/column/encoding/bool.rs +++ b/read_buffer/src/column/encoding/bool.rs @@ -1,6 +1,7 @@ //! An encoding nullable bool, by an Arrow array. use std::cmp::Ordering; use std::fmt::Debug; +use std::mem::size_of; use arrow::array::{Array, BooleanArray}; use cmp::Operator; @@ -19,7 +20,7 @@ impl std::fmt::Display for Bool { "[Bool] rows: {:?}, nulls: {:?}, size: {}", self.arr.len(), self.arr.null_count(), - self.size() + self.size(false) ) } } @@ -42,8 +43,12 @@ impl Bool { /// Returns an estimation of the total size in bytes used by this column /// encoding. 
- pub fn size(&self) -> usize { - std::mem::size_of::() + self.arr.get_array_memory_size() + pub fn size(&self, buffers: bool) -> usize { + size_of::() + + match buffers { + true => self.arr.get_array_memory_size(), // includes buffer capacities + false => self.arr.get_buffer_memory_size(), + } } /// The estimated total size in bytes of the underlying bool values in the @@ -360,7 +365,8 @@ mod test { #[test] fn size() { let v = Bool::from(vec![None, None, Some(true), Some(false)].as_slice()); - assert_eq!(v.size(), 400); + assert_eq!(v.size(false), 256); + assert_eq!(v.size(true), 400); // includes allocated buffers } #[test] From 0e8b0edfc9e4fd432a45c425826997d87a29e777 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 10:43:21 +0100 Subject: [PATCH 4/6] feat: add buffer-based sizing for numerical encodings --- read_buffer/src/chunk.rs | 4 +-- read_buffer/src/column/encoding/scalar.rs | 6 ++-- .../src/column/encoding/scalar/fixed.rs | 25 +++++++++++--- .../src/column/encoding/scalar/fixed_null.rs | 13 +++++--- read_buffer/src/column/encoding/scalar/rle.rs | 33 ++++++++++++++----- read_buffer/src/column/float.rs | 2 +- read_buffer/src/column/integer.rs | 26 +++++++-------- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index db3bf8bdf9..fcc4638cdd 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -659,9 +659,9 @@ mod test { "# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer", "# TYPE read_buffer_column_bytes gauge", r#"read_buffer_column_bytes{db="mydb",encoding="BT_U32-FIXED",log_data_type="i64"} 72"#, - r#"read_buffer_column_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 800"#, + r#"read_buffer_column_bytes{db="mydb",encoding="FBT_U8-FIXEDN",log_data_type="f64"} 512"#, r#"read_buffer_column_bytes{db="mydb",encoding="FIXED",log_data_type="f64"} 96"#, - 
r#"read_buffer_column_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 672"#, + r#"read_buffer_column_bytes{db="mydb",encoding="FIXEDN",log_data_type="bool"} 384"#, r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 506"#, "# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer", "# TYPE read_buffer_column_raw_bytes gauge", diff --git a/read_buffer/src/column/encoding/scalar.rs b/read_buffer/src/column/encoding/scalar.rs index 282c3a6ec0..8789d8fdbb 100644 --- a/read_buffer/src/column/encoding/scalar.rs +++ b/read_buffer/src/column/encoding/scalar.rs @@ -18,8 +18,10 @@ pub trait ScalarEncoding: Debug + Display + Send + Sync { /// A useful name for the encoding, likely used in instrumentation. fn name(&self) -> &'static str; - /// The total size in bytes to store encoded data in memory. - fn size(&self) -> usize; + /// The total size in bytes to store encoded data in memory. If `buffers` + /// is true then the returned size should account for any allocated buffers + /// within the contained encoding structures. + fn size(&self, buffers: bool) -> usize; /// The estimated total size in bytes of the underlying encoded values if /// they were stored contiguously as a vector of `L`. `include_null` should diff --git a/read_buffer/src/column/encoding/scalar/fixed.rs b/read_buffer/src/column/encoding/scalar/fixed.rs index edb9013c71..d6689e97d9 100644 --- a/read_buffer/src/column/encoding/scalar/fixed.rs +++ b/read_buffer/src/column/encoding/scalar/fixed.rs @@ -53,7 +53,7 @@ where "[{}] rows: {:?}, size: {}", self.name(), self.num_rows(), - self.size() + self.size(false) ) } } @@ -252,9 +252,13 @@ where self.values.len() as u32 } - /// Encoded data size including `Self` - an "accurate" estimation. - fn size(&self) -> usize { - size_of::() + (size_of::
<P>
() * self.values.len()) + fn size(&self, buffers: bool) -> usize { + let values = size_of::
<P>
() + * match buffers { + true => self.values.capacity(), + false => self.values.len(), + }; + size_of::() + values } fn size_raw(&self, _: bool) -> usize { @@ -425,6 +429,19 @@ mod test { (Fixed::new(values, Arc::clone(&mock)), mock) } + #[test] + fn size() { + let (v, _) = new_encoding(vec![22_i64, 1, 18]); + // Self if 32 bytes and there are 3 * 8b values + assert_eq!(v.size(false), 56); + + // check pre-allocated sizing + let (mut v, _) = new_encoding(vec![]); + v.values.reserve_exact(40); + // Self if 32 bytes and there are 40 * 8b values allocated + assert_eq!(v.size(true), 352); + } + #[test] fn value() { let (v, transcoder) = new_encoding(vec![22, 1, 18]); diff --git a/read_buffer/src/column/encoding/scalar/fixed_null.rs b/read_buffer/src/column/encoding/scalar/fixed_null.rs index 2179e8f8f4..922f597a77 100644 --- a/read_buffer/src/column/encoding/scalar/fixed_null.rs +++ b/read_buffer/src/column/encoding/scalar/fixed_null.rs @@ -52,7 +52,7 @@ where self.name(), self.arr.len(), self.arr.null_count(), - self.size() + self.size(false) ) } } @@ -260,8 +260,12 @@ where self.arr.null_count() as u32 } - fn size(&self) -> usize { - size_of::() + self.arr.get_array_memory_size() + fn size(&self, buffers: bool) -> usize { + size_of::() + + match buffers { + true => self.arr.get_array_memory_size(), + false => self.arr.get_buffer_memory_size(), + } } /// The estimated total size in bytes of the underlying values in the @@ -478,7 +482,8 @@ mod test { #[test] fn size() { let (v, _) = new_encoding(vec![None, None, Some(100), Some(2222)]); - assert_eq!(v.size(), 408); + assert_eq!(v.size(false), 264); + assert_eq!(v.size(true), 408); // includes allocated buffers } #[test] diff --git a/read_buffer/src/column/encoding/scalar/rle.rs b/read_buffer/src/column/encoding/scalar/rle.rs index 2f6d8c2fab..fc8c1b3bc2 100644 --- a/read_buffer/src/column/encoding/scalar/rle.rs +++ b/read_buffer/src/column/encoding/scalar/rle.rs @@ -70,7 +70,7 @@ where f, "[{}] size: {:?} rows: {:?} 
nulls: {} runs: {} ", self.name(), - self.size(), + self.size(false), self.num_rows(), self.null_count(), self.run_lengths.len() @@ -343,8 +343,13 @@ where ENCODING_NAME } - fn size(&self) -> usize { - std::mem::size_of::() + (self.run_lengths.len() * size_of::<(u32, Option
<P>
)>()) + fn size(&self, buffers: bool) -> usize { + let values = size_of::<(u32, Option
<P>
)>() + * match buffers { + true => self.run_lengths.capacity(), + false => self.run_lengths.len(), + }; + std::mem::size_of::() + values } fn size_raw(&self, include_nulls: bool) -> usize { @@ -713,16 +718,26 @@ mod test { fn size() { let (mut enc, _) = new_encoding(vec![]); - // 40b Self + (0 rl * 24) = 32 - assert_eq!(enc.size(), 40); + // 40b Self + (0 rl * 24) = 40 + assert_eq!(enc.size(false), 40); enc.push_none(); - // 40b Self + (1 rl * 24) = 56 - assert_eq!(enc.size(), 64); + // 40b Self + (1 rl * 24) = 64 + assert_eq!(enc.size(false), 64); enc.push_additional_some(1, 10); - // 40b Self + (2 rl * 24) = 80 - assert_eq!(enc.size(), 88); + // 40b Self + (2 rl * 24) = 88 + assert_eq!(enc.size(false), 88); + + // check allocated buffer size + let (mut enc, _) = new_encoding(vec![]); + enc.run_lengths.reserve_exact(40); + // 40b Self + (40 rl * 24) = 1000b + assert_eq!(enc.size(true), 1000); + + // 40b Self + (40 rl * 24) = 1000b - no new allocations + enc.push_additional_some(1, 10); + assert_eq!(enc.size(true), 1000); } #[test] diff --git a/read_buffer/src/column/float.rs b/read_buffer/src/column/float.rs index 16ac473055..5f08e6fcd7 100644 --- a/read_buffer/src/column/float.rs +++ b/read_buffer/src/column/float.rs @@ -32,7 +32,7 @@ impl FloatEncoding { /// The total size in bytes of to store columnar data in memory. pub fn size(&self) -> usize { match self { - Self::F64(enc, _) => enc.size(), + Self::F64(enc, _) => enc.size(false), } } diff --git a/read_buffer/src/column/integer.rs b/read_buffer/src/column/integer.rs index 2118edf85a..3cd2dcd04d 100644 --- a/read_buffer/src/column/integer.rs +++ b/read_buffer/src/column/integer.rs @@ -27,8 +27,8 @@ impl IntegerEncoding { /// The total size in bytes of the store columnar data. 
pub fn size(&self) -> usize { match self { - Self::I64(enc, _) => enc.size(), - Self::U64(enc, _) => enc.size(), + Self::I64(enc, _) => enc.size(false), + Self::U64(enc, _) => enc.size(false), } } @@ -971,13 +971,13 @@ mod test { // Input data containing NULL will be stored in an Arrow array encoding let cases = vec![ - (vec![None, Some(0_i64)], 400_usize), // u8 Arrow array - (vec![None, Some(-120_i64)], 400), // i8 - (vec![None, Some(399_i64)], 400), // u16 - (vec![None, Some(-399_i64)], 400), // i16 - (vec![None, Some(u32::MAX as i64)], 400), // u32 - (vec![None, Some(i32::MIN as i64)], 400), // i32 - (vec![None, Some(u32::MAX as i64 + 1)], 400), //u64 + (vec![None, Some(0_i64)], 256_usize), // u8 Arrow array + (vec![None, Some(-120_i64)], 256), // i8 + (vec![None, Some(399_i64)], 256), // u16 + (vec![None, Some(-399_i64)], 256), // i16 + (vec![None, Some(u32::MAX as i64)], 256), // u32 + (vec![None, Some(i32::MIN as i64)], 256), // i32 + (vec![None, Some(u32::MAX as i64 + 1)], 256), //u64 ]; for (case, name) in cases.iter().cloned() { @@ -1163,10 +1163,10 @@ mod test { // Input data containing NULL will be stored in an Arrow array encoding let cases = vec![ - (vec![None, Some(0_u64)], 400_usize), - (vec![None, Some(399_u64)], 400), - (vec![None, Some(u32::MAX as u64)], 400), - (vec![None, Some(u64::MAX)], 400), + (vec![None, Some(0_u64)], 256_usize), + (vec![None, Some(399_u64)], 256), + (vec![None, Some(u32::MAX as u64)], 256), + (vec![None, Some(u64::MAX)], 256), ]; for (case, size) in cases.iter().cloned() { From c68bbb630940da08a15f711be3189f972ae3862f Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 11:21:28 +0100 Subject: [PATCH 5/6] test: update test --- query_tests/src/sql.rs | 8 ++++---- server/src/db.rs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 1ba61c3b05..e554ba3681 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -369,10 +369,10 
@@ async fn sql_select_from_system_chunk_columns() { "+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", "| partition_key | chunk_id | table_name | column_name | storage | row_count | null_count | min_value | max_value | memory_bytes |", "+---------------+----------+------------+-------------+-------------------+-----------+------------+-----------+-----------+--------------+", - "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 252 |", - "| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 425 |", - "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 240 |", - "| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 425 |", + "| 1970-01-01T00 | 0 | h2o | city | ReadBuffer | 2 | 0 | Boston | Boston | 255 |", + "| 1970-01-01T00 | 0 | h2o | other_temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 281 |", + "| 1970-01-01T00 | 0 | h2o | state | ReadBuffer | 2 | 0 | MA | MA | 243 |", + "| 1970-01-01T00 | 0 | h2o | temp | ReadBuffer | 2 | 1 | 70.4 | 70.4 | 281 |", "| 1970-01-01T00 | 0 | h2o | time | ReadBuffer | 2 | 0 | 50 | 250 | 51 |", "| 1970-01-01T00 | 0 | o2 | city | OpenMutableBuffer | 2 | 1 | Boston | Boston | 35 |", "| 1970-01-01T00 | 0 | o2 | reading | OpenMutableBuffer | 2 | 1 | 51 | 51 | 25 |", diff --git a/server/src/db.rs b/server/src/db.rs index ae79f37ff9..337d36a0ca 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -2531,7 +2531,7 @@ mod tests { ("svr_id", "1"), ]) .histogram() - .sample_sum_eq(3191.0) + .sample_sum_eq(3197.0) .unwrap(); let rb = collect_read_filter(&rb_chunk).await; @@ -3400,7 +3400,7 @@ mod tests { id: 2, storage: ChunkStorage::ReadBufferAndObjectStore, lifecycle_action, - memory_bytes: 3284, // size of RB and OS chunks + memory_bytes: 3140, // size of RB and OS chunks object_store_bytes: 1577, // size of parquet file row_count: 2, time_of_last_access: None, @@ -3451,7 
+3451,7 @@ mod tests { } assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 87); - assert_eq!(db.catalog.metrics().memory().read_buffer(), 2410); + assert_eq!(db.catalog.metrics().memory().read_buffer(), 2266); assert_eq!(db.catalog.metrics().memory().object_store(), 874); } From e78aebdf1969d6431831c74ac3e071f3c702d2ff Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 12 Aug 2021 15:57:01 +0100 Subject: [PATCH 6/6] refactor: update read_buffer/src/column/encoding/scalar/fixed.rs Co-authored-by: Andrew Lamb --- read_buffer/src/column/encoding/scalar/fixed.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read_buffer/src/column/encoding/scalar/fixed.rs b/read_buffer/src/column/encoding/scalar/fixed.rs index d6689e97d9..6cf6b0b63e 100644 --- a/read_buffer/src/column/encoding/scalar/fixed.rs +++ b/read_buffer/src/column/encoding/scalar/fixed.rs @@ -432,7 +432,7 @@ mod test { #[test] fn size() { let (v, _) = new_encoding(vec![22_i64, 1, 18]); - // Self if 32 bytes and there are 3 * 8b values + // Self is 32 bytes and there are 3 * 8b values assert_eq!(v.size(false), 56); // check pre-allocated sizing