From d80e71ad8626e63c0feb287c4db764be7a649209 Mon Sep 17 00:00:00 2001
From: Edd Robinson <me@edd.io>
Date: Fri, 14 May 2021 10:34:54 +0100
Subject: [PATCH] feat: add new metric to track raw size

---
 read_buffer/src/chunk.rs          | 164 ++++++++++++++++++------------
 read_buffer/src/column.rs         |  12 ++-
 read_buffer/src/column/boolean.rs |   2 +
 read_buffer/src/column/float.rs   |   2 +
 read_buffer/src/column/integer.rs |   2 +
 read_buffer/src/column/string.rs  |   2 +
 6 files changed, 114 insertions(+), 70 deletions(-)

diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs
index b3286e7289..8f7bf88193 100644
--- a/read_buffer/src/chunk.rs
+++ b/read_buffer/src/chunk.rs
@@ -663,6 +663,26 @@ impl Chunk {
                 .column_bytes_total
                 .add_with_labels(stat.bytes as f64 * sign, labels);
 
+            // update estimated raw (uncompressed) bytes attributed to NULL values
+            self.metrics.column_raw_bytes_total.add_with_labels(
+                (stat.raw_bytes as f64 - stat.raw_bytes_no_null as f64) * sign,
+                &[
+                    KeyValue::new("encoding", stat.enc_type),
+                    KeyValue::new("log_data_type", stat.log_data_type),
+                    KeyValue::new("null", "true"),
+                ],
+            );
+
+            // update estimated raw (uncompressed) bytes attributed to non-NULL values
+            self.metrics.column_raw_bytes_total.add_with_labels(
+                stat.raw_bytes_no_null as f64 * sign,
+                &[
+                    KeyValue::new("encoding", stat.enc_type),
+                    KeyValue::new("log_data_type", stat.log_data_type),
+                    KeyValue::new("null", "false"),
+                ],
+            );
+
             // update number of NULL values
             self.metrics.column_values_total.add_with_labels(
                 stat.nulls as f64 * sign,
@@ -703,28 +723,16 @@ pub struct ChunkMetrics {
 
     // This metric tracks the total number of bytes used by read buffer columns
     column_bytes_total: metrics::Gauge,
+
+    // This metric tracks an estimated uncompressed data size for read buffer
+    // columns, further segmented by nullness. It is a building block for
+    // tracking a measure of overall compression.
+    column_raw_bytes_total: metrics::Gauge,
 }
 
 impl ChunkMetrics {
     pub fn new(registry: &MetricRegistry) -> Self {
-        let domain = registry.register_domain("read_buffer");
-        Self {
-            columns_total: domain.register_gauge_metric(
-                "column",
-                Some("total"),
-                "The number of columns within the Read Buffer",
-            ),
-            column_values_total: domain.register_gauge_metric(
-                "column",
-                Some("values"),
-                "The number of values within columns in the Read Buffer",
-            ),
-            column_bytes_total: domain.register_gauge_metric(
-                "column",
-                Some("bytes"),
-                "The number of bytes used by all columns in the Read Buffer",
-            ),
-        }
+        Self::new_with_db(registry, String::new())
     }
 
     pub fn new_with_db(registry: &MetricRegistry, db: String) -> Self {
@@ -746,6 +754,12 @@ impl ChunkMetrics {
                 "column",
                 Some("bytes"),
                 "The number of bytes used by all columns in the Read Buffer",
+                vec![metrics::KeyValue::new("db", db.clone())],
+            ),
+            column_raw_bytes_total: domain.register_gauge_metric_with_labels(
+                "column_raw",
+                Some("bytes"),
+                "The number of bytes used by all columns if they were uncompressed in the Read Buffer",
                 vec![metrics::KeyValue::new("db", db)],
             ),
         }
@@ -948,7 +962,7 @@ mod test {
     #[test]
     fn add_remove_tables() {
         let reg = metrics::TestMetricRegistry::new(Arc::new(metrics::MetricRegistry::new()));
-        let metrics = ChunkMetrics::new(&reg.registry());
+        let metrics = ChunkMetrics::new_with_db(&reg.registry(), "mydb".to_string());
         let chunk = Chunk::new(22, Arc::new(metrics));
 
         // Add a new table to the chunk.
@@ -1009,29 +1023,39 @@ mod test {
         assert_eq!(
             String::from_utf8(reg.registry().metrics_as_text()).unwrap(),
             vec![
-                "# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer",
-                "# TYPE read_buffer_column_bytes gauge",
-                r#"read_buffer_column_bytes{encoding="BT_U32",log_data_type="i64"} 108"#,
-                r#"read_buffer_column_bytes{encoding="None",log_data_type="bool"} 1152"#,
-                r#"read_buffer_column_bytes{encoding="None",log_data_type="f64"} 1176"#,
-                r#"read_buffer_column_bytes{encoding="RLE",log_data_type="string"} 750"#,
-                r#"# HELP read_buffer_column_total The number of columns within the Read Buffer"#,
-                r#"# TYPE read_buffer_column_total gauge"#,
-                r#"read_buffer_column_total{encoding="BT_U32",log_data_type="i64"} 3"#,
-                r#"read_buffer_column_total{encoding="None",log_data_type="bool"} 3"#,
-                r#"read_buffer_column_total{encoding="None",log_data_type="f64"} 6"#,
-                r#"read_buffer_column_total{encoding="RLE",log_data_type="string"} 3"#,
-                r#"# HELP read_buffer_column_values The number of values within columns in the Read Buffer"#,
-                r#"# TYPE read_buffer_column_values gauge"#,
-                r#"read_buffer_column_values{encoding="BT_U32",log_data_type="i64",null="false"} 9"#,
-                r#"read_buffer_column_values{encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="bool",null="false"} 9"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="bool",null="true"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="f64",null="false"} 15"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="f64",null="true"} 3"#,
-                r#"read_buffer_column_values{encoding="RLE",log_data_type="string",null="false"} 9"#,
-                r#"read_buffer_column_values{encoding="RLE",log_data_type="string",null="true"} 0"#,
-                "",
+        "# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer",
+        "# TYPE read_buffer_column_bytes gauge",
+        r#"read_buffer_column_bytes{db="mydb",encoding="BT_U32",log_data_type="i64"} 108"#,
+        r#"read_buffer_column_bytes{db="mydb",encoding="None",log_data_type="bool"} 1152"#,
+        r#"read_buffer_column_bytes{db="mydb",encoding="None",log_data_type="f64"} 1176"#,
+        r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 750"#,
+        r#"# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer"#,
+        r#"# TYPE read_buffer_column_raw_bytes gauge"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32",log_data_type="i64",null="false"} 144"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="bool",null="false"} 81"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="bool",null="true"} 0"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="f64",null="false"} 264"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="f64",null="true"} 24"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="RLE",log_data_type="string",null="false"} 324"#,
+        r#"read_buffer_column_raw_bytes{db="mydb",encoding="RLE",log_data_type="string",null="true"} 0"#,
+        r#"# HELP read_buffer_column_total The number of columns within the Read Buffer"#,
+        r#"# TYPE read_buffer_column_total gauge"#,
+        r#"read_buffer_column_total{db="mydb",encoding="BT_U32",log_data_type="i64"} 3"#,
+        r#"read_buffer_column_total{db="mydb",encoding="None",log_data_type="bool"} 3"#,
+        r#"read_buffer_column_total{db="mydb",encoding="None",log_data_type="f64"} 6"#,
+        r#"read_buffer_column_total{db="mydb",encoding="RLE",log_data_type="string"} 3"#,
+        r#"# HELP read_buffer_column_values The number of values within columns in the Read Buffer"#,
+        r#"# TYPE read_buffer_column_values gauge"#,
+        r#"read_buffer_column_values{db="mydb",encoding="BT_U32",log_data_type="i64",null="false"} 9"#,
+        r#"read_buffer_column_values{db="mydb",encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
+        r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="bool",null="false"} 9"#,
+        r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="bool",null="true"} 0"#,
+        r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="f64",null="false"} 15"#,
+        r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="f64",null="true"} 3"#,
+        r#"read_buffer_column_values{db="mydb",encoding="RLE",log_data_type="string",null="false"} 9"#,
+        r#"read_buffer_column_values{db="mydb",encoding="RLE",log_data_type="string",null="true"} 0"#,
+        "",
             ]
             .join("\n")
         );
@@ -1041,29 +1065,39 @@ mod test {
         assert_eq!(
             String::from_utf8(reg.registry().metrics_as_text()).unwrap(),
             vec![
-                "# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer",
-                "# TYPE read_buffer_column_bytes gauge",
-                r#"read_buffer_column_bytes{encoding="BT_U32",log_data_type="i64"} 0"#,
-                r#"read_buffer_column_bytes{encoding="None",log_data_type="bool"} 0"#,
-                r#"read_buffer_column_bytes{encoding="None",log_data_type="f64"} 0"#,
-                r#"read_buffer_column_bytes{encoding="RLE",log_data_type="string"} 0"#,
-                r#"# HELP read_buffer_column_total The number of columns within the Read Buffer"#,
-                r#"# TYPE read_buffer_column_total gauge"#,
-                r#"read_buffer_column_total{encoding="BT_U32",log_data_type="i64"} 0"#,
-                r#"read_buffer_column_total{encoding="None",log_data_type="bool"} 0"#,
-                r#"read_buffer_column_total{encoding="None",log_data_type="f64"} 0"#,
-                r#"read_buffer_column_total{encoding="RLE",log_data_type="string"} 0"#,
-                r#"# HELP read_buffer_column_values The number of values within columns in the Read Buffer"#,
-                r#"# TYPE read_buffer_column_values gauge"#,
-                r#"read_buffer_column_values{encoding="BT_U32",log_data_type="i64",null="false"} 0"#,
-                r#"read_buffer_column_values{encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="bool",null="false"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="bool",null="true"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="f64",null="false"} 0"#,
-                r#"read_buffer_column_values{encoding="None",log_data_type="f64",null="true"} 0"#,
-                r#"read_buffer_column_values{encoding="RLE",log_data_type="string",null="false"} 0"#,
-                r#"read_buffer_column_values{encoding="RLE",log_data_type="string",null="true"} 0"#,
-                "",
+            "# HELP read_buffer_column_bytes The number of bytes used by all columns in the Read Buffer",
+            "# TYPE read_buffer_column_bytes gauge",
+            r#"read_buffer_column_bytes{db="mydb",encoding="BT_U32",log_data_type="i64"} 0"#,
+            r#"read_buffer_column_bytes{db="mydb",encoding="None",log_data_type="bool"} 0"#,
+            r#"read_buffer_column_bytes{db="mydb",encoding="None",log_data_type="f64"} 0"#,
+            r#"read_buffer_column_bytes{db="mydb",encoding="RLE",log_data_type="string"} 0"#,
+            r#"# HELP read_buffer_column_raw_bytes The number of bytes used by all columns if they were uncompressed in the Read Buffer"#,
+            r#"# TYPE read_buffer_column_raw_bytes gauge"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32",log_data_type="i64",null="false"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="bool",null="false"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="bool",null="true"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="f64",null="false"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="None",log_data_type="f64",null="true"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="RLE",log_data_type="string",null="false"} 0"#,
+            r#"read_buffer_column_raw_bytes{db="mydb",encoding="RLE",log_data_type="string",null="true"} 0"#,
+            r#"# HELP read_buffer_column_total The number of columns within the Read Buffer"#,
+            r#"# TYPE read_buffer_column_total gauge"#,
+            r#"read_buffer_column_total{db="mydb",encoding="BT_U32",log_data_type="i64"} 0"#,
+            r#"read_buffer_column_total{db="mydb",encoding="None",log_data_type="bool"} 0"#,
+            r#"read_buffer_column_total{db="mydb",encoding="None",log_data_type="f64"} 0"#,
+            r#"read_buffer_column_total{db="mydb",encoding="RLE",log_data_type="string"} 0"#,
+            r#"# HELP read_buffer_column_values The number of values within columns in the Read Buffer"#,
+            r#"# TYPE read_buffer_column_values gauge"#,
+            r#"read_buffer_column_values{db="mydb",encoding="BT_U32",log_data_type="i64",null="false"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="BT_U32",log_data_type="i64",null="true"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="bool",null="false"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="bool",null="true"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="f64",null="false"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="None",log_data_type="f64",null="true"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="RLE",log_data_type="string",null="false"} 0"#,
+            r#"read_buffer_column_values{db="mydb",encoding="RLE",log_data_type="string",null="true"} 0"#,
+            "",
             ]
             .join("\n")
         );
diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs
index 3b140aad37..2a78bc0309 100644
--- a/read_buffer/src/column.rs
+++ b/read_buffer/src/column.rs
@@ -1343,11 +1343,13 @@ impl Iterator for RowIDsIterator<'_> {
 
 // Statistics about the composition of a column
 pub(crate) struct Statistics {
-    pub enc_type: &'static str,
-    pub log_data_type: &'static str,
-    pub values: u32,
-    pub nulls: u32,
-    pub bytes: usize,
+    pub enc_type: &'static str,      // The encoding type (the "encoding" metric label)
+    pub log_data_type: &'static str, // The logical data-type (the "log_data_type" label)
+    pub values: u32,                 // Number of values present (NULL and non-NULL)
+    pub nulls: u32,                  // Number of NULL values present
+    pub bytes: usize,                // Total size of the column data, in bytes
+    pub raw_bytes: usize,            // Estimated "uncompressed" size in bytes, NULLs included
+    pub raw_bytes_no_null: usize,    // Estimated "uncompressed" size in bytes, ignoring NULLs
 }
 
 #[cfg(test)]
diff --git a/read_buffer/src/column/boolean.rs b/read_buffer/src/column/boolean.rs
index 47e70ca9b6..26d6570543 100644
--- a/read_buffer/src/column/boolean.rs
+++ b/read_buffer/src/column/boolean.rs
@@ -39,6 +39,8 @@ impl BooleanEncoding {
             values: self.num_rows(),
             nulls: self.null_count(),
             bytes: self.size(),
+            raw_bytes: self.size_raw(true),
+            raw_bytes_no_null: self.size_raw(false),
         }
     }
 
diff --git a/read_buffer/src/column/float.rs b/read_buffer/src/column/float.rs
index be4fb43abd..c9d3967a58 100644
--- a/read_buffer/src/column/float.rs
+++ b/read_buffer/src/column/float.rs
@@ -49,6 +49,8 @@ impl FloatEncoding {
             values: self.num_rows(),
             nulls: self.null_count(),
             bytes: self.size(),
+            raw_bytes: self.size_raw(true),
+            raw_bytes_no_null: self.size_raw(false),
         }
     }
 
diff --git a/read_buffer/src/column/integer.rs b/read_buffer/src/column/integer.rs
index 4e6cbadb79..aaf03dbc85 100644
--- a/read_buffer/src/column/integer.rs
+++ b/read_buffer/src/column/integer.rs
@@ -99,6 +99,8 @@ impl IntegerEncoding {
             values: self.num_rows(),
             nulls: self.null_count(),
             bytes: self.size(),
+            raw_bytes: self.size_raw(true),
+            raw_bytes_no_null: self.size_raw(false),
         }
     }
 
diff --git a/read_buffer/src/column/string.rs b/read_buffer/src/column/string.rs
index a0a200e547..7f34f327cd 100644
--- a/read_buffer/src/column/string.rs
+++ b/read_buffer/src/column/string.rs
@@ -80,6 +80,8 @@ impl StringEncoding {
             values: self.num_rows(),
             nulls: self.null_count(),
             bytes: self.size(),
+            raw_bytes: self.size_raw(true),
+            raw_bytes_no_null: self.size_raw(false),
         }
     }