From ea0ece8b4bc40913488d25b3bcb0d2030823908e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 2 Feb 2022 16:23:00 +0000 Subject: [PATCH 01/30] feat: issue read_filter request --- influxdb_iox/Cargo.toml | 1 + influxdb_iox/src/commands/storage.rs | 62 ++++++++++++++++++- influxdb_iox/src/commands/storage/request.rs | 33 ++++++++++ influxdb_iox/src/commands/storage/response.rs | 1 + influxdb_iox/src/main.rs | 3 +- 5 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 influxdb_iox/src/commands/storage/request.rs create mode 100644 influxdb_iox/src/commands/storage/response.rs diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index 5dbfbff2ef..62c169f37c 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -13,6 +13,7 @@ db = { path = "../db" } dml = { path = "../dml" } generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format", "write_lp"] } +influxdb_storage_client = { path = "../influxdb_storage_client" } influxdb_line_protocol = { path = "../influxdb_line_protocol" } ingester = { path = "../ingester" } internal_types = { path = "../internal_types" } diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index a6186f7f4c..739bf05e67 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -1,4 +1,9 @@ +pub(crate) mod request; + +use std::num::NonZeroU64; + use generated_types::Predicate; +use influxdb_storage_client::{connection::Connection, Client, OrgAndBucket}; use influxrpc_parser::predicate; use time; @@ -9,6 +14,9 @@ pub enum Error { #[snafu(display("Unable to parse timestamp '{:?}'", t))] TimestampParseError { t: String }, + #[snafu(display("Unable to parse database name '{:?}'", db_name))] + DBNameParseError { db_name: String }, + #[snafu(display("Unable to parse predicate: {:?}", source))] PredicateParseError { source: predicate::Error }, } @@ -21,6 +29,10 @@ pub struct Config { #[clap(subcommand)] command: Command, + /// The name of the database + #[clap(parse(try_from_str = parse_db_name))] + db_name: OrgAndBucket, + /// The requested start time (inclusive) of the time-range (also accepts RFC3339 format). #[clap(long, default_value = "-9223372036854775806", parse(try_from_str = parse_range))] start: i64, @@ -58,6 +70,34 @@ fn parse_predicate(expr: &str) -> Result { predicate::expr_to_rpc_predicate(expr).context(PredicateParseSnafu) } +// Attempts to parse the database name into and org and bucket ID. 
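+// A valid name has the form `<org_id>_<bucket_id>`, where both IDs are
+// non-zero u64 values encoded as hexadecimal strings.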
+fn parse_db_name(db_name: &str) -> Result { + let parts = db_name.split("_").collect::>(); + if parts.len() != 2 { + return DBNameParseSnafu { + db_name: db_name.to_owned(), + } + .fail(); + } + + let org_id = usize::from_str_radix(parts[0], 16).map_err(|_| Error::DBNameParseError { + db_name: db_name.to_owned(), + })?; + + let bucket_id = usize::from_str_radix(parts[1], 16).map_err(|_| Error::DBNameParseError { + db_name: db_name.to_owned(), + })?; + + Ok(OrgAndBucket::new( + NonZeroU64::new(org_id as u64).ok_or_else(|| Error::DBNameParseError { + db_name: db_name.to_owned(), + })?, + NonZeroU64::new(bucket_id as u64).ok_or_else(|| Error::DBNameParseError { + db_name: db_name.to_owned(), + })?, + )) +} + /// All possible subcommands for storage #[derive(Debug, clap::Parser)] enum Command { @@ -70,9 +110,25 @@ enum Command { struct ReadFilter {} /// Create and issue read request -pub async fn command(config: Config) -> Result<()> { - // TODO(edd): handle command/config and execute request - println!("Unimplemented: config is {:?}", config); +pub async fn command(connection: Connection, config: Config) -> Result<()> { + let mut client = influxdb_storage_client::Client::new(connection); + + // convert predicate with no root node into None. + let predicate = config.predicate.root.is_some().then(|| config.predicate); + + let source = Client::read_source(&config.db_name, 0); + let result = match config.command { + Command::ReadFilter(_) => { + client + .read_filter(request::read_filter( + source, + config.start, + config.stop, + predicate, + )) + .await + } + }; Ok(()) } diff --git a/influxdb_iox/src/commands/storage/request.rs b/influxdb_iox/src/commands/storage/request.rs new file mode 100644 index 0000000000..f0051b11e5 --- /dev/null +++ b/influxdb_iox/src/commands/storage/request.rs @@ -0,0 +1,33 @@ +pub mod generated_types { + pub use generated_types::influxdata::platform::storage::*; +} + +use self::generated_types::*; +use ::generated_types::google::protobuf::*; + +pub fn read_filter( + org_bucket: Any, + start: i64, + stop: i64, + predicate: std::option::Option, +) -> ReadFilterRequest { + generated_types::ReadFilterRequest { + predicate, + read_source: Some(org_bucket), + range: Some(TimestampRange { start, end: stop }), + key_sort: read_filter_request::KeySort::Unspecified as i32, // IOx doesn't support any other sort + tag_key_meta_names: TagKeyMetaNames::Text as i32, + } +} + +// TODO Add the following helpers for building requests: +// +// * read_group +// * read_window_aggregate +// * tag_keys +// * tag_values +// * tag_values_with_measurement_and_key +// * measurement_names +// * measurement_tag_keys +// * measurement_tag_values +// * measurement_fields diff --git a/influxdb_iox/src/commands/storage/response.rs b/influxdb_iox/src/commands/storage/response.rs new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/influxdb_iox/src/commands/storage/response.rs @@ -0,0 +1 @@ + diff --git a/influxdb_iox/src/main.rs b/influxdb_iox/src/main.rs index c039cbead7..752cc34f7d 100644 --- a/influxdb_iox/src/main.rs +++ b/influxdb_iox/src/main.rs @@ -264,7 +264,8 @@ fn main() -> Result<(), std::io::Error> { } Command::Storage(config) => { let _tracing_guard = handle_init_logs(init_simple_logs(log_verbose_count)); - if let Err(e) = commands::storage::command(config).await { + let connection = connection().await; + if let Err(e) = commands::storage::command(connection, config).await { eprintln!("{}", e); std::process::exit(ReturnCode::Failure as _) } From 
4cdaaf96bf1db60a9249df79a6dd0c2188325a79 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 2 Feb 2022 22:54:46 +0000 Subject: [PATCH 02/30] refactor: clean up errors --- influxdb_iox/src/commands/storage.rs | 31 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index 739bf05e67..9174404098 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -1,4 +1,5 @@ pub(crate) mod request; +pub(crate) mod response; use std::num::NonZeroU64; @@ -10,18 +11,18 @@ use time; use snafu::{ResultExt, Snafu}; #[derive(Debug, Snafu)] -pub enum Error { +pub enum ParseError { #[snafu(display("Unable to parse timestamp '{:?}'", t))] - TimestampParseError { t: String }, + Timestamp { t: String }, #[snafu(display("Unable to parse database name '{:?}'", db_name))] - DBNameParseError { db_name: String }, + DBName { db_name: String }, #[snafu(display("Unable to parse predicate: {:?}", source))] - PredicateParseError { source: predicate::Error }, + Predicate { source: predicate::Error }, } -pub type Result = std::result::Result; +pub type Result = std::result::Result; /// Craft and submit different types of storage read requests #[derive(Debug, clap::Parser)] @@ -49,12 +50,12 @@ pub struct Config { // Attempts to parse either a stringified `i64` value. or alternatively parse an // RFC3339 formatted timestamp into an `i64` value representing nanoseconds // since the epoch. -fn parse_range(s: &str) -> Result { +fn parse_range(s: &str) -> Result { match s.parse::() { Ok(v) => Ok(v), Err(_) => { // try to parse timestamp - let t = time::Time::from_rfc3339(s).or_else(|_| TimestampParseSnafu { t: s }.fail())?; + let t = time::Time::from_rfc3339(s).or_else(|_| TimestampSnafu { t: s }.fail())?; Ok(t.timestamp_nanos()) } } @@ -62,37 +63,37 @@ fn parse_range(s: &str) -> Result { // Attempts to parse the optional predicate into an `Predicate` RPC node. This // node is then used as part of a read request. -fn parse_predicate(expr: &str) -> Result { +fn parse_predicate(expr: &str) -> Result { if expr.is_empty() { return Ok(Predicate::default()); } - predicate::expr_to_rpc_predicate(expr).context(PredicateParseSnafu) + predicate::expr_to_rpc_predicate(expr).context(PredicateSnafu) } // Attempts to parse the database name into and org and bucket ID. 
-fn parse_db_name(db_name: &str) -> Result { +fn parse_db_name(db_name: &str) -> Result { let parts = db_name.split("_").collect::>(); if parts.len() != 2 { - return DBNameParseSnafu { + return DBNameSnafu { db_name: db_name.to_owned(), } .fail(); } - let org_id = usize::from_str_radix(parts[0], 16).map_err(|_| Error::DBNameParseError { + let org_id = usize::from_str_radix(parts[0], 16).map_err(|_| ParseError::DBName { db_name: db_name.to_owned(), })?; - let bucket_id = usize::from_str_radix(parts[1], 16).map_err(|_| Error::DBNameParseError { + let bucket_id = usize::from_str_radix(parts[1], 16).map_err(|_| ParseError::DBName { db_name: db_name.to_owned(), })?; Ok(OrgAndBucket::new( - NonZeroU64::new(org_id as u64).ok_or_else(|| Error::DBNameParseError { + NonZeroU64::new(org_id as u64).ok_or_else(|| ParseError::DBName { db_name: db_name.to_owned(), })?, - NonZeroU64::new(bucket_id as u64).ok_or_else(|| Error::DBNameParseError { + NonZeroU64::new(bucket_id as u64).ok_or_else(|| ParseError::DBName { db_name: db_name.to_owned(), })?, )) From d328b37803dd2b407a7e726bf1d46b3d51478bb7 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 4 Feb 2022 16:12:52 +0000 Subject: [PATCH 03/30] feat: teach IOx to convert RPC frames into Recordbatches --- influxdb_iox/Cargo.toml | 1 + influxdb_iox/src/commands/storage.rs | 2 +- influxdb_iox/src/commands/storage/response.rs | 762 ++++++++++++++++++ 3 files changed, 764 insertions(+), 1 deletion(-) diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index 62c169f37c..7b98ada88b 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -37,6 +37,7 @@ query = { path = "../query" } read_buffer = { path = "../read_buffer" } router = { path = "../router" } router2 = { path = "../router2" } +schema = { path = "../schema" } server = { path = "../server" } time = { path = "../time" } trace = { path = "../trace" } diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index 9174404098..456ab84090 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -73,7 +73,7 @@ fn parse_predicate(expr: &str) -> Result { // Attempts to parse the database name into and org and bucket ID. 
fn parse_db_name(db_name: &str) -> Result { - let parts = db_name.split("_").collect::>(); + let parts = db_name.split('_').collect::>(); if parts.len() != 2 { return DBNameSnafu { db_name: db_name.to_owned(), diff --git a/influxdb_iox/src/commands/storage/response.rs b/influxdb_iox/src/commands/storage/response.rs index 8b13789179..5c44468354 100644 --- a/influxdb_iox/src/commands/storage/response.rs +++ b/influxdb_iox/src/commands/storage/response.rs @@ -1 +1,763 @@ +use arrow::record_batch::RecordBatch; +use hashbrown::HashMap; +use std::{ + collections::{BTreeMap, BTreeSet}, + iter, + string::FromUtf8Error, + sync::Arc, +}; +use generated_types::{ + read_response::{frame::Data, DataType, SeriesFrame}, + Tag, +}; +use schema::{builder::SchemaBuilder, InfluxColumnType, InfluxFieldType, Schema}; +use snafu::{ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("arrow error: {:?}", source))] + ArrowError { source: arrow::error::ArrowError }, + + #[snafu(display("frame type currently unsupported: {:?}", frame))] + UnsupportedFrameType { frame: String }, + + #[snafu(display("tag keys must be valid UTF-8: {:?}", source))] + InvalidTagKey { source: FromUtf8Error }, + + #[snafu(display("tag values must be valid UTF-8: {:?}", source))] + InvalidTagValue { source: FromUtf8Error }, + + #[snafu(display("measurement name must be valid UTF-8: {:?}", source))] + InvalidMeasurementName { source: FromUtf8Error }, + + #[snafu(display("unable to build schema: {:?}", source))] + SchemaBuildingError { source: schema::builder::Error }, +} + +pub type Result = std::result::Result; + +// This function takes a set of InfluxRPC data frames and converts them into an +// Arrow record batches, which are suitable for pretty printing. +fn into_record_batches(frames: &[Data]) -> Result> { + // Run through all the frames once to build the schema of each table we need + // to build as a record batch. + let mut table_column_mapping = determine_tag_columns(frames); + + let mut all_tables = BTreeMap::new(); + let mut current_table_frame: Option<(IntermediateTable, SeriesFrame)> = None; + + for frame in frames { + match frame { + generated_types::read_response::frame::Data::Group(_) => { + return UnsupportedFrameTypeSnafu { + frame: "group_frame".to_owned(), + } + .fail(); + } + generated_types::read_response::frame::Data::Series(sf) => { + let cur_frame_measurement = &sf.tags[0].value; + + // First series frame in result set. + if current_table_frame.is_none() { + let table = IntermediateTable::try_new( + table_column_mapping + .remove(cur_frame_measurement) + .expect("table column mappings exists for measurement"), + )?; + + current_table_frame = Some((table, sf.clone())); + continue; + } + + // Subsequent series frames in results. + let (mut current_table, prev_series_frame) = current_table_frame.take().unwrap(); + + // Series frame has moved on to a different measurement. Push + // this table into a record batch and onto final results, then + // create a new table. + if measurement(&prev_series_frame) != cur_frame_measurement { + let rb: RecordBatch = current_table.try_into()?; + all_tables.insert( + String::from_utf8(measurement(&prev_series_frame).to_owned()) + .context(InvalidMeasurementNameSnafu)?, + rb, + ); + + // Initialise next intermediate table to fill. 
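+                    // The column mapping is removed rather than cloned because all series
+                    // frames for a given measurement are assumed to arrive contiguously.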
+ current_table = IntermediateTable::try_new( + table_column_mapping + .remove(cur_frame_measurement) + .expect("table column mappings exists for measurement"), + )?; + } + + // Put current table (which may have been replaced with a new + // table if _measurement has changed) and series frame back. The + // field key can change on each series frame, so it's important + // to update it each time we see a new series frame, so that the + // value frames know where to push their data. + current_table_frame = Some((current_table, sf.clone())); + + // no new column values written so no need to pad. + continue; + } + generated_types::read_response::frame::Data::FloatPoints(f) => { + // Get field key associated with previous series frame. + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let column = current_table.field_column(field_name(prev_series_frame)); + + let values = f.values.iter().copied().map(Some).collect::>(); + column.extend_f64(&values); + + let time_column = &mut current_table.time_column; + time_column.extend_from_slice(&f.timestamps); + } + generated_types::read_response::frame::Data::IntegerPoints(f) => { + // Get field key associated with previous series frame. + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let column = current_table.field_column(field_name(prev_series_frame)); + + let values = f.values.iter().copied().map(Some).collect::>(); + column.extend_i64(&values); + + let time_column = &mut current_table.time_column; + time_column.extend_from_slice(&f.timestamps); + } + generated_types::read_response::frame::Data::UnsignedPoints(f) => { + // Get field key associated with previous series frame. + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let column = current_table.field_column(field_name(prev_series_frame)); + + let values = f.values.iter().copied().map(Some).collect::>(); + column.extend_u64(&values); + + let time_column = &mut current_table.time_column; + time_column.extend_from_slice(&f.timestamps); + } + generated_types::read_response::frame::Data::BooleanPoints(f) => { + // Get field key associated with previous series frame. + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let column = current_table.field_column(field_name(prev_series_frame)); + + let values = f.values.iter().copied().map(Some).collect::>(); + column.extend_bool(&values); + + let time_column = &mut current_table.time_column; + time_column.extend_from_slice(&f.timestamps); + } + generated_types::read_response::frame::Data::StringPoints(f) => { + // Get field key associated with previous series frame. + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let column = current_table.field_column(field_name(prev_series_frame)); + + let values = f + .values + .iter() + .map(|x| Some(x.to_owned())) + .collect::>(); + column.extend_string(&values); + + let time_column = &mut current_table.time_column; + time_column.extend_from_slice(&f.timestamps); + } + }; + + // If the current frame contained field values/timestamps then we need + // pad all the other columns with either values or NULL so that all + // columns remain the same length. + // + let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap(); + let max_rows = current_table.max_rows(); + + // Pad all tag columns with keys present in the previous series frame + // with identical values. 
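+        // A series frame's tag values apply to every row produced by the value
+        // frames that follow it, so repeat the value rather than padding with NULL.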
+ for Tag { key, value } in &prev_series_frame.tags { + if tag_key_is_measurement(key) || tag_key_is_field(key) { + continue; + } + + let idx = current_table + .tag_columns + .get(key) + .expect("tag column mapping to be present"); + + let column = &mut current_table.column_data[*idx]; + let column_rows = column.len(); + assert!(max_rows >= column_rows); + column.pad_tag( + String::from_utf8(value.to_owned()).context(InvalidTagValueSnafu)?, + max_rows - column_rows, + ); + } + + // Pad all tag columns that were not present in the previous series + // frame with NULL. + for (_, &idx) in ¤t_table.tag_columns { + let column = &mut current_table.column_data[idx]; + let column_rows = column.len(); + if column_rows < max_rows { + column.pad_none(max_rows - column_rows); + } + } + + // Pad all field columns with NULL such that they're the same length as + // the largest column. + for (_, &idx) in ¤t_table.field_columns { + let column = &mut current_table.column_data[idx]; + let column_rows = column.len(); + if column_rows < max_rows { + column.pad_none(max_rows - column_rows); + } + } + } + + // Convert and insert current table + let (current_table, prev_series_frame) = current_table_frame.take().unwrap(); + let rb: RecordBatch = current_table.try_into()?; + all_tables.insert( + String::from_utf8(measurement(&prev_series_frame).to_owned()) + .context(InvalidMeasurementNameSnafu)?, + rb, + ); + + Ok(all_tables) +} + +#[derive(Debug)] +enum ColumnData { + Float(Vec>), + Integer(Vec>), + Unsigned(Vec>), + Boolean(Vec>), + String(Vec>), + Tag(Vec>), +} + +impl ColumnData { + fn pad_tag(&mut self, value: String, additional: usize) { + if let Self::Tag(data) = self { + data.extend(iter::repeat(Some(value)).take(additional)); + } else { + unreachable!("can't pad strings into {:?} column", self) + } + } + + fn pad_none(&mut self, additional: usize) { + match self { + ColumnData::Float(data) => data.extend(iter::repeat(None).take(additional)), + ColumnData::Integer(data) => data.extend(iter::repeat(None).take(additional)), + ColumnData::Unsigned(data) => data.extend(iter::repeat(None).take(additional)), + ColumnData::Boolean(data) => data.extend(iter::repeat(None).take(additional)), + ColumnData::String(data) => data.extend(iter::repeat(None).take(additional)), + ColumnData::Tag(data) => data.extend(iter::repeat(None).take(additional)), + } + } + + fn extend_f64(&mut self, arr: &[Option]) { + if let Self::Float(data) = self { + data.extend_from_slice(arr); + } else { + unreachable!("can't extend {:?} column with floats", self) + } + } + + fn extend_i64(&mut self, arr: &[Option]) { + if let Self::Integer(data) = self { + data.extend_from_slice(arr); + } else { + unreachable!("can't extend {:?} column with integers", self) + } + } + + fn extend_u64(&mut self, arr: &[Option]) { + if let Self::Unsigned(data) = self { + data.extend_from_slice(arr); + } else { + unreachable!("can't extend {:?} column with unsigned integers", self) + } + } + + fn extend_bool(&mut self, arr: &[Option]) { + if let Self::Boolean(data) = self { + data.extend_from_slice(arr); + } else { + unreachable!("can't extend {:?} column with bools", self) + } + } + + fn extend_string(&mut self, arr: &[Option]) { + if let Self::String(data) = self { + data.extend_from_slice(arr); + } else { + unreachable!("can't extend {:?} column with strings", self) + } + } + + fn len(&self) -> usize { + match self { + ColumnData::Float(arr) => arr.len(), + ColumnData::Integer(arr) => arr.len(), + ColumnData::Unsigned(arr) => arr.len(), + 
ColumnData::Boolean(arr) => arr.len(), + ColumnData::String(arr) => arr.len(), + ColumnData::Tag(arr) => arr.len(), + } + } +} + +#[derive(Debug)] +struct IntermediateTable { + schema: Schema, + + // constant-time access to the correct column from a tag or field key + tag_columns: HashMap, usize>, + field_columns: HashMap, usize>, + + column_data: Vec, + time_column: Vec, +} + +impl IntermediateTable { + fn try_new(table_columns: TableColumns) -> Result { + let mut schema_builder = SchemaBuilder::new(); + let mut tag_columns = HashMap::new(); + let mut field_columns = HashMap::new(); + let mut column_data = vec![]; + + // First add the tag columns to the schema and column data. + for tag_key in table_columns.tag_columns { + let column_name = String::from_utf8(tag_key.clone()).context(InvalidTagKeySnafu)?; + schema_builder.influx_column(&column_name, InfluxColumnType::Tag); + + // track position of column + tag_columns.insert(tag_key, column_data.len()); + column_data.push(ColumnData::Tag(vec![])); + } + + // Then add the field columns to the schema and column data. + for (field_key, data_type) in table_columns.field_columns { + let column_name = String::from_utf8(field_key.clone()).context(InvalidTagKeySnafu)?; + schema_builder.influx_column( + &column_name, + InfluxColumnType::Field(match data_type { + DataType::Float => InfluxFieldType::Float, + DataType::Integer => InfluxFieldType::Integer, + DataType::Unsigned => InfluxFieldType::UInteger, + DataType::Boolean => InfluxFieldType::Boolean, + DataType::String => InfluxFieldType::String, + }), + ); + + // track position of column + field_columns.insert(field_key, column_data.len()); + column_data.push(match data_type { + DataType::Float => ColumnData::Float(vec![]), + DataType::Integer => ColumnData::Integer(vec![]), + DataType::Unsigned => ColumnData::Unsigned(vec![]), + DataType::Boolean => ColumnData::Boolean(vec![]), + DataType::String => ColumnData::String(vec![]), + }); + } + + // Finally add the timestamp column. + schema_builder.influx_column("time", InfluxColumnType::Timestamp); + let time_column = vec![]; + + Ok(Self { + schema: schema_builder.build().context(SchemaBuildingSnafu)?, + tag_columns, + field_columns, + column_data, + time_column, + }) + } + + fn field_column(&mut self, field: &[u8]) -> &mut ColumnData { + let idx = self + .field_columns + .get(field) + .expect("field column mapping to be present"); + + &mut self.column_data[*idx] + } + + // Returns the number of rows in the largest column. Useful for padding the + // rest of the columns out. 
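+    // Note: only the tag and field columns held in `column_data` are inspected;
+    // the separately stored time column grows as value frames are processed.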
+ fn max_rows(&self) -> usize { + self.column_data + .iter() + .map(|c| c.len()) + .max() + .unwrap_or_default() + } +} + +impl TryFrom for RecordBatch { + type Error = Error; + + fn try_from(table: IntermediateTable) -> Result { + let arrow_schema: arrow::datatypes::SchemaRef = table.schema.into(); + + let mut rb_columns: Vec> = + Vec::with_capacity(&table.column_data.len() + 1); // + time column + + for col in table.column_data { + match col { + ColumnData::Integer(v) => { + rb_columns.push(Arc::new(arrow::array::Int64Array::from(v))); + } + ColumnData::Unsigned(v) => { + rb_columns.push(Arc::new(arrow::array::UInt64Array::from(v))); + } + ColumnData::Float(v) => { + rb_columns.push(Arc::new(arrow::array::Float64Array::from(v))); + } + ColumnData::String(v) => { + rb_columns.push(Arc::new(arrow::array::StringArray::from( + v.iter().map(|s| s.as_deref()).collect::>(), + ))); + } + ColumnData::Boolean(v) => { + rb_columns.push(Arc::new(arrow::array::BooleanArray::from(v))); + } + ColumnData::Tag(v) => { + rb_columns.push(Arc::new(arrow::array::DictionaryArray::< + arrow::datatypes::Int32Type, + >::from_iter( + v.iter().map(|s| s.as_deref()) + ))); + } + } + } + + // time column + rb_columns.push(Arc::new(arrow::array::TimestampNanosecondArray::from( + table.time_column, + ))); + + Self::try_new(arrow_schema, rb_columns).context(ArrowSnafu) + } +} + +// These constants describe known values for the keys associated with +// measurements and fields. +const MEASUREMENT_TAG_KEY_TEXT: [u8; 12] = [ + b'_', b'm', b'e', b'a', b's', b'u', b'r', b'e', b'm', b'e', b'n', b't', +]; +const MEASUREMENT_TAG_KEY_BIN: [u8; 1] = [0_u8]; +const FIELD_TAG_KEY_TEXT: [u8; 6] = [b'_', b'f', b'i', b'e', b'l', b'd']; +const FIELD_TAG_KEY: [u8; 1] = [255_u8]; + +// Store a collection of column names and types for a single table (measurement). +#[derive(Debug, Default, PartialEq, Eq)] +struct TableColumns { + tag_columns: BTreeSet>, + field_columns: BTreeMap, DataType>, +} + +// Given a set of data frames determine from the series frames within the set +// of tag columns for each distinct table (measurement). +fn determine_tag_columns(frames: &[Data]) -> BTreeMap, TableColumns> { + let mut schema: BTreeMap, TableColumns> = BTreeMap::new(); + for frame in frames { + if let Data::Series(sf) = frame { + assert!(!sf.tags.is_empty(), "expected _measurement and _field tags"); + + assert!(tag_key_is_measurement(&sf.tags[0].key)); + // PERF: avoid clone of value + let measurement_name = sf.tags[0].value.clone(); + let table = schema.entry(measurement_name).or_default(); + + for Tag { key, value } in sf.tags.iter().skip(1) { + if tag_key_is_field(key) { + table.field_columns.insert(value.clone(), sf.data_type()); + continue; + } + + // PERF: avoid clone of key + table.tag_columns.insert(key.clone()); // Add column to table schema + } + } + } + schema +} + +// Extract a reference to the measurement name from a Series frame. +fn measurement(frame: &SeriesFrame) -> &Vec { + assert!(tag_key_is_measurement(&frame.tags[0].key)); + &frame.tags[0].value +} + +// Extract a reference to the field name from a Series frame. 
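+// Assumes the `_field` tag is the last entry in the frame's tag set, which
+// the assertion below enforces.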
+fn field_name(frame: &SeriesFrame) -> &Vec { + let idx = frame.tags.len() - 1; + assert!(tag_key_is_field(&frame.tags[idx].key)); + &frame.tags[idx].value +} + +fn tag_key_is_measurement(key: &[u8]) -> bool { + (key == MEASUREMENT_TAG_KEY_TEXT) || (key == MEASUREMENT_TAG_KEY_BIN) +} + +fn tag_key_is_field(key: &[u8]) -> bool { + (key == FIELD_TAG_KEY_TEXT) || (key == FIELD_TAG_KEY) +} + +#[cfg(test)] +mod test_super { + use arrow::util::pretty::pretty_format_batches; + use generated_types::read_response::{ + BooleanPointsFrame, FloatPointsFrame, IntegerPointsFrame, SeriesFrame, StringPointsFrame, + UnsignedPointsFrame, + }; + + use super::*; + + // converts a vector of key/value pairs into a vector of `Tag`. + fn make_tags(pairs: &[(&str, &str)]) -> Vec { + pairs + .iter() + .map(|(key, value)| Tag { + key: key.as_bytes().to_vec(), + value: value.as_bytes().to_vec(), + }) + .collect::>() + } + + struct TableColumnInput<'a> { + measurement: &'a str, + tags: &'a [&'a str], + fields: &'a [(&'a str, DataType)], + } + + impl<'a> TableColumnInput<'a> { + fn new(measurement: &'a str, tags: &'a [&str], fields: &'a [(&str, DataType)]) -> Self { + Self { + measurement, + tags, + fields, + } + } + } + + // converts a vector of key/value tag pairs and a field datatype into a + // collection of `TableColumns` objects. + fn make_table_columns(input: &'_ [TableColumnInput<'_>]) -> BTreeMap, TableColumns> { + let mut all_table_columns = BTreeMap::new(); + for TableColumnInput { + measurement, + tags, + fields, + } in input + { + let tag_columns = tags + .iter() + .map(|c| c.as_bytes().to_vec()) + .collect::>>(); + + let mut tag_columns_set = BTreeSet::new(); + for c in tag_columns { + tag_columns_set.insert(c); + } + + let mut field_columns = BTreeMap::new(); + for (field, data_type) in *fields { + field_columns.insert(field.as_bytes().to_vec(), *data_type); + } + + let table_columns = TableColumns { + tag_columns: tag_columns_set, + field_columns, + }; + + all_table_columns.insert(measurement.as_bytes().to_vec(), table_columns); + } + all_table_columns + } + + // generate a substantial set of frames across multiple tables. 
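+    // The frames cover two measurements ("cpu" and "another table") and span
+    // float, integer, unsigned, boolean and string value types.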
+ fn gen_frames() -> Vec { + vec![ + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "cpu"), + ("host", "foo"), + ("server", "a"), + ("_field", "temp"), + ]), + data_type: DataType::Float as i32, + }), + Data::FloatPoints(FloatPointsFrame { + timestamps: vec![1, 2, 3, 4], + values: vec![1.1, 2.2, 3.3, 4.4], + }), + Data::FloatPoints(FloatPointsFrame { + timestamps: vec![5, 6, 7, 10], + values: vec![5.1, 5.2, 5.3, 10.4], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "cpu"), + ("host", "foo"), + ("server", "a"), + ("_field", "voltage"), + ]), + data_type: DataType::Integer as i32, + }), + Data::IntegerPoints(IntegerPointsFrame { + timestamps: vec![1, 2], + values: vec![22, 22], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "cpu"), + ("host", "foo"), + ("new_column", "a"), + ("_field", "voltage"), + ]), + data_type: DataType::Integer as i32, + }), + Data::IntegerPoints(IntegerPointsFrame { + timestamps: vec![100, 200], + values: vec![1000, 2000], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[("_measurement", "another table"), ("_field", "voltage")]), + data_type: DataType::String as i32, + }), + Data::StringPoints(StringPointsFrame { + timestamps: vec![200, 201], + values: vec!["hello".to_string(), "abc".to_string()], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "another table"), + ("region", "west"), + ("_field", "voltage"), + ]), + data_type: DataType::String as i32, + }), + Data::StringPoints(StringPointsFrame { + timestamps: vec![302, 304], + values: vec!["foo".to_string(), "bar".to_string()], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "another table"), + ("region", "north"), + ("_field", "bool_field"), + ]), + data_type: DataType::Boolean as i32, + }), + Data::BooleanPoints(BooleanPointsFrame { + timestamps: vec![1000], + values: vec![true], + }), + Data::Series(SeriesFrame { + tags: make_tags(&[ + ("_measurement", "another table"), + ("region", "south"), + ("_field", "unsigned_field"), + ]), + data_type: DataType::Unsigned as i32, + }), + Data::UnsignedPoints(UnsignedPointsFrame { + timestamps: vec![2000], + values: vec![600], + }), + ] + } + + #[test] + fn test_determine_tag_columns() { + assert!(determine_tag_columns(&[]).is_empty()); + + let frame = Data::Series(SeriesFrame { + tags: make_tags(&[("_measurement", "cpu"), ("server", "a"), ("_field", "temp")]), + data_type: DataType::Float as i32, + }); + + let exp = make_table_columns(&[TableColumnInput::new( + "cpu", + &["server"], + &[("temp", DataType::Float)], + )]); + assert_eq!(determine_tag_columns(&[frame]), exp); + + // larger example + let frames = gen_frames(); + + let exp = make_table_columns(&[ + TableColumnInput::new( + "cpu", + &["host", "new_column", "server"], + &[("temp", DataType::Float), ("voltage", DataType::Integer)], + ), + TableColumnInput::new( + "another table", + &["region"], + &[("voltage", DataType::String)], + ), + ]); + assert_eq!(determine_tag_columns(&frames), exp); + } + + #[test] + fn test_into_record_batches() { + let frames = gen_frames(); + + let rbs = into_record_batches(&frames); + let exp = vec![ + ( + "another table", + vec![ + "+--------+------------+----------------+---------+-------------------------------+", + "| region | bool_field | unsigned_field | voltage | time |", + "+--------+------------+----------------+---------+-------------------------------+", + "| | | | hello | 1970-01-01 00:00:00.000000200 |", + "| | | | abc | 1970-01-01 
00:00:00.000000201 |", + "| west | | | foo | 1970-01-01 00:00:00.000000302 |", + "| west | | | bar | 1970-01-01 00:00:00.000000304 |", + "| north | true | | | 1970-01-01 00:00:00.000001 |", + "| south | | 600 | | 1970-01-01 00:00:00.000002 |", + "+--------+------------+----------------+---------+-------------------------------+", + ], + ), + ( + "cpu", + vec![ + "+------+------------+--------+------+---------+-------------------------------+", + "| host | new_column | server | temp | voltage | time |", + "+------+------------+--------+------+---------+-------------------------------+", + "| foo | | a | 1.1 | | 1970-01-01 00:00:00.000000001 |", + "| foo | | a | 2.2 | | 1970-01-01 00:00:00.000000002 |", + "| foo | | a | 3.3 | | 1970-01-01 00:00:00.000000003 |", + "| foo | | a | 4.4 | | 1970-01-01 00:00:00.000000004 |", + "| foo | | a | 5.1 | | 1970-01-01 00:00:00.000000005 |", + "| foo | | a | 5.2 | | 1970-01-01 00:00:00.000000006 |", + "| foo | | a | 5.3 | | 1970-01-01 00:00:00.000000007 |", + "| foo | | a | 10.4 | | 1970-01-01 00:00:00.000000010 |", + "| foo | | a | | 22 | 1970-01-01 00:00:00.000000001 |", + "| foo | | a | | 22 | 1970-01-01 00:00:00.000000002 |", + "| foo | a | | | 1000 | 1970-01-01 00:00:00.000000100 |", + "| foo | a | | | 2000 | 1970-01-01 00:00:00.000000200 |", + "+------+------------+--------+------+---------+-------------------------------+", + ], + ), + ] + .into_iter() + .map(|(k, v)| (k.to_owned(), v.join("\n"))) + .collect::>(); + + let got = rbs + .unwrap() + .into_iter() + .map(|(k, v)| { + let table: String = pretty_format_batches(&[v]).unwrap().to_string(); + (k, table) + }) + .collect::>(); + assert_eq!(got, exp); + } +} From a63a617ccafad6ff2fa118f7b63e525ed1c29e31 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 4 Feb 2022 16:25:25 -0500 Subject: [PATCH 04/30] test: Add logging to make `db` tests more debuggable (#3643) * test: enable logging in db tests * test: log when check passed * fix: facepalm Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/replay.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/db/src/replay.rs b/db/src/replay.rs index 650b55030a..ebc2f87f5b 100644 --- a/db/src/replay.rs +++ b/db/src/replay.rs @@ -981,6 +981,7 @@ mod tests { #[tokio::test] async fn replay_ok_two_partitions_persist_second() { + test_helpers::maybe_start_logging(); // acts as regression test for the following PRs: // - https://github.com/influxdata/influxdb_iox/pull/2079 // - https://github.com/influxdata/influxdb_iox/pull/2084 @@ -1087,6 +1088,7 @@ mod tests { #[tokio::test] async fn replay_ok_two_partitions_persist_first() { + test_helpers::maybe_start_logging(); // acts as regression test for the following PRs: // - https://github.com/influxdata/influxdb_iox/pull/2079 // - https://github.com/influxdata/influxdb_iox/pull/2084 @@ -1193,6 +1195,7 @@ mod tests { #[tokio::test] async fn replay_ok_nothing_to_replay() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ Step::Restart, @@ -1227,6 +1230,7 @@ mod tests { #[tokio::test] async fn replay_ok_different_sequencer_situations() { + test_helpers::maybe_start_logging(); // three sequencers: // 0: no data at all // 1: replay required, additional incoming data during downtime @@ -1338,6 +1342,7 @@ mod tests { #[tokio::test] async fn replay_ok_interleaved_writes() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ // let's ingest some data for two partitions a and b @@ -1581,6 +1586,7 @@ mod tests { #[tokio::test] 
async fn replay_compacts() { + test_helpers::maybe_start_logging(); let tracing_capture = TracingCapture::new(); // these numbers are handtuned to trigger hard buffer limits w/o making the test too big @@ -1635,6 +1641,7 @@ mod tests { #[tokio::test] async fn replay_prune_full_partition() { + test_helpers::maybe_start_logging(); // there the following entries: // // 0. table 2, partition a: @@ -1723,6 +1730,7 @@ mod tests { #[tokio::test] async fn replay_prune_some_sequences_partition() { + test_helpers::maybe_start_logging(); // there the following entries: // // 0. table 2, partition a: @@ -1814,6 +1822,7 @@ mod tests { #[tokio::test] async fn replay_prune_rows() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ Step::Ingest(vec![ @@ -1923,6 +1932,7 @@ mod tests { #[tokio::test] async fn replay_works_with_checkpoints_all_full_persisted_1() { + test_helpers::maybe_start_logging(); ReplayTest { catalog_transactions_until_checkpoint: NonZeroU64::new(2).unwrap(), steps: vec![ @@ -1962,6 +1972,7 @@ mod tests { #[tokio::test] async fn replay_works_with_checkpoints_all_full_persisted_2() { + test_helpers::maybe_start_logging(); // try to provoke an catalog checkpoints that lists database checkpoints in the wrong order ReplayTest { catalog_transactions_until_checkpoint: NonZeroU64::new(2).unwrap(), @@ -2050,6 +2061,7 @@ mod tests { #[tokio::test] async fn replay_works_partially_persisted_1() { + test_helpers::maybe_start_logging(); // regression test for https://github.com/influxdata/influxdb_iox/issues/2185 let tracing_capture = TracingCapture::new(); @@ -2121,6 +2133,7 @@ mod tests { #[tokio::test] async fn replay_works_partially_persisted_2() { + test_helpers::maybe_start_logging(); // regression test for https://github.com/influxdata/influxdb_iox/issues/2185 let tracing_capture = TracingCapture::new(); @@ -2202,6 +2215,7 @@ mod tests { #[tokio::test] async fn replay_works_after_skip() { + test_helpers::maybe_start_logging(); let tracing_capture = TracingCapture::new(); ReplayTest { @@ -2272,6 +2286,7 @@ mod tests { #[tokio::test] async fn replay_initializes_max_seen_sequence_numbers() { + test_helpers::maybe_start_logging(); // Ensures that either replay or the catalog loading initializes the maximum seen sequence numbers (per // partition) correctly. Before this test (and its fix), sequence numbers were only written if there was any // unpersisted range during replay. @@ -2402,6 +2417,7 @@ mod tests { #[tokio::test] async fn skip_replay_initializes_max_seen_sequence_numbers() { + test_helpers::maybe_start_logging(); // Similar case to `replay_initializes_max_seen_sequence_numbers` but instead of replaying, we skip replay to // provoke a similar outcome. 
// @@ -2528,6 +2544,7 @@ mod tests { #[tokio::test] async fn replay_after_drop() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ Step::Ingest(vec![ @@ -2630,6 +2647,7 @@ mod tests { #[tokio::test] async fn replay_delete() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ Step::Ingest(vec![TestSequencedEntry { @@ -2696,6 +2714,7 @@ mod tests { #[tokio::test] async fn replay_delete_persisted_chunks() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ Step::Ingest(vec![TestSequencedEntry { @@ -2751,6 +2770,7 @@ mod tests { // This test replay compact os chunks with deletes and duplicates #[tokio::test] async fn replay_delete_compact_os_chunks() { + test_helpers::maybe_start_logging(); ReplayTest { steps: vec![ // -------------------------- @@ -2913,6 +2933,7 @@ mod tests { #[tokio::test] async fn replay_fail_sequencers_change() { + test_helpers::maybe_start_logging(); // create write buffer w/ sequencer 0 and 1 let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(2).unwrap()); @@ -2951,6 +2972,7 @@ mod tests { #[tokio::test] async fn replay_fail_lost_entry() { + test_helpers::maybe_start_logging(); // create write buffer state with sequence number 0 and 2, 1 is missing let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap()); @@ -2990,6 +3012,7 @@ mod tests { #[tokio::test] async fn seek_to_end_works() { + test_helpers::maybe_start_logging(); // setup watermarks: // 0 -> 3 + 1 = 4 // 1 -> 1 + 1 = 2 @@ -3040,6 +3063,7 @@ mod tests { loop { println!("Try checks..."); if ReplayTest::eval_checks(&checks, false, &test_db).await { + println!("checks passed..."); break; } From a52c0a26e695766fdf8ea9cec39a0c0701902c10 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 4 Feb 2022 16:48:08 +0000 Subject: [PATCH 05/30] feat: print read filter results --- influxdb_iox/src/commands/storage.rs | 23 ++++++++++----- influxdb_iox/src/commands/storage/response.rs | 29 ++++++++++++++++--- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index 456ab84090..c3ed69135a 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -3,23 +3,30 @@ pub(crate) mod response; use std::num::NonZeroU64; +use snafu::{ResultExt, Snafu}; +use tonic::Status; + use generated_types::Predicate; use influxdb_storage_client::{connection::Connection, Client, OrgAndBucket}; use influxrpc_parser::predicate; use time; -use snafu::{ResultExt, Snafu}; - #[derive(Debug, Snafu)] pub enum ParseError { - #[snafu(display("Unable to parse timestamp '{:?}'", t))] + #[snafu(display("unable to parse timestamp '{:?}'", t))] Timestamp { t: String }, - #[snafu(display("Unable to parse database name '{:?}'", db_name))] + #[snafu(display("unable to parse database name '{:?}'", db_name))] DBName { db_name: String }, - #[snafu(display("Unable to parse predicate: {:?}", source))] + #[snafu(display("unable to parse predicate: {:?}", source))] Predicate { source: predicate::Error }, + + #[snafu(display("server error: {:?}", source))] + ServerError { source: Status }, + + #[snafu(display("error building response: {:?}", source))] + ResponseError { source: response::Error }, } pub type Result = std::result::Result; @@ -129,8 +136,10 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { )) .await } - }; - Ok(()) + } + .context(ServerSnafu)?; + + 
response::pretty_print(&result).context(ResponseSnafu) } #[cfg(test)] diff --git a/influxdb_iox/src/commands/storage/response.rs b/influxdb_iox/src/commands/storage/response.rs index 5c44468354..9f3c4a6463 100644 --- a/influxdb_iox/src/commands/storage/response.rs +++ b/influxdb_iox/src/commands/storage/response.rs @@ -1,4 +1,4 @@ -use arrow::record_batch::RecordBatch; +use arrow::{record_batch::RecordBatch, util::pretty::print_batches}; use hashbrown::HashMap; use std::{ collections::{BTreeMap, BTreeSet}, @@ -17,7 +17,7 @@ use snafu::{ResultExt, Snafu}; #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("arrow error: {:?}", source))] - ArrowError { source: arrow::error::ArrowError }, + Arrow { source: arrow::error::ArrowError }, #[snafu(display("frame type currently unsupported: {:?}", frame))] UnsupportedFrameType { frame: String }, @@ -32,11 +32,24 @@ pub enum Error { InvalidMeasurementName { source: FromUtf8Error }, #[snafu(display("unable to build schema: {:?}", source))] - SchemaBuildingError { source: schema::builder::Error }, + SchemaBuilding { source: schema::builder::Error }, } pub type Result = std::result::Result; +// Prints the provided data frames in a tabular format grouped into tables per +// distinct measurement. +pub fn pretty_print(frames: &[Data]) -> Result<()> { + let rbs = into_record_batches(frames)?; + for (k, rb) in rbs { + println!("_measurement: {}", k); + println!("rows: {:?}", &rb.num_rows()); + print_batches(&[rb]).context(ArrowSnafu)?; + println!("\n\n"); + } + Ok(()) +} + // This function takes a set of InfluxRPC data frames and converts them into an // Arrow record batches, which are suitable for pretty printing. fn into_record_batches(frames: &[Data]) -> Result> { @@ -47,6 +60,10 @@ fn into_record_batches(frames: &[Data]) -> Result> let mut all_tables = BTreeMap::new(); let mut current_table_frame: Option<(IntermediateTable, SeriesFrame)> = None; + if frames.is_empty() { + return Ok(all_tables); + } + for frame in frames { match frame { generated_types::read_response::frame::Data::Group(_) => { @@ -697,7 +714,11 @@ mod test_super { TableColumnInput::new( "another table", &["region"], - &[("voltage", DataType::String)], + &[ + ("bool_field", DataType::Boolean), + ("unsigned_field", DataType::Unsigned), + ("voltage", DataType::String), + ], ), ]); assert_eq!(determine_tag_columns(&frames), exp); From e2db1df11f129ebd33c4de735374454338723471 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 7 Feb 2022 12:24:17 +0000 Subject: [PATCH 06/30] refactor: improve writer buffer consumer interface (#3631) * refactor: improve writer buffer consumer interface The change looks huge but is actually rather simple. To understand the interface change, let me first explain what we want: - be able to fetch watermarks for any sequencer - have streams: - each streams tracks a sequencer and has an offset state (no read multiplexing) - we can seek a stream - seeking and streaming cannot be done at the same time (that would be weird and likely leads to many bugs both in write buffer and in the user code) - ideally we don't need to create streams of all sequencers but can choose a subset Before this change we had one mutable consumer struct where you can get all streams and watermark functions (this mutable-borrows the consumer) or you can seek a single stream (this also mutable-borrows the consumer). 
This is a bit weird for multiple reasons: - you cannot seek a single stream without dropping all of them - the mutable-borrow construct makes it really difficult to pass the streams into separate threads - the consumer is boxed (because its mutable) which makes it more difficult to handle in a large-scale application What this change does is the following: - you have an immutable consumer (similar to the producer) - the consumer offers the following methods: - get the set of sequencer IDs - get watermark for any sequencer - get a stream handler (see next point) for any sequencer - the stream handler captures the stream state (offset) and provides you a standard `Stream<_>` interface as well as a seek function. Mutable-borrows ensure that you cannot use both at the same time. The stream handler provides you the stream via `handler.stream()`. It doesn't implement `Stream<_>` itself because the way boxing, dynamic dispatch work, and pinning interact (i.e. I couldn't get it to work without the indirection). As a bonus point (which we don't use however) you can now create multiple streams for the same sequencer and they all have their own offset. * fix: review comments Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/lib.rs | 30 +- db/src/replay.rs | 152 ++++--- db/src/write_buffer.rs | 43 +- influxdb_iox/src/commands/run/ingester.rs | 22 +- .../tests/end_to_end_cases/delete_api.rs | 8 +- .../tests/end_to_end_cases/write_pb.rs | 8 +- ingester/src/handler.rs | 90 ++-- server/src/database/init.rs | 6 +- write_buffer/src/config.rs | 10 +- write_buffer/src/core.rs | 412 +++++++++--------- write_buffer/src/file.rs | 122 +++--- write_buffer/src/kafka/config.rs | 2 +- write_buffer/src/kafka/mod.rs | 247 +++++------ write_buffer/src/mock.rs | 299 +++++++------ 14 files changed, 740 insertions(+), 711 deletions(-) diff --git a/db/src/lib.rs b/db/src/lib.rs index a2d2854037..018d028156 100644 --- a/db/src/lib.rs +++ b/db/src/lib.rs @@ -16,7 +16,7 @@ use crate::{ }; use ::lifecycle::select_persistable_chunks; pub use ::lifecycle::{LifecycleChunk, LockableChunk, LockablePartition}; -use ::write_buffer::core::WriteBufferReading; +use ::write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; use async_trait::async_trait; use data_types::{ chunk_metadata::{ChunkId, ChunkLifecycleAction, ChunkOrder, ChunkSummary}, @@ -53,7 +53,7 @@ use schema::Schema; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::{ any::Any, - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, sync::{ atomic::{AtomicUsize, Ordering}, Arc, @@ -112,6 +112,11 @@ pub enum Error { source: persistence_windows::checkpoint::Error, }, + #[snafu(display("Cannot setup write buffer: {}", source))] + WriteBuffer { + source: ::write_buffer::core::WriteBufferError, + }, + #[snafu(display("Cannot replay: {}", source))] ReplayError { source: crate::replay::Error }, @@ -889,16 +894,23 @@ impl Db { pub async fn perform_replay( &self, replay_plan: Option<&ReplayPlan>, - consumer: &mut dyn WriteBufferReading, - ) -> Result<()> { + consumer: Arc, + ) -> Result>> { use crate::replay::{perform_replay, seek_to_end}; - if let Some(replay_plan) = replay_plan { - perform_replay(self, replay_plan, consumer) + + let streams = consumer.stream_handlers().await.context(WriteBufferSnafu)?; + + let 
streams = if let Some(replay_plan) = replay_plan { + perform_replay(self, replay_plan, streams) .await - .context(ReplaySnafu) + .context(ReplaySnafu)? } else { - seek_to_end(self, consumer).await.context(ReplaySnafu) - } + seek_to_end(self, consumer.as_ref(), streams) + .await + .context(ReplaySnafu)? + }; + + Ok(streams) } /// Background worker function diff --git a/db/src/replay.rs b/db/src/replay.rs index ebc2f87f5b..5935e15a57 100644 --- a/db/src/replay.rs +++ b/db/src/replay.rs @@ -20,7 +20,7 @@ use std::{ time::Duration, }; use time::Time; -use write_buffer::core::WriteBufferReading; +use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -85,22 +85,34 @@ pub type Result = std::result::Result; /// operation fails. In that case some of the sequencers in the write buffers might already be seeked and others not. /// The caller must NOT use the write buffer in that case without ensuring that it is put into some proper state, e.g. /// by retrying this function. -pub async fn seek_to_end(db: &Db, write_buffer: &mut dyn WriteBufferReading) -> Result<()> { - let mut watermarks = vec![]; - for (sequencer_id, stream) in write_buffer.streams() { - let watermark = (stream.fetch_high_watermark)() - .await - .context(SeekSnafu { sequencer_id })?; - watermarks.push((sequencer_id, watermark)); - } +pub async fn seek_to_end( + db: &Db, + write_buffer: &dyn WriteBufferReading, + write_buffer_streams: BTreeMap>, +) -> Result>> { + // need to convert the btree into a vec because the btree iterator is not `Send` + let write_buffer_streams: Vec<_> = write_buffer_streams.into_iter().collect(); - for (sequencer_id, watermark) in &watermarks { - write_buffer - .seek(*sequencer_id, *watermark) + let mut watermarks = vec![]; + for (sequencer_id, _handler) in &write_buffer_streams { + let watermark = write_buffer + .fetch_high_watermark(*sequencer_id) .await .context(SeekSnafu { sequencer_id: *sequencer_id, })?; + watermarks.push((*sequencer_id, watermark)); + } + + let mut write_buffer_streams_res = BTreeMap::new(); + for ((sequencer_id, watermark), (sequencer_id_2, mut handler)) in + watermarks.iter().zip(write_buffer_streams) + { + assert_eq!(*sequencer_id, sequencer_id_2); + handler.seek(*watermark).await.context(SeekSnafu { + sequencer_id: *sequencer_id, + })?; + write_buffer_streams_res.insert(*sequencer_id, handler); } // remember max seen sequence numbers @@ -142,24 +154,20 @@ pub async fn seek_to_end(db: &Db, write_buffer: &mut dyn WriteBufferReading) -> } } - Ok(()) + Ok(write_buffer_streams_res) } /// Perform sequencer-driven replay for this DB. 
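+/// Returns the stream handlers once replay has finished so the caller can
+/// continue consuming from where replay stopped.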
pub async fn perform_replay( db: &Db, replay_plan: &ReplayPlan, - write_buffer: &mut dyn WriteBufferReading, -) -> Result<()> { + write_buffer_streams: BTreeMap>, +) -> Result>> { let db_name = db.rules.read().db_name().to_string(); info!(%db_name, "starting replay"); // check if write buffer and replay plan agree on the set of sequencer ids - let sequencer_ids: BTreeSet<_> = write_buffer - .streams() - .into_iter() - .map(|(sequencer_id, _stream)| sequencer_id) - .collect(); + let sequencer_ids: BTreeSet<_> = write_buffer_streams.keys().copied().collect(); for sequencer_id in replay_plan.sequencer_ids() { if !sequencer_ids.contains(&sequencer_id) { return Err(Error::UnknownSequencer { @@ -179,31 +187,30 @@ pub async fn perform_replay( }) .collect(); + // need to convert the btree into a vec because the btree iterator is not `Send` + let mut write_buffer_streams: Vec<_> = write_buffer_streams.into_iter().collect(); + // seek write buffer according to the plan - for (sequencer_id, min_max) in &replay_ranges { - if let Some(min) = min_max.min() { - info!(%db_name, sequencer_id, sequence_number=min, "seek sequencer in preperation for replay"); - write_buffer - .seek(*sequencer_id, min) - .await - .context(SeekSnafu { + for (sequencer_id, handler) in write_buffer_streams.iter_mut() { + if let Some(min_max) = replay_ranges.get(sequencer_id) { + if let Some(min) = min_max.min() { + info!(%db_name, sequencer_id, sequence_number=min, "seek sequencer in preperation for replay"); + handler.seek(min).await.context(SeekSnafu { sequencer_id: *sequencer_id, })?; - } else { - let sequence_number = min_max.max() + 1; - info!(%db_name, sequencer_id, sequence_number, "seek sequencer that did not require replay"); - write_buffer - .seek(*sequencer_id, sequence_number) - .await - .context(SeekSnafu { + } else { + let sequence_number = min_max.max() + 1; + info!(%db_name, sequencer_id, sequence_number, "seek sequencer that did not require replay"); + handler.seek(sequence_number).await.context(SeekSnafu { sequencer_id: *sequencer_id, })?; + } } } // replay ranges - for (sequencer_id, mut stream) in write_buffer.streams() { - if let Some(min_max) = replay_ranges.get(&sequencer_id) { + for (sequencer_id, handler) in write_buffer_streams.iter_mut() { + if let Some(min_max) = replay_ranges.get(sequencer_id) { if min_max.min().is_none() { // no replay required continue; @@ -216,19 +223,17 @@ pub async fn perform_replay( "replay sequencer", ); - while let Some(dml_operation) = stream - .stream - .try_next() - .await - .context(EntrySnafu { sequencer_id })? - { + let mut stream = handler.stream(); + while let Some(dml_operation) = stream.try_next().await.context(EntrySnafu { + sequencer_id: *sequencer_id, + })? 
{ let sequence = *dml_operation .meta() .sequence() .expect("entry must be sequenced"); if sequence.number > min_max.max() { return Err(Error::EntryLostError { - sequencer_id, + sequencer_id: *sequencer_id, actual_sequence_number: sequence.number, expected_sequence_number: min_max.max(), }); @@ -253,6 +258,7 @@ pub async fn perform_replay( } Err(crate::DmlError::HardLimitReached {}) if n_try < n_tries => { if !logged_hard_limit { + let sequencer_id: u32 = *sequencer_id; info!( %db_name, sequencer_id, @@ -313,7 +319,7 @@ pub async fn perform_replay( } } - Ok(()) + Ok(write_buffer_streams.into_iter().collect()) } #[derive(Debug, Copy, Clone)] @@ -610,8 +616,12 @@ mod tests { let mut lifecycle = LifecycleWorker::new(Arc::clone(&test_db.db)); + let write_buffer = + Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap()); + let streams = write_buffer.stream_handlers().await.unwrap(); let mut maybe_consumer = Some(WriteBufferConsumer::new( - Box::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap()), + write_buffer, + streams, Arc::clone(&test_db.db), ®istry, )); @@ -664,16 +674,17 @@ mod tests { _ => unreachable!(), }; - let mut write_buffer = - MockBufferForReading::new(write_buffer_state.clone(), None).unwrap(); + let write_buffer: Arc = Arc::new( + MockBufferForReading::new(write_buffer_state.clone(), None).unwrap(), + ); - test_db + let streams = test_db .db - .perform_replay(replay_plan, &mut write_buffer) + .perform_replay(replay_plan, Arc::clone(&write_buffer)) .await .unwrap(); - maybe_write_buffer = Some(write_buffer); + maybe_write_buffer = Some((write_buffer, streams)); } Step::Persist(partitions) => { let db = &test_db.db; @@ -736,13 +747,20 @@ mod tests { } Step::Await(checks) => { if maybe_consumer.is_none() { - let write_buffer = match maybe_write_buffer.take() { - Some(write_buffer) => write_buffer, - None => MockBufferForReading::new(write_buffer_state.clone(), None) - .unwrap(), + let (write_buffer, streams) = match maybe_write_buffer.take() { + Some(x) => x, + None => { + let write_buffer: Arc = Arc::new( + MockBufferForReading::new(write_buffer_state.clone(), None) + .unwrap(), + ); + let streams = write_buffer.stream_handlers().await.unwrap(); + (write_buffer, streams) + } }; maybe_consumer = Some(WriteBufferConsumer::new( - Box::new(write_buffer), + write_buffer, + streams, Arc::clone(&test_db.db), ®istry, )); @@ -2937,7 +2955,8 @@ mod tests { // create write buffer w/ sequencer 0 and 1 let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(2).unwrap()); - let mut write_buffer = MockBufferForReading::new(write_buffer_state, None).unwrap(); + let write_buffer: Arc = + Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); // create DB let db = TestDb::builder().build().await.db; @@ -2961,9 +2980,7 @@ mod tests { let replay_plan = replay_planner.build().unwrap(); // replay fails - let res = db - .perform_replay(Some(&replay_plan), &mut write_buffer) - .await; + let res = db.perform_replay(Some(&replay_plan), write_buffer).await; assert_contains!( res.unwrap_err().to_string(), "Replay plan references unknown sequencer" @@ -2978,7 +2995,8 @@ mod tests { MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap()); write_buffer_state.push_lp(Sequence::new(0, 0), "cpu bar=1 0"); write_buffer_state.push_lp(Sequence::new(0, 2), "cpu bar=1 10"); - let mut write_buffer = MockBufferForReading::new(write_buffer_state, None).unwrap(); + let write_buffer: Arc = + 
Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); // create DB let db = TestDb::builder().build().await.db; @@ -3001,9 +3019,7 @@ mod tests { let replay_plan = replay_planner.build().unwrap(); // replay fails - let res = db - .perform_replay(Some(&replay_plan), &mut write_buffer) - .await; + let res = db.perform_replay(Some(&replay_plan), write_buffer).await; assert_contains!( res.unwrap_err().to_string(), "Cannot replay: For sequencer 0 expected to find sequence 1 but replay jumped to 2" @@ -3022,14 +3038,18 @@ mod tests { write_buffer_state.push_lp(Sequence::new(0, 0), "cpu bar=0 0"); write_buffer_state.push_lp(Sequence::new(0, 3), "cpu bar=3 3"); write_buffer_state.push_lp(Sequence::new(1, 1), "cpu bar=11 11"); - let mut write_buffer = MockBufferForReading::new(write_buffer_state.clone(), None).unwrap(); + let write_buffer: Arc = + Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap()); // create DB let test_db = TestDb::builder().build().await; let db = &test_db.db; // seek - db.perform_replay(None, &mut write_buffer).await.unwrap(); + let streams = db + .perform_replay(None, Arc::clone(&write_buffer)) + .await + .unwrap(); // add more data write_buffer_state.push_lp(Sequence::new(0, 4), "cpu bar=4 4"); @@ -3044,7 +3064,7 @@ mod tests { tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await }); let consumer = - WriteBufferConsumer::new(Box::new(write_buffer), Arc::clone(db), &Default::default()); + WriteBufferConsumer::new(write_buffer, streams, Arc::clone(db), &Default::default()); // wait until checks pass let checks = vec![Check::Query( diff --git a/db/src/write_buffer.rs b/db/src/write_buffer.rs index e18b6b1037..953cbab0b5 100644 --- a/db/src/write_buffer.rs +++ b/db/src/write_buffer.rs @@ -2,11 +2,12 @@ use crate::Db; use dml::DmlOperation; use futures::{ future::{BoxFuture, Shared}, - stream::{BoxStream, FuturesUnordered}, + stream::FuturesUnordered, FutureExt, StreamExt, TryFutureExt, }; use observability_deps::tracing::{debug, error, info, warn}; use std::{ + collections::BTreeMap, future::Future, sync::Arc, time::{Duration, Instant}, @@ -14,7 +15,7 @@ use std::{ use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use trace::span::SpanRecorder; -use write_buffer::core::{FetchHighWatermark, WriteBufferError, WriteBufferReading}; +use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; use self::metrics::{SequencerMetrics, WriteBufferIngestMetrics}; pub mod metrics; @@ -32,7 +33,8 @@ pub struct WriteBufferConsumer { impl WriteBufferConsumer { pub fn new( - mut write_buffer: Box, + write_buffer: Arc, + write_buffer_streams: BTreeMap>, db: Arc, registry: &metric::Registry, ) -> Self { @@ -42,16 +44,15 @@ impl WriteBufferConsumer { let shutdown_captured = shutdown.clone(); let join = tokio::spawn(async move { - let mut futures: FuturesUnordered<_> = write_buffer - .streams() + let mut futures: FuturesUnordered<_> = write_buffer_streams .into_iter() - .map(|(sequencer_id, stream)| { + .map(|(sequencer_id, handler)| { let metrics = ingest_metrics.new_sequencer_metrics(sequencer_id); stream_in_sequenced_entries( Arc::clone(&db), + Arc::clone(&write_buffer), sequencer_id, - stream.stream, - stream.fetch_high_watermark, + handler, metrics, ) }) @@ -100,14 +101,15 @@ impl Drop for WriteBufferConsumer { /// buffer are ignored. 
async fn stream_in_sequenced_entries<'a>( db: Arc, + write_buffer: Arc, sequencer_id: u32, - mut stream: BoxStream<'a, Result>, - f_mark: FetchHighWatermark<'a>, + mut stream_handler: Box, mut metrics: SequencerMetrics, ) { let db_name = db.rules().name.to_string(); let mut watermark_last_updated: Option = None; let mut watermark = 0_u64; + let mut stream = stream_handler.stream(); while let Some(db_write_result) = stream.next().await { // maybe update sequencer watermark @@ -118,7 +120,7 @@ async fn stream_in_sequenced_entries<'a>( .map(|ts| now.duration_since(ts) > Duration::from_secs(10)) .unwrap_or(true) { - match f_mark().await { + match write_buffer.fetch_high_watermark(sequencer_id).await { Ok(w) => { watermark = w; } @@ -251,11 +253,10 @@ mod tests { let join_handle = tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await }); - let consumer = WriteBufferConsumer::new( - Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()), - Arc::clone(&db), - &Default::default(), - ); + let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); + let streams = write_buffer.stream_handlers().await.unwrap(); + let consumer = + WriteBufferConsumer::new(write_buffer, streams, Arc::clone(&db), &Default::default()); // check: after a while the persistence windows should have the expected data let t_0 = Instant::now(); @@ -314,8 +315,11 @@ mod tests { let join_handle = tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await }); + let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); + let streams = write_buffer.stream_handlers().await.unwrap(); let consumer = WriteBufferConsumer::new( - Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()), + write_buffer, + streams, Arc::clone(&db), test_db.metric_registry.as_ref(), ); @@ -457,8 +461,11 @@ mod tests { let join_handle = tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await }); + let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); + let streams = write_buffer.stream_handlers().await.unwrap(); let consumer = WriteBufferConsumer::new( - Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()), + write_buffer, + streams, Arc::clone(&db), metric_registry.as_ref(), ); diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index 29e9b97587..a263bb2e13 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -50,6 +50,9 @@ pub enum Error { #[error("sequencer record not found for partition {0}")] SequencerNotFound(KafkaPartition), + #[error("error initializing ingester: {0}")] + Ingester(#[from] ingester::handler::Error), + #[error("error initializing write buffer {0}")] WriteBuffer(#[from] write_buffer::core::WriteBufferError), } @@ -150,14 +153,17 @@ pub async fn command(config: Config) -> Result<()> { ) .await?; - let ingest_handler = Arc::new(IngestHandlerImpl::new( - kafka_topic, - sequencers, - catalog, - object_store, - write_buffer, - &metric_registry, - )); + let ingest_handler = Arc::new( + IngestHandlerImpl::new( + kafka_topic, + sequencers, + catalog, + object_store, + write_buffer, + &metric_registry, + ) + .await?, + ); let http = HttpDelegate::new(Arc::clone(&ingest_handler)); let grpc = GrpcDelegate::new(ingest_handler); diff --git a/influxdb_iox/tests/end_to_end_cases/delete_api.rs 
b/influxdb_iox/tests/end_to_end_cases/delete_api.rs index 2714729b4f..9afc1264ae 100644 --- a/influxdb_iox/tests/end_to_end_cases/delete_api.rs +++ b/influxdb_iox/tests/end_to_end_cases/delete_api.rs @@ -162,7 +162,7 @@ pub async fn test_delete_on_router() { let fixture = ServerFixture::create_shared(ServerType::Router).await; let db_name = rand_name(); - let (_tmpdir, mut write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await; + let (_tmpdir, write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await; let table = "cpu"; let pred = DeletePredicate { @@ -179,8 +179,10 @@ pub async fn test_delete_on_router() { .await .expect("cannot delete"); - let mut stream = write_buffer.streams().into_values().next().unwrap(); - let delete_actual = stream.stream.next().await.unwrap().unwrap(); + let sequencer_id = write_buffer.sequencer_ids().into_iter().next().unwrap(); + let mut handler = write_buffer.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); + let delete_actual = stream.next().await.unwrap().unwrap(); let delete_expected = DmlDelete::new( &db_name, pred, diff --git a/influxdb_iox/tests/end_to_end_cases/write_pb.rs b/influxdb_iox/tests/end_to_end_cases/write_pb.rs index 7a5f810e86..760928be76 100644 --- a/influxdb_iox/tests/end_to_end_cases/write_pb.rs +++ b/influxdb_iox/tests/end_to_end_cases/write_pb.rs @@ -45,7 +45,7 @@ pub async fn test_write_pb_router() { let fixture = ServerFixture::create_shared(ServerType::Router).await; let db_name = rand_name(); - let (_tmpdir, mut write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await; + let (_tmpdir, write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await; fixture .write_client() @@ -53,8 +53,10 @@ pub async fn test_write_pb_router() { .await .expect("cannot write"); - let mut stream = write_buffer.streams().into_values().next().unwrap(); - let write_actual = stream.stream.next().await.unwrap().unwrap(); + let sequencer_id = write_buffer.sequencer_ids().into_iter().next().unwrap(); + let mut handler = write_buffer.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); + let write_actual = stream.next().await.unwrap().unwrap(); let write_expected = DmlWrite::new( &db_name, lines_to_batches("mytable mycol1=5 3", 0).unwrap(), diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index dcac44ceeb..4bf20f0736 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -5,10 +5,9 @@ use object_store::ObjectStore; use crate::data::{IngesterData, SequencerData}; use db::write_buffer::metrics::{SequencerMetrics, WriteBufferIngestMetrics}; -use dml::DmlOperation; -use futures::{stream::BoxStream, StreamExt}; +use futures::StreamExt; use observability_deps::tracing::{debug, warn}; -use snafu::Snafu; +use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::{ fmt::Formatter, @@ -17,7 +16,7 @@ use std::{ }; use tokio::task::JoinHandle; use trace::span::SpanRecorder; -use write_buffer::core::{FetchHighWatermark, WriteBufferError, WriteBufferReading}; +use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] @@ -31,6 +30,11 @@ pub enum Error { kafka_topic: String, kafka_partition: KafkaPartition, }, + + #[snafu(display("Write buffer error: {}", source))] + WriteBuffer { + source: write_buffer::core::WriteBufferError, + }, } /// A specialized `Error` for Catalog errors @@ -60,14 +64,14 @@ impl std::fmt::Debug for 
IngestHandlerImpl { impl IngestHandlerImpl { /// Initialize the Ingester - pub fn new( + pub async fn new( topic: KafkaTopic, - mut sequencer_states: BTreeMap, + sequencer_states: BTreeMap, catalog: Arc, object_store: Arc, - write_buffer: Box, + write_buffer: Arc, registry: &metric::Registry, - ) -> Self { + ) -> Result { // build the initial ingester data state let mut sequencers = BTreeMap::new(); for s in sequencer_states.values() { @@ -83,40 +87,33 @@ impl IngestHandlerImpl { let kafka_topic_name = topic.name.clone(); let ingest_metrics = WriteBufferIngestMetrics::new(registry, &topic.name); - let write_buffer: &'static mut _ = Box::leak(write_buffer); - let join_handles: Vec<_> = write_buffer - .streams() - .into_iter() - .filter_map(|(kafka_partition_id, stream)| { - // streams may return a stream for every partition in the kafka topic. We only want - // to process streams for those specified by the call to new. - let kafka_partition = KafkaPartition::new(kafka_partition_id as i32); - sequencer_states.remove(&kafka_partition).map(|sequencer| { - let metrics = ingest_metrics.new_sequencer_metrics(kafka_partition_id); - let ingester_data = Arc::clone(&ingester_data); - let kafka_topic_name = kafka_topic_name.clone(); + let mut join_handles = Vec::with_capacity(sequencer_states.len()); + for (kafka_partition, sequencer) in sequencer_states { + let metrics = ingest_metrics.new_sequencer_metrics(kafka_partition.get() as u32); + let ingester_data = Arc::clone(&ingester_data); + let kafka_topic_name = kafka_topic_name.clone(); - tokio::task::spawn(async move { - stream_in_sequenced_entries( - ingester_data, - sequencer.id, - kafka_topic_name, - kafka_partition, - stream.stream, - stream.fetch_high_watermark, - metrics, - ) - .await; - }) - }) - }) - .collect(); + let stream_handler = write_buffer + .stream_handler(kafka_partition.get() as u32) + .await + .context(WriteBufferSnafu)?; - Self { + join_handles.push(tokio::task::spawn(stream_in_sequenced_entries( + ingester_data, + sequencer.id, + kafka_topic_name, + kafka_partition, + Arc::clone(&write_buffer), + stream_handler, + metrics, + ))); + } + + Ok(Self { data, kafka_topic: topic, join_handles, - } + }) } } @@ -135,17 +132,18 @@ impl Drop for IngestHandlerImpl { /// /// Note all errors reading / parsing / writing entries from the write /// buffer are ignored. 
-async fn stream_in_sequenced_entries<'a>( +async fn stream_in_sequenced_entries( ingester_data: Arc, sequencer_id: SequencerId, kafka_topic: String, kafka_partition: KafkaPartition, - mut stream: BoxStream<'a, Result>, - f_mark: FetchHighWatermark<'a>, + write_buffer: Arc, + mut write_buffer_stream: Box, mut metrics: SequencerMetrics, ) { let mut watermark_last_updated: Option = None; let mut watermark = 0_u64; + let mut stream = write_buffer_stream.stream(); while let Some(db_write_result) = stream.next().await { // maybe update sequencer watermark @@ -156,7 +154,10 @@ async fn stream_in_sequenced_entries<'a>( .map(|ts| now.duration_since(ts) > Duration::from_secs(10)) .unwrap_or(true) { - match f_mark().await { + match write_buffer + .fetch_high_watermark(sequencer_id.get() as u32) + .await + { Ok(w) => { watermark = w; } @@ -290,7 +291,8 @@ mod tests { .unwrap() .unwrap(); write_buffer_state.push_write(w2); - let reading = Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); + let reading: Arc = + Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); let object_store = Arc::new(ObjectStore::new_in_memory()); let metrics: Arc = Default::default(); @@ -301,7 +303,9 @@ mod tests { object_store, reading, &metrics, - ); + ) + .await + .unwrap(); // give the writes some time to go through the buffer. Exit once we've verified there's // data in there from both writes. diff --git a/server/src/database/init.rs b/server/src/database/init.rs index 54cdaa9c66..4bdc7e5fa6 100644 --- a/server/src/database/init.rs +++ b/server/src/database/init.rs @@ -511,7 +511,7 @@ impl DatabaseStateCatalogLoaded { }; let write_buffer_consumer = match rules.write_buffer_connection.as_ref() { Some(connection) => { - let mut consumer = write_buffer_factory + let consumer = write_buffer_factory .new_config_read(db_name.as_str(), trace_collector.as_ref(), connection) .await .context(CreateWriteBufferSnafu)?; @@ -522,12 +522,14 @@ impl DatabaseStateCatalogLoaded { self.replay_plan.as_ref().as_ref() }; - db.perform_replay(replay_plan, consumer.as_mut()) + let streams = db + .perform_replay(replay_plan, Arc::clone(&consumer)) .await .context(ReplaySnafu)?; Some(Arc::new(WriteBufferConsumer::new( consumer, + streams, Arc::clone(&db), shared.application.metric_registry().as_ref(), ))) diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index f242c9e368..ae88d01929 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -142,7 +142,7 @@ impl WriteBufferConfigFactory { db_name: &str, trace_collector: Option<&Arc>, cfg: &WriteBufferConnection, - ) -> Result, WriteBufferError> { + ) -> Result, WriteBufferError> { let reader = match &cfg.type_[..] { "file" => { let root = PathBuf::from(&cfg.connection); @@ -153,7 +153,7 @@ impl WriteBufferConfigFactory { trace_collector, ) .await?; - Box::new(file_buffer) as _ + Arc::new(file_buffer) as _ } "kafka" => { let rskafka_buffer = RSKafkaConsumer::new( @@ -164,17 +164,17 @@ impl WriteBufferConfigFactory { trace_collector.map(Arc::clone), ) .await?; - Box::new(rskafka_buffer) as _ + Arc::new(rskafka_buffer) as _ } "mock" => match self.get_mock(&cfg.connection)? 
{ Mock::Normal(state) => { let mock_buffer = MockBufferForReading::new(state, cfg.creation_config.as_ref())?; - Box::new(mock_buffer) as _ + Arc::new(mock_buffer) as _ } Mock::AlwaysFailing => { let mock_buffer = MockBufferForReadingThatAlwaysErrors {}; - Box::new(mock_buffer) as _ + Arc::new(mock_buffer) as _ } }, other => { diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index 7556f7d905..d250bc5d89 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -5,7 +5,7 @@ use std::{ use async_trait::async_trait; use dml::{DmlMeta, DmlOperation, DmlWrite}; -use futures::{future::BoxFuture, stream::BoxStream}; +use futures::stream::BoxStream; /// Generic boxed error type that is used in this crate. /// @@ -18,7 +18,7 @@ pub type WriteBufferError = Box; pub trait WriteBufferWriting: Sync + Send + Debug + 'static { /// List all known sequencers. /// - /// This set not empty. + /// This set not empty. fn sequencer_ids(&self) -> BTreeSet; /// Send a [`DmlOperation`] to the write buffer using the specified sequencer ID. @@ -63,47 +63,60 @@ pub trait WriteBufferWriting: Sync + Send + Debug + 'static { fn type_name(&self) -> &'static str; } -pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result>; -pub type FetchHighWatermark<'a> = Box FetchHighWatermarkFut<'a>) + Send + Sync>; - -/// Output stream of [`WriteBufferReading`]. -pub struct WriteStream<'a> { - /// Stream that produces entries. - pub stream: BoxStream<'a, Result>, - - /// Get high watermark (= what we believe is the next sequence number to be added). +/// Handles a stream of a specific sequencer. +/// +/// This can be used to consume data via a stream or to seek the stream to a given offset. +#[async_trait] +pub trait WriteBufferStreamHandler: Sync + Send + Debug + 'static { + /// Stream that produces DML operations. /// - /// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts - /// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1. - pub fetch_high_watermark: FetchHighWatermark<'a>, -} + /// Note that due to the mutable borrow, it is not possible to have multiple streams from the same + /// [`WriteBufferStreamHandler`] instance at the same time. If all streams are dropped and requested again, the last + /// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either + /// create a new [`WriteBufferStreamHandler`] or use [`seek`](Self::seek). + fn stream(&mut self) -> BoxStream<'_, Result>; -impl<'a> Debug for WriteStream<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("EntryStream").finish_non_exhaustive() - } + /// Seek sequencer to given sequence number. The next output of related streams will be an entry with at least + /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). + /// + /// Note that due to the mutable borrow, it is not possible to seek while streams exists. + async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError>; } /// Produce streams (one per sequencer) of [`DmlWrite`]s. #[async_trait] pub trait WriteBufferReading: Sync + Send + Debug + 'static { - /// Returns a stream per sequencer. + /// List all known sequencers. /// - /// Note that due to the mutable borrow, it is not possible to have multiple streams from the same - /// [`WriteBufferReading`] instance at the same time. 
If all streams are dropped and requested again, the last - /// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either - /// create a new [`WriteBufferReading`] or use [`seek`](Self::seek). - fn streams(&mut self) -> BTreeMap>; + /// This set not empty. + fn sequencer_ids(&self) -> BTreeSet; - /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least - /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). + /// Get stream handler for a dedicated sequencer. /// - /// Note that due to the mutable borrow, it is not possible to seek while streams exists. - async fn seek( - &mut self, + /// Handlers do NOT share any state (e.g. last offsets). + async fn stream_handler( + &self, sequencer_id: u32, - sequence_number: u64, - ) -> Result<(), WriteBufferError>; + ) -> Result, WriteBufferError>; + + /// Get stream handlers for all stream. + async fn stream_handlers( + &self, + ) -> Result>, WriteBufferError> { + let mut handlers = BTreeMap::new(); + + for sequencer_id in self.sequencer_ids() { + handlers.insert(sequencer_id, self.stream_handler(sequencer_id).await?); + } + + Ok(handlers) + } + + /// Get high watermark (= what we believe is the next sequence number to be added). + /// + /// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts + /// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1. + async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result; /// Return type (like `"mock"` or `"kafka"`) of this reader. fn type_name(&self) -> &'static str; @@ -111,16 +124,14 @@ pub trait WriteBufferReading: Sync + Send + Debug + 'static { pub mod test_utils { //! Generic tests for all write buffer implementations. 
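For orientation, a rough sketch of how a consumer is expected to drive the reworked `WriteBufferReading` / `WriteBufferStreamHandler` split above. It assumes the crate-internal `write_buffer::core` and `dml` paths from this patch; the `handle_op` callback and the seek-to-0 call are hypothetical placeholders, not part of the patch.

// Sketch only: per-sequencer handlers, streams built from handlers, and the
// watermark now queried from the reader instead of the stream.
use std::sync::Arc;

use dml::DmlOperation;
use futures::StreamExt;
use write_buffer::core::{WriteBufferError, WriteBufferReading, WriteBufferStreamHandler};

async fn consume_all(
    reader: Arc<dyn WriteBufferReading>,
    handle_op: impl Fn(u32, DmlOperation),
) -> Result<(), WriteBufferError> {
    for sequencer_id in reader.sequencer_ids() {
        // One handler per sequencer; handlers do not share offsets.
        let mut handler = reader.stream_handler(sequencer_id).await?;

        // Optional: position the handler before creating the stream.
        handler.seek(0).await?;

        let mut stream = handler.stream();
        while let Some(res) = stream.next().await {
            handle_op(sequencer_id, res?);

            // Lag can be derived from the reader-level high watermark.
            let _high_watermark = reader.fetch_high_watermark(sequencer_id).await?;
        }
    }
    Ok(())
}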
- use super::{WriteBufferError, WriteBufferReading, WriteBufferWriting}; + use super::{ + WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting, + }; use async_trait::async_trait; use dml::{test_util::assert_write_op_eq, DmlMeta, DmlOperation, DmlWrite}; use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt}; use std::{ - collections::{BTreeMap, BTreeSet}, - convert::TryFrom, - num::NonZeroU32, - sync::Arc, - time::Duration, + collections::BTreeSet, convert::TryFrom, num::NonZeroU32, sync::Arc, time::Duration, }; use time::{Time, TimeProvider}; use trace::{ctx::SpanContext, span::Span, RingBufferTraceCollector}; @@ -246,40 +257,41 @@ pub mod test_utils { let entry_3 = "upc user=3 300"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); + let sequencer_id = set_pop_first(&mut reader.sequencer_ids()).unwrap(); + let mut stream_handler = reader.stream_handler(sequencer_id).await.unwrap(); + let mut stream = stream_handler.stream(); let waker = futures::task::noop_waker(); let mut cx = futures::task::Context::from_waker(&waker); // empty stream is pending - assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.poll_next_unpin(&mut cx).is_pending()); // adding content allows us to get results let w1 = write("namespace", &writer, entry_1, sequencer_id, None).await; - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1); // stream is pending again - assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.poll_next_unpin(&mut cx).is_pending()); // adding more data unblocks the stream let w2 = write("namespace", &writer, entry_2, sequencer_id, None).await; let w3 = write("namespace", &writer, entry_3, sequencer_id, None).await; - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w3); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w3); // stream is pending again - assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.poll_next_unpin(&mut cx).is_pending()); } - /// Tests multiple subsequently created streams from a single reader. + /// Tests multiple subsequently created streams from a single [`WriteBufferStreamHandler`]. 
/// /// This tests that: /// - readers remember their offset (and "pending" state) even when streams are dropped + /// - state is not shared between handlers async fn test_multi_stream_io(adapter: &T) where T: TestAdapter, @@ -291,7 +303,7 @@ pub mod test_utils { let entry_3 = "upc user=3 300"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); let waker = futures::task::noop_waker(); let mut cx = futures::task::Context::from_waker(&waker); @@ -301,35 +313,31 @@ pub mod test_utils { let w3 = write("namespace", &writer, entry_3, 0, None).await; // creating stream, drop stream, re-create it => still starts at first entry - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, stream) = map_pop_first(&mut streams).unwrap(); + let sequencer_id = set_pop_first(&mut reader.sequencer_ids()).unwrap(); + let mut stream_handler = reader.stream_handler(sequencer_id).await.unwrap(); + let stream = stream_handler.stream(); drop(stream); - drop(streams); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1); + let mut stream = stream_handler.stream(); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1); // re-creating stream after reading remembers offset, but wait a bit to provoke the stream to buffer some // entries tokio::time::sleep(Duration::from_millis(10)).await; drop(stream); - drop(streams); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); - - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w3); + let mut stream = stream_handler.stream(); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w3); // re-creating stream after reading everything makes it pending drop(stream); - drop(streams); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); - assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + let mut stream = stream_handler.stream(); + assert!(stream.poll_next_unpin(&mut cx).is_pending()); + + // use a different handler => stream starts from beginning + let mut stream_handler2 = reader.stream_handler(sequencer_id).await.unwrap(); + let mut stream2 = stream_handler2.stream(); + assert_write_op_eq(&stream2.next().await.unwrap().unwrap(), &w1); + assert!(stream.poll_next_unpin(&mut cx).is_pending()); } /// Test single reader-writer IO w/ multiple sequencers. 
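The offset semantics exercised by the multi-stream test above amount to the following compact sketch against the mock backend touched later in this patch (line-protocol values are made up; it assumes `MockBufferForReading` / `MockBufferSharedState` live under `write_buffer::mock`).

// Illustrative only: a handler remembers its position across re-created
// streams, while a brand-new handler starts from the beginning again.
use std::num::NonZeroU32;

use data_types::sequence::Sequence;
use futures::StreamExt;
use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler};
use write_buffer::mock::{MockBufferForReading, MockBufferSharedState};

async fn offset_semantics() {
    let state = MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::new(1).unwrap());
    state.push_lp(Sequence::new(0, 0), "cpu bar=1 0");
    state.push_lp(Sequence::new(0, 1), "cpu bar=2 10");

    let reader = MockBufferForReading::new(state, None).unwrap();

    // Same handler: the second stream resumes after the first entry.
    let mut handler_a = reader.stream_handler(0).await.unwrap();
    let first = handler_a.stream().next().await.unwrap().unwrap();
    let second = handler_a.stream().next().await.unwrap().unwrap();
    assert_ne!(
        first.meta().sequence().unwrap().number,
        second.meta().sequence().unwrap().number
    );

    // Fresh handler: state is not shared, so it sees the first entry again.
    let mut handler_b = reader.stream_handler(0).await.unwrap();
    let again = handler_b.stream().next().await.unwrap().unwrap();
    assert_eq!(
        first.meta().sequence().unwrap().number,
        again.meta().sequence().unwrap().number
    );
}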
@@ -348,37 +356,43 @@ pub mod test_utils { let entry_3 = "upc user=3 300"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 2); - let (sequencer_id_1, mut stream_1) = map_pop_first(&mut streams).unwrap(); - let (sequencer_id_2, mut stream_2) = map_pop_first(&mut streams).unwrap(); + // check that we have two different sequencer IDs + let mut sequencer_ids = reader.sequencer_ids(); + assert_eq!(sequencer_ids.len(), 2); + let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap(); + let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap(); assert_ne!(sequencer_id_1, sequencer_id_2); let waker = futures::task::noop_waker(); let mut cx = futures::task::Context::from_waker(&waker); + let mut stream_handler_1 = reader.stream_handler(sequencer_id_1).await.unwrap(); + let mut stream_handler_2 = reader.stream_handler(sequencer_id_2).await.unwrap(); + let mut stream_1 = stream_handler_1.stream(); + let mut stream_2 = stream_handler_2.stream(); + // empty streams are pending - assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); // entries arrive at the right target stream let w1 = write("namespace", &writer, entry_1, sequencer_id_1, None).await; - assert_write_op_eq(&stream_1.stream.next().await.unwrap().unwrap(), &w1); - assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert_write_op_eq(&stream_1.next().await.unwrap().unwrap(), &w1); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); let w2 = write("namespace", &writer, entry_2, sequencer_id_2, None).await; - assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); - assert_write_op_eq(&stream_2.stream.next().await.unwrap().unwrap(), &w2); + assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); + assert_write_op_eq(&stream_2.next().await.unwrap().unwrap(), &w2); let w3 = write("namespace", &writer, entry_3, sequencer_id_1, None).await; - assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); - assert_write_op_eq(&stream_1.stream.next().await.unwrap().unwrap(), &w3); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert_write_op_eq(&stream_1.next().await.unwrap().unwrap(), &w3); // streams are pending again - assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); } /// Test multiple multiple writers and multiple readers on multiple sequencers. 
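The per-sequencer fan-out adopted by the consumers earlier in this patch boils down to the pattern below. This is a sketch, not the real implementation: `eprintln!` stands in for the crate's tracing macros, and all metrics and shutdown handling are omitted.

// One task per sequencer, each owning its stream handler, with the shared
// reader kept around (e.g. for watermark queries).
use std::sync::Arc;

use futures::StreamExt;
use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler};

async fn spawn_consumers(reader: Arc<dyn WriteBufferReading>) {
    let mut tasks = Vec::new();

    for sequencer_id in reader.sequencer_ids() {
        let reader = Arc::clone(&reader);
        tasks.push(tokio::spawn(async move {
            let mut handler = match reader.stream_handler(sequencer_id).await {
                Ok(handler) => handler,
                Err(e) => {
                    eprintln!("cannot create handler for sequencer {}: {}", sequencer_id, e);
                    return;
                }
            };

            let mut stream = handler.stream();
            while let Some(res) = stream.next().await {
                match res {
                    // A real consumer would hand the operation to the DB here.
                    Ok(_op) => {}
                    Err(e) => eprintln!("sequencer {}: read error: {}", sequencer_id, e),
                }
            }
        }));
    }

    for task in tasks {
        let _ = task.await;
    }
}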
@@ -400,8 +414,8 @@ pub mod test_utils { let writer_1 = context.writing(true).await.unwrap(); let writer_2 = context.writing(true).await.unwrap(); - let mut reader_1 = context.reading(true).await.unwrap(); - let mut reader_2 = context.reading(true).await.unwrap(); + let reader_1 = context.reading(true).await.unwrap(); + let reader_2 = context.reading(true).await.unwrap(); let mut sequencer_ids_1 = writer_1.sequencer_ids(); let sequencer_ids_2 = writer_2.sequencer_ids(); @@ -414,22 +428,15 @@ pub mod test_utils { let w_west_1 = write("namespace", &writer_1, entry_west_1, sequencer_id_2, None).await; let w_east_2 = write("namespace", &writer_2, entry_east_2, sequencer_id_1, None).await; - assert_reader_content( - &mut reader_1, - &[ - (sequencer_id_1, &[&w_east_1, &w_east_2]), - (sequencer_id_2, &[&w_west_1]), - ], - ) - .await; - assert_reader_content( - &mut reader_2, - &[ - (sequencer_id_1, &[&w_east_1, &w_east_2]), - (sequencer_id_2, &[&w_west_1]), - ], - ) - .await; + let mut handler_1_1 = reader_1.stream_handler(sequencer_id_1).await.unwrap(); + let mut handler_1_2 = reader_1.stream_handler(sequencer_id_2).await.unwrap(); + let mut handler_2_1 = reader_2.stream_handler(sequencer_id_1).await.unwrap(); + let mut handler_2_2 = reader_2.stream_handler(sequencer_id_2).await.unwrap(); + + assert_reader_content(&mut handler_1_1, &[&w_east_1, &w_east_2]).await; + assert_reader_content(&mut handler_1_2, &[&w_west_1]).await; + assert_reader_content(&mut handler_2_1, &[&w_east_1, &w_east_2]).await; + assert_reader_content(&mut handler_2_2, &[&w_west_1]).await; } /// Test seek implemention of readers. @@ -455,46 +462,47 @@ pub mod test_utils { let writer = context.writing(true).await.unwrap(); - let w_east_1 = write("namespace", &writer, entry_east_1, 0, None).await; - let w_east_2 = write("namespace", &writer, entry_east_2, 0, None).await; - let w_west_1 = write("namespace", &writer, entry_west_1, 1, None).await; + let mut sequencer_ids = writer.sequencer_ids(); + let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap(); + let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap(); - let mut reader_1 = context.reading(true).await.unwrap(); - let mut reader_2 = context.reading(true).await.unwrap(); + let w_east_1 = write("namespace", &writer, entry_east_1, sequencer_id_1, None).await; + let w_east_2 = write("namespace", &writer, entry_east_2, sequencer_id_1, None).await; + let w_west_1 = write("namespace", &writer, entry_west_1, sequencer_id_2, None).await; + + let reader_1 = context.reading(true).await.unwrap(); + let reader_2 = context.reading(true).await.unwrap(); + + let mut handler_1_1_a = reader_1.stream_handler(sequencer_id_1).await.unwrap(); + let mut handler_1_2_a = reader_1.stream_handler(sequencer_id_2).await.unwrap(); + let mut handler_1_1_b = reader_1.stream_handler(sequencer_id_1).await.unwrap(); + let mut handler_1_2_b = reader_1.stream_handler(sequencer_id_2).await.unwrap(); + let mut handler_2_1 = reader_2.stream_handler(sequencer_id_1).await.unwrap(); + let mut handler_2_2 = reader_2.stream_handler(sequencer_id_2).await.unwrap(); // forward seek - reader_1 - .seek(0, w_east_2.meta().sequence().unwrap().number) + handler_1_1_a + .seek(w_east_2.meta().sequence().unwrap().number) .await .unwrap(); - assert_reader_content(&mut reader_1, &[(0, &[&w_east_2]), (1, &[&w_west_1])]).await; - assert_reader_content( - &mut reader_2, - &[(0, &[&w_east_1, &w_east_2]), (1, &[&w_west_1])], - ) - .await; + assert_reader_content(&mut handler_1_1_a, &[&w_east_2]).await; + 
assert_reader_content(&mut handler_1_2_a, &[&w_west_1]).await; + assert_reader_content(&mut handler_1_1_b, &[&w_east_1, &w_east_2]).await; + assert_reader_content(&mut handler_1_2_b, &[&w_west_1]).await; + assert_reader_content(&mut handler_2_1, &[&w_east_1, &w_east_2]).await; + assert_reader_content(&mut handler_2_2, &[&w_west_1]).await; // backward seek - reader_1.seek(0, 0).await.unwrap(); - assert_reader_content(&mut reader_1, &[(0, &[&w_east_1, &w_east_2]), (1, &[])]).await; + handler_1_1_a.seek(0).await.unwrap(); + assert_reader_content(&mut handler_1_1_a, &[&w_east_1, &w_east_2]).await; // seek to far end and then add data - reader_1.seek(0, 1_000_000).await.unwrap(); + handler_1_1_a.seek(1_000_000).await.unwrap(); write("namespace", &writer, entry_east_3, 0, None).await; - let mut streams = reader_1.streams(); - assert_eq!(streams.len(), 2); - let (_sequencer_id, mut stream_1) = map_pop_first(&mut streams).unwrap(); - let (_sequencer_id, mut stream_2) = map_pop_first(&mut streams).unwrap(); - assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); - drop(stream_1); - drop(stream_2); - drop(streams); - - // seeking unknown sequencer is NOT an error - reader_1.seek(0, 42).await.unwrap(); + assert!(handler_1_1_a.stream().poll_next_unpin(&mut cx).is_pending()); + assert!(handler_1_2_a.stream().poll_next_unpin(&mut cx).is_pending()); } /// Test watermark fetching. @@ -513,28 +521,33 @@ pub mod test_utils { let entry_west_1 = "upc,region=west user=1 200"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 2); - let (sequencer_id_1, stream_1) = map_pop_first(&mut streams).unwrap(); - let (sequencer_id_2, stream_2) = map_pop_first(&mut streams).unwrap(); + let mut sequencer_ids = writer.sequencer_ids(); + let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap(); + let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap(); // start at watermark 0 - assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0); - assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0); + assert_eq!( + reader.fetch_high_watermark(sequencer_id_1).await.unwrap(), + 0 + ); + assert_eq!( + reader.fetch_high_watermark(sequencer_id_2).await.unwrap(), + 0 + ); // high water mark moves write("namespace", &writer, entry_east_1, sequencer_id_1, None).await; let w1 = write("namespace", &writer, entry_east_2, sequencer_id_1, None).await; let w2 = write("namespace", &writer, entry_west_1, sequencer_id_2, None).await; assert_eq!( - (stream_1.fetch_high_watermark)().await.unwrap(), + reader.fetch_high_watermark(sequencer_id_1).await.unwrap(), w1.meta().sequence().unwrap().number + 1 ); assert_eq!( - (stream_2.fetch_high_watermark)().await.unwrap(), + reader.fetch_high_watermark(sequencer_id_2).await.unwrap(), w2.meta().sequence().unwrap().number + 1 ); } @@ -557,11 +570,11 @@ pub mod test_utils { let entry = "upc user=1 100"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); + let mut sequencer_ids = writer.sequencer_ids(); + assert_eq!(sequencer_ids.len(), 1); + let sequencer_id = set_pop_first(&mut 
sequencer_ids).unwrap(); let write = write("namespace", &writer, entry, sequencer_id, None).await; let reported_ts = write.meta().producer_ts().unwrap(); @@ -570,7 +583,8 @@ pub mod test_utils { time.inc(Duration::from_secs(10)); // check that the timestamp records the ingestion time, not the read time - let sequenced_entry = stream.stream.next().await.unwrap().unwrap(); + let mut handler = reader.stream_handler(sequencer_id).await.unwrap(); + let sequenced_entry = handler.stream().next().await.unwrap().unwrap(); let ts_entry = sequenced_entry.meta().producer_ts().unwrap(); assert_eq!(ts_entry, t0); assert_eq!(reported_ts, t0); @@ -603,7 +617,7 @@ pub mod test_utils { context.writing(false).await.unwrap(); } - /// Test sequencer IDs reporting of writers. + /// Test sequencer IDs reporting of readers and writers. /// /// This tests that: /// - all sequencers are reported @@ -618,11 +632,17 @@ pub mod test_utils { let writer_1 = context.writing(true).await.unwrap(); let writer_2 = context.writing(true).await.unwrap(); + let reader_1 = context.reading(true).await.unwrap(); + let reader_2 = context.reading(true).await.unwrap(); let sequencer_ids_1 = writer_1.sequencer_ids(); let sequencer_ids_2 = writer_2.sequencer_ids(); - assert_eq!(sequencer_ids_1, sequencer_ids_2); + let sequencer_ids_3 = reader_1.sequencer_ids(); + let sequencer_ids_4 = reader_2.sequencer_ids(); assert_eq!(sequencer_ids_1.len(), n_sequencers as usize); + assert_eq!(sequencer_ids_1, sequencer_ids_2); + assert_eq!(sequencer_ids_1, sequencer_ids_3); + assert_eq!(sequencer_ids_1, sequencer_ids_4); } /// Test that span contexts are propagated through the system. @@ -635,11 +655,13 @@ pub mod test_utils { let entry = "upc user=1 100"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); - let mut streams = reader.streams(); - assert_eq!(streams.len(), 1); - let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); + let mut sequencer_ids = writer.sequencer_ids(); + assert_eq!(sequencer_ids.len(), 1); + let sequencer_id = set_pop_first(&mut sequencer_ids).unwrap(); + let mut handler = reader.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); // 1: no context write("namespace", &writer, entry, sequencer_id, None).await; @@ -669,16 +691,16 @@ pub mod test_utils { .await; // check write 1 - let write_1 = stream.stream.next().await.unwrap().unwrap(); + let write_1 = stream.next().await.unwrap().unwrap(); assert!(write_1.meta().span_context().is_none()); // check write 2 - let write_2 = stream.stream.next().await.unwrap().unwrap(); + let write_2 = stream.next().await.unwrap().unwrap(); let actual_context_1 = write_2.meta().span_context().unwrap(); assert_span_context_eq_or_linked(&span_context_1, actual_context_1, collector.spans()); // check write 3 - let write_3 = stream.stream.next().await.unwrap().unwrap(); + let write_3 = stream.next().await.unwrap().unwrap(); let actual_context_2 = write_3.meta().span_context().unwrap(); assert_span_context_eq_or_linked(&span_context_2, actual_context_2, collector.spans()); } @@ -719,7 +741,7 @@ pub mod test_utils { let entry_2 = "upc,region=east user=2 200"; let writer = context.writing(true).await.unwrap(); - let mut reader = context.reading(true).await.unwrap(); + let reader = context.reading(true).await.unwrap(); let mut sequencer_ids = writer.sequencer_ids(); assert_eq!(sequencer_ids.len(), 1); @@ -728,7 +750,8 @@ pub mod test_utils { let 
w1 = write("namespace_1", &writer, entry_2, sequencer_id, None).await; let w2 = write("namespace_2", &writer, entry_1, sequencer_id, None).await; - assert_reader_content(&mut reader, &[(sequencer_id, &[&w1, &w2])]).await; + let mut handler = reader.stream_handler(sequencer_id).await.unwrap(); + assert_reader_content(&mut handler, &[&w1, &w2]).await; } /// Dummy test to ensure that flushing somewhat works. @@ -770,57 +793,30 @@ pub mod test_utils { /// Assert that the content of the reader is as expected. /// - /// This will read `expected.len()` from the reader and then ensures that the stream is pending. - async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&DmlWrite])]) - where - R: WriteBufferReading, - { - // normalize expected values - let expected = { - let mut expected = expected.to_vec(); - expected.sort_by_key(|(sequencer_id, _entries)| *sequencer_id); - expected - }; + /// This will read `expected_writes.len()` from the reader and then ensures that the stream is pending. + async fn assert_reader_content( + actual_stream_handler: &mut Box, + expected_writes: &[&DmlWrite], + ) { + let actual_stream = actual_stream_handler.stream(); - // Ensure content of the streams - let streams = reader.streams(); - assert_eq!(streams.len(), expected.len()); + // we need to limit the stream to `expected_writes.len()` elements, otherwise it might be pending forever + let actual_writes: Vec<_> = actual_stream + .take(expected_writes.len()) + .try_collect() + .await + .unwrap(); - for ((actual_sequencer_id, actual_stream), (expected_sequencer_id, expected_writes)) in - streams.into_iter().zip(expected.iter()) - { - assert_eq!(actual_sequencer_id, *expected_sequencer_id); - - // we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever - let results: Vec<_> = actual_stream - .stream - .take(expected_writes.len()) - .try_collect() - .await - .unwrap(); - - let actual_writes: Vec<_> = results.iter().collect(); - assert_eq!(actual_writes.len(), expected_writes.len()); - for (actual, expected) in actual_writes.iter().zip(expected_writes.iter()) { - assert_write_op_eq(actual, expected); - } + assert_eq!(actual_writes.len(), expected_writes.len()); + for (actual, expected) in actual_writes.iter().zip(expected_writes.iter()) { + assert_write_op_eq(actual, expected); } - // Ensure that streams a pending - let streams = reader.streams(); - assert_eq!(streams.len(), expected.len()); - + // Ensure that stream is pending let waker = futures::task::noop_waker(); let mut cx = futures::task::Context::from_waker(&waker); - - for ((actual_sequencer_id, mut actual_stream), (expected_sequencer_id, _expected_writes)) in - streams.into_iter().zip(expected.iter()) - { - assert_eq!(actual_sequencer_id, *expected_sequencer_id); - - // empty stream is pending - assert!(actual_stream.stream.poll_next_unpin(&mut cx).is_pending()); - } + let mut actual_stream = actual_stream_handler.stream(); + assert!(actual_stream.poll_next_unpin(&mut cx).is_pending()); } /// Asserts that given span context are the same or that `second` links back to `first`. @@ -854,20 +850,6 @@ pub mod test_utils { assert_eq!(first.parent_span_id, second.parent_span_id); } - /// Pops first entry from map. - /// - /// Helper until is stable. - pub(crate) fn map_pop_first(map: &mut BTreeMap) -> Option<(K, V)> - where - K: Clone + Ord, - { - map.keys() - .next() - .cloned() - .map(|k| map.remove_entry(&k)) - .flatten() - } - /// Pops first entry from set. /// /// Helper until is stable. 
diff --git a/write_buffer/src/file.rs b/write_buffer/src/file.rs index 6337137963..b1d7a5fec9 100644 --- a/write_buffer/src/file.rs +++ b/write_buffer/src/file.rs @@ -119,21 +119,21 @@ use std::{ }, }; -use crate::codec::{ContentType, IoxHeaders}; +use crate::{ + codec::{ContentType, IoxHeaders}, + core::WriteBufferStreamHandler, +}; use async_trait::async_trait; use data_types::{sequence::Sequence, write_buffer::WriteBufferCreationConfig}; use dml::{DmlMeta, DmlOperation}; -use futures::{FutureExt, Stream, StreamExt}; +use futures::{stream::BoxStream, Stream, StreamExt}; use pin_project::pin_project; use time::{Time, TimeProvider}; use tokio_util::sync::ReusableBoxFuture; use trace::TraceCollector; use uuid::Uuid; -use crate::core::{ - FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, - WriteBufferWriting, WriteStream, -}; +use crate::core::{WriteBufferError, WriteBufferReading, WriteBufferWriting}; /// Header used to declare the creation time of the message. pub const HEADER_TIME: &str = "last-modified"; @@ -260,6 +260,35 @@ impl WriteBufferWriting for FileBufferProducer { } } +#[derive(Debug)] +pub struct FileBufferStreamHandler { + sequencer_id: u32, + path: PathBuf, + next_sequence_number: Arc, + trace_collector: Option>, +} + +#[async_trait] +impl WriteBufferStreamHandler for FileBufferStreamHandler { + fn stream(&mut self) -> BoxStream<'_, Result> { + let committed = self.path.join("committed"); + + ConsumerStream::new( + self.sequencer_id, + committed, + Arc::clone(&self.next_sequence_number), + self.trace_collector.clone(), + ) + .boxed() + } + + async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> { + self.next_sequence_number + .store(sequence_number, Ordering::SeqCst); + Ok(()) + } +} + /// File-based write buffer reader. 
#[derive(Debug)] pub struct FileBufferConsumer { @@ -291,56 +320,39 @@ impl FileBufferConsumer { #[async_trait] impl WriteBufferReading for FileBufferConsumer { - fn streams(&mut self) -> BTreeMap> { - let mut streams = BTreeMap::default(); - - for (sequencer_id, (sequencer_path, next_sequence_number)) in &self.dirs { - let committed = sequencer_path.join("committed"); - - let stream = ConsumerStream::new( - *sequencer_id, - committed.clone(), - Arc::clone(next_sequence_number), - self.trace_collector.clone(), - ) - .boxed(); - - let fetch_high_watermark = move || { - let committed = committed.clone(); - - let fut = async move { watermark(&committed).await }; - fut.boxed() as FetchHighWatermarkFut<'_> - }; - let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; - - streams.insert( - *sequencer_id, - WriteStream { - stream, - fetch_high_watermark, - }, - ); - } - - streams + fn sequencer_ids(&self) -> BTreeSet { + self.dirs.keys().copied().collect() } - async fn seek( - &mut self, + async fn stream_handler( + &self, sequencer_id: u32, - sequence_number: u64, - ) -> Result<(), WriteBufferError> { - let path_and_next_sequence_number = self + ) -> Result, WriteBufferError> { + let (path, _next_sequence_number) = self .dirs .get(&sequencer_id) .ok_or_else::(|| { format!("Unknown sequencer: {}", sequencer_id).into() })?; - path_and_next_sequence_number - .1 - .store(sequence_number, Ordering::SeqCst); - Ok(()) + Ok(Box::new(FileBufferStreamHandler { + sequencer_id, + path: path.clone(), + next_sequence_number: Arc::new(AtomicU64::new(0)), + trace_collector: self.trace_collector.clone(), + })) + } + + async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result { + let (path, _next_sequence_number) = self + .dirs + .get(&sequencer_id) + .ok_or_else::(|| { + format!("Unknown sequencer: {}", sequencer_id).into() + })?; + let committed = path.join("committed"); + + watermark(&committed).await } fn type_name(&self) -> &'static str { @@ -792,11 +804,12 @@ mod tests { ) .await; - let mut reader = ctx.reading(true).await.unwrap(); - let mut stream = reader.streams().remove(&sequencer_id).unwrap(); + let reader = ctx.reading(true).await.unwrap(); + let mut handler = reader.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w4); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w4); } #[tokio::test] @@ -820,9 +833,10 @@ mod tests { ) .await; - let mut reader = ctx.reading(true).await.unwrap(); - let mut stream = reader.streams().remove(&sequencer_id).unwrap(); + let reader = ctx.reading(true).await.unwrap(); + let mut handler = reader.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); - assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2); + assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2); } } diff --git a/write_buffer/src/kafka/config.rs b/write_buffer/src/kafka/config.rs index 2d862e73fa..61ea8c0c54 100644 --- a/write_buffer/src/kafka/config.rs +++ b/write_buffer/src/kafka/config.rs @@ -63,7 +63,7 @@ impl TryFrom<&WriteBufferCreationConfig> for TopicCreationConfig { } /// Config for consumers. 
-#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct ConsumerConfig { /// Will wait for at least `min_batch_size` bytes of data /// diff --git a/write_buffer/src/kafka/mod.rs b/write_buffer/src/kafka/mod.rs index d694f5c618..1a47531f5e 100644 --- a/write_buffer/src/kafka/mod.rs +++ b/write_buffer/src/kafka/mod.rs @@ -9,7 +9,7 @@ use std::{ use async_trait::async_trait; use data_types::{sequence::Sequence, write_buffer::WriteBufferCreationConfig}; use dml::{DmlMeta, DmlOperation}; -use futures::{FutureExt, StreamExt}; +use futures::{stream::BoxStream, StreamExt}; use rskafka::client::{ consumer::StreamConsumerBuilder, error::{Error as RSKafkaError, ProtocolError}, @@ -22,10 +22,7 @@ use trace::TraceCollector; use crate::{ codec::IoxHeaders, - core::{ - FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, - WriteBufferWriting, WriteStream, - }, + core::{WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting}, }; use self::{ @@ -119,14 +116,81 @@ impl WriteBufferWriting for RSKafkaProducer { } #[derive(Debug)] -struct ConsumerPartition { +pub struct RSKafkaStreamHandler { partition_client: Arc, next_offset: Arc, + trace_collector: Option>, + consumer_config: ConsumerConfig, + sequencer_id: u32, +} + +#[async_trait] +impl WriteBufferStreamHandler for RSKafkaStreamHandler { + fn stream(&mut self) -> BoxStream<'_, Result> { + let trace_collector = self.trace_collector.clone(); + let next_offset = Arc::clone(&self.next_offset); + + let mut stream_builder = StreamConsumerBuilder::new( + Arc::clone(&self.partition_client), + next_offset.load(Ordering::SeqCst), + ); + if let Some(max_wait_ms) = self.consumer_config.max_wait_ms { + stream_builder = stream_builder.with_max_wait_ms(max_wait_ms); + } + if let Some(min_batch_size) = self.consumer_config.min_batch_size { + stream_builder = stream_builder.with_min_batch_size(min_batch_size); + } + if let Some(max_batch_size) = self.consumer_config.max_batch_size { + stream_builder = stream_builder.with_max_batch_size(max_batch_size); + } + let stream = stream_builder.build(); + + let stream = stream.map(move |res| { + let (record, _watermark) = res?; + + // store new offset already so we don't get stuck on invalid records + next_offset.store(record.offset + 1, Ordering::SeqCst); + + let kafka_read_size = record.record.approximate_size(); + + let headers = + IoxHeaders::from_headers(record.record.headers, trace_collector.as_ref())?; + + let sequence = Sequence { + id: self.sequencer_id, + number: record.offset.try_into()?, + }; + + let timestamp_millis = + i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000)?; + let timestamp = Time::from_timestamp_millis_opt(timestamp_millis) + .ok_or_else::(|| { + format!( + "Cannot parse timestamp for milliseconds: {}", + timestamp_millis + ) + .into() + })?; + + let value = record + .record + .value + .ok_or_else::(|| "Value missing".to_string().into())?; + crate::codec::decode(&value, headers, sequence, timestamp, kafka_read_size) + }); + stream.boxed() + } + + async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> { + let offset = i64::try_from(sequence_number)?; + self.next_offset.store(offset, Ordering::SeqCst); + Ok(()) + } } #[derive(Debug)] pub struct RSKafkaConsumer { - partitions: BTreeMap, + partition_clients: BTreeMap>, trace_collector: Option>, consumer_config: ConsumerConfig, } @@ -147,24 +211,13 @@ impl RSKafkaConsumer { ) .await?; - let partitions = partition_clients + let 
partition_clients = partition_clients .into_iter() - .map(|(partition_id, partition_client)| { - let partition_client = Arc::new(partition_client); - let next_offset = Arc::new(AtomicI64::new(0)); - - ( - partition_id, - ConsumerPartition { - partition_client, - next_offset, - }, - ) - }) + .map(|(k, v)| (k, Arc::new(v))) .collect(); Ok(Self { - partitions, + partition_clients, trace_collector, consumer_config: ConsumerConfig::try_from(connection_config)?, }) @@ -173,103 +226,40 @@ impl RSKafkaConsumer { #[async_trait] impl WriteBufferReading for RSKafkaConsumer { - fn streams(&mut self) -> BTreeMap> { - let mut streams = BTreeMap::new(); - - for (sequencer_id, partition) in &self.partitions { - let trace_collector = self.trace_collector.clone(); - let next_offset = Arc::clone(&partition.next_offset); - - let mut stream_builder = StreamConsumerBuilder::new( - Arc::clone(&partition.partition_client), - next_offset.load(Ordering::SeqCst), - ); - if let Some(max_wait_ms) = self.consumer_config.max_wait_ms { - stream_builder = stream_builder.with_max_wait_ms(max_wait_ms); - } - if let Some(min_batch_size) = self.consumer_config.min_batch_size { - stream_builder = stream_builder.with_min_batch_size(min_batch_size); - } - if let Some(max_batch_size) = self.consumer_config.max_batch_size { - stream_builder = stream_builder.with_max_batch_size(max_batch_size); - } - let stream = stream_builder.build(); - - let stream = stream.map(move |res| { - let (record, _watermark) = res?; - - // store new offset already so we don't get stuck on invalid records - next_offset.store(record.offset + 1, Ordering::SeqCst); - - let kafka_read_size = record.record.approximate_size(); - - let headers = - IoxHeaders::from_headers(record.record.headers, trace_collector.as_ref())?; - - let sequence = Sequence { - id: *sequencer_id, - number: record.offset.try_into()?, - }; - - let timestamp_millis = - i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000)?; - let timestamp = Time::from_timestamp_millis_opt(timestamp_millis) - .ok_or_else::(|| { - format!( - "Cannot parse timestamp for milliseconds: {}", - timestamp_millis - ) - .into() - })?; - - let value = record - .record - .value - .ok_or_else::(|| "Value missing".to_string().into())?; - crate::codec::decode(&value, headers, sequence, timestamp, kafka_read_size) - }); - let stream = stream.boxed(); - - let partition_client = Arc::clone(&partition.partition_client); - let fetch_high_watermark = move || { - let partition_client = Arc::clone(&partition_client); - let fut = async move { - let watermark = partition_client.get_high_watermark().await?; - u64::try_from(watermark).map_err(|e| Box::new(e) as WriteBufferError) - }; - - fut.boxed() as FetchHighWatermarkFut<'_> - }; - let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; - - streams.insert( - *sequencer_id, - WriteStream { - stream, - fetch_high_watermark, - }, - ); - } - - streams + fn sequencer_ids(&self) -> BTreeSet { + self.partition_clients.keys().copied().collect() } - async fn seek( - &mut self, + async fn stream_handler( + &self, sequencer_id: u32, - sequence_number: u64, - ) -> Result<(), WriteBufferError> { - let partition = self - .partitions - .get_mut(&sequencer_id) + ) -> Result, WriteBufferError> { + let partition_client = self + .partition_clients + .get(&sequencer_id) .ok_or_else::(|| { format!("Unknown partition: {}", sequencer_id).into() })?; - let offset = i64::try_from(sequence_number)?; - partition.next_offset.store(offset, Ordering::SeqCst); 
+ Ok(Box::new(RSKafkaStreamHandler { + partition_client: Arc::clone(partition_client), + next_offset: Arc::new(AtomicI64::new(0)), + trace_collector: self.trace_collector.clone(), + consumer_config: self.consumer_config.clone(), + sequencer_id, + })) + } - Ok(()) + async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result { + let partition_client = self + .partition_clients + .get(&sequencer_id) + .ok_or_else::(|| { + format!("Unknown partition: {}", sequencer_id).into() + })?; + + let watermark = partition_client.get_high_watermark().await?; + u64::try_from(watermark).map_err(|e| Box::new(e) as WriteBufferError) } fn type_name(&self) -> &'static str { @@ -344,8 +334,8 @@ mod tests { use crate::{ core::test_utils::{ - assert_span_context_eq_or_linked, map_pop_first, perform_generic_tests, - random_topic_name, set_pop_first, TestAdapter, TestContext, + assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name, + set_pop_first, TestAdapter, TestContext, }, maybe_skip_kafka_integration, }; @@ -506,22 +496,18 @@ mod tests { ) .await; - let mut consumer = ctx.reading(true).await.unwrap(); + let consumer = ctx.reading(true).await.unwrap(); + let mut handler = consumer.stream_handler(sequencer_id).await.unwrap(); // read broken message from stream - let mut streams = consumer.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); - let err = stream.stream.next().await.unwrap().unwrap_err(); + let mut stream = handler.stream(); + let err = stream.next().await.unwrap().unwrap_err(); assert_eq!(err.to_string(), "No content type header"); // re-creating the stream should advance past the broken message drop(stream); - drop(streams); - let mut streams = consumer.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); - let op = stream.stream.next().await.unwrap().unwrap(); + let mut stream = handler.stream(); + let op = stream.next().await.unwrap().unwrap(); assert_write_op_eq(&op, &w); } @@ -564,17 +550,16 @@ mod tests { assert_ne!(w2_1.sequence().unwrap(), w1_1.sequence().unwrap()); assert_eq!(w2_1.sequence().unwrap(), w2_2.sequence().unwrap()); - let mut consumer = ctx.reading(true).await.unwrap(); - let mut streams = consumer.streams(); - assert_eq!(streams.len(), 1); - let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap(); + let consumer = ctx.reading(true).await.unwrap(); + let mut handler = consumer.stream_handler(sequencer_id).await.unwrap(); + let mut stream = handler.stream(); // get output, note that the write operations were fused - let op_w1_12 = stream.stream.next().await.unwrap().unwrap(); - let op_d1_1 = stream.stream.next().await.unwrap().unwrap(); - let op_d1_2 = stream.stream.next().await.unwrap().unwrap(); - let op_w1_34 = stream.stream.next().await.unwrap().unwrap(); - let op_w2_12 = stream.stream.next().await.unwrap().unwrap(); + let op_w1_12 = stream.next().await.unwrap().unwrap(); + let op_d1_1 = stream.next().await.unwrap().unwrap(); + let op_d1_2 = stream.next().await.unwrap().unwrap(); + let op_w1_34 = stream.next().await.unwrap().unwrap(); + let op_w2_12 = stream.next().await.unwrap().unwrap(); // ensure that sequence numbers map as expected assert_eq!( diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index 2f4b7a349e..c815fe1b8a 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -6,7 +6,7 @@ use std::{ }; use async_trait::async_trait; -use futures::{stream, FutureExt, 
StreamExt}; +use futures::{stream::BoxStream, StreamExt}; use parking_lot::Mutex; use data_types::sequence::Sequence; @@ -15,8 +15,7 @@ use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite}; use time::TimeProvider; use crate::core::{ - FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, - WriteBufferWriting, WriteStream, + WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting, }; #[derive(Debug, Default)] @@ -344,18 +343,9 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors { } } -/// Sequencer-specific playback state -struct PlaybackState { - /// Index within the entry vector. - vector_index: usize, - - /// Offset within the sequencer IDs. - offset: u64, -} - pub struct MockBufferForReading { shared_state: MockBufferSharedState, - playback_states: Arc>>, + n_sequencers: u32, } impl MockBufferForReading { @@ -375,21 +365,10 @@ impl MockBufferForReading { }; entries.len() as u32 }; - let playback_states: BTreeMap<_, _> = (0..n_sequencers) - .map(|sequencer_id| { - ( - sequencer_id, - PlaybackState { - vector_index: 0, - offset: 0, - }, - ) - }) - .collect(); Ok(Self { shared_state: state, - playback_states: Arc::new(Mutex::new(playback_states)), + n_sequencers, }) } } @@ -400,104 +379,106 @@ impl std::fmt::Debug for MockBufferForReading { } } +/// Sequencer-specific playback state +#[derive(Debug)] +pub struct MockBufferStreamHandler { + /// Shared state. + shared_state: MockBufferSharedState, + + /// Own sequencer ID. + sequencer_id: u32, + + /// Index within the entry vector. + vector_index: usize, + + /// Offset within the sequencer IDs. + offset: u64, +} + #[async_trait] -impl WriteBufferReading for MockBufferForReading { - fn streams(&mut self) -> BTreeMap> { - let sequencer_ids: Vec<_> = { - let playback_states = self.playback_states.lock(); - playback_states.keys().copied().collect() - }; +impl WriteBufferStreamHandler for MockBufferStreamHandler { + fn stream(&mut self) -> BoxStream<'_, Result> { + futures::stream::poll_fn(|cx| { + let mut guard = self.shared_state.writes.lock(); + let writes = guard.as_mut().unwrap(); + let writes_vec = writes.get_mut(&self.sequencer_id).unwrap(); - let mut streams = BTreeMap::new(); - for sequencer_id in sequencer_ids { - let shared_state = self.shared_state.clone(); - let playback_states = Arc::clone(&self.playback_states); + let entries = &writes_vec.writes; + while entries.len() > self.vector_index { + let write_result = &entries[self.vector_index]; - let stream = stream::poll_fn(move |cx| { - let mut guard = shared_state.writes.lock(); - let writes = guard.as_mut().unwrap(); - let writes_vec = writes.get_mut(&sequencer_id).unwrap(); + // consume entry + self.vector_index += 1; - let mut playback_states = playback_states.lock(); - let playback_state = playback_states.get_mut(&sequencer_id).unwrap(); - - let entries = &writes_vec.writes; - while entries.len() > playback_state.vector_index { - let write_result = &entries[playback_state.vector_index]; - - // consume entry - playback_state.vector_index += 1; - - match write_result { - Ok(write) => { - // found an entry => need to check if it is within the offset - let sequence = write.meta().sequence().unwrap(); - if sequence.number >= playback_state.offset { - // within offset => return entry to caller - return Poll::Ready(Some(Ok(write.clone()))); - } else { - // offset is larger then the current entry => ignore entry and try next - continue; - } - } - Err(e) => { - // found an error => return entry to caller - return 
Poll::Ready(Some(Err(e.to_string().into()))); + match write_result { + Ok(write) => { + // found an entry => need to check if it is within the offset + let sequence = write.meta().sequence().unwrap(); + if sequence.number >= self.offset { + // within offset => return entry to caller + return Poll::Ready(Some(Ok(write.clone()))); + } else { + // offset is larger then the current entry => ignore entry and try next + continue; } } + Err(e) => { + // found an error => return entry to caller + return Poll::Ready(Some(Err(e.to_string().into()))); + } } + } - // we are at the end of the recorded entries => report pending - writes_vec.register_waker(cx.waker()); - Poll::Pending - }) - .boxed(); - - let shared_state = self.shared_state.clone(); - - let fetch_high_watermark = move || { - let shared_state = shared_state.clone(); - - let fut = async move { - let guard = shared_state.writes.lock(); - let entries = guard.as_ref().unwrap(); - let entry_vec = entries.get(&sequencer_id).unwrap(); - let watermark = entry_vec.max_seqno.map(|n| n + 1).unwrap_or(0); - - Ok(watermark) - }; - fut.boxed() as FetchHighWatermarkFut<'_> - }; - let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; - - streams.insert( - sequencer_id, - WriteStream { - stream, - fetch_high_watermark, - }, - ); - } - - streams + // we are at the end of the recorded entries => report pending + writes_vec.register_waker(cx.waker()); + Poll::Pending + }) + .boxed() } - async fn seek( - &mut self, - sequencer_id: u32, - sequence_number: u64, - ) -> Result<(), WriteBufferError> { - let mut playback_states = self.playback_states.lock(); + async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> { + self.offset = sequence_number; - if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { - playback_state.offset = sequence_number; - - // reset position to start since seeking might go backwards - playback_state.vector_index = 0; - } + // reset position to start since seeking might go backwards + self.vector_index = 0; Ok(()) } +} + +#[async_trait] +impl WriteBufferReading for MockBufferForReading { + fn sequencer_ids(&self) -> BTreeSet { + (0..self.n_sequencers).into_iter().collect() + } + async fn stream_handler( + &self, + sequencer_id: u32, + ) -> Result, WriteBufferError> { + if sequencer_id >= self.n_sequencers { + return Err(format!("Unknown sequencer: {}", sequencer_id).into()); + } + + Ok(Box::new(MockBufferStreamHandler { + shared_state: self.shared_state.clone(), + sequencer_id, + vector_index: 0, + offset: 0, + })) + } + + async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result { + let guard = self.shared_state.writes.lock(); + let entries = guard.as_ref().unwrap(); + let entry_vec = entries + .get(&sequencer_id) + .ok_or_else::(|| { + format!("Unknown sequencer: {}", sequencer_id).into() + })?; + let watermark = entry_vec.max_seqno.map(|n| n + 1).unwrap_or(0); + + Ok(watermark) + } fn type_name(&self) -> &'static str { "mock" @@ -507,40 +488,42 @@ impl WriteBufferReading for MockBufferForReading { #[derive(Debug, Default, Clone, Copy)] pub struct MockBufferForReadingThatAlwaysErrors; +#[derive(Debug, Default, Clone, Copy)] +pub struct MockStreamHandlerThatAlwaysErrors; + #[async_trait] -impl WriteBufferReading for MockBufferForReadingThatAlwaysErrors { - fn streams(&mut self) -> BTreeMap> { - let stream = stream::poll_fn(|_ctx| { +impl WriteBufferStreamHandler for MockStreamHandlerThatAlwaysErrors { + fn stream(&mut self) -> BoxStream<'_, Result> { + 
futures::stream::poll_fn(|_cx| { Poll::Ready(Some(Err(String::from( "Something bad happened while reading from stream", ) .into()))) }) - .boxed(); - let fetch_high_watermark = move || { - let fut = async move { - Err(String::from("Something bad happened while fetching the high watermark").into()) - }; - fut.boxed() as FetchHighWatermarkFut<'_> - }; - let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; - IntoIterator::into_iter([( - 0, - WriteStream { - stream, - fetch_high_watermark, - }, - )]) - .collect() + .boxed() } - async fn seek( - &mut self, - _sequencer_id: u32, - _sequence_number: u64, - ) -> Result<(), WriteBufferError> { + async fn seek(&mut self, _sequence_number: u64) -> Result<(), WriteBufferError> { Err(String::from("Something bad happened while seeking the stream").into()) } +} + +#[async_trait] +impl WriteBufferReading for MockBufferForReadingThatAlwaysErrors { + fn sequencer_ids(&self) -> BTreeSet { + BTreeSet::from([0]) + } + + async fn stream_handler( + &self, + _sequencer_id: u32, + ) -> Result, WriteBufferError> { + Ok(Box::new(MockStreamHandlerThatAlwaysErrors {})) + } + + async fn fetch_high_watermark(&self, _sequencer_id: u32) -> Result { + Err(String::from("Something bad happened while fetching the high watermark").into()) + } fn type_name(&self) -> &'static str { "mock_failing" @@ -552,11 +535,12 @@ mod tests { use std::convert::TryFrom; use std::time::Duration; + use futures::StreamExt; use mutable_batch_lp::lines_to_batches; use time::TimeProvider; use trace::RingBufferTraceCollector; - use crate::core::test_utils::{map_pop_first, perform_generic_tests, TestAdapter, TestContext}; + use crate::core::test_utils::{perform_generic_tests, TestAdapter, TestContext}; use super::*; @@ -739,26 +723,34 @@ mod tests { #[tokio::test] async fn test_always_error_read() { - let mut reader = MockBufferForReadingThatAlwaysErrors {}; + let reader = MockBufferForReadingThatAlwaysErrors {}; assert_eq!( - reader.seek(0, 0).await.unwrap_err().to_string(), - "Something bad happened while seeking the stream" - ); - - let mut streams = reader.streams(); - let (_id, mut stream) = map_pop_first(&mut streams).unwrap(); - assert_eq!( - stream.stream.next().await.unwrap().unwrap_err().to_string(), - "Something bad happened while reading from stream" - ); - assert_eq!( - (stream.fetch_high_watermark)() + reader + .fetch_high_watermark(0) .await .unwrap_err() .to_string(), "Something bad happened while fetching the high watermark" ); + + let mut stream_handler = reader.stream_handler(0).await.unwrap(); + + assert_eq!( + stream_handler.seek(0).await.unwrap_err().to_string(), + "Something bad happened while seeking the stream" + ); + + assert_eq!( + stream_handler + .stream() + .next() + .await + .unwrap() + .unwrap_err() + .to_string(), + "Something bad happened while reading from stream" + ); } #[tokio::test] @@ -823,19 +815,20 @@ mod tests { state.push_lp(Sequence::new(0, 0), "mem foo=1 10"); - let mut read = MockBufferForReading::new(state.clone(), None).unwrap(); - let playback_state = Arc::clone(&read.playback_states); + let read = MockBufferForReading::new(state.clone(), None).unwrap(); + let barrier = Arc::new(tokio::sync::Barrier::new(2)); + let barrier_captured = Arc::clone(&barrier); let consumer = tokio::spawn(async move { - let mut stream = map_pop_first(&mut read.streams()).unwrap().1.stream; + let mut stream_handler = read.stream_handler(0).await.unwrap(); + let mut stream = stream_handler.stream(); stream.next().await.unwrap().unwrap(); + 
barrier_captured.wait().await; stream.next().await.unwrap().unwrap(); }); // Wait for consumer to read first entry - while playback_state.lock().get(&0).unwrap().vector_index < 1 { - tokio::time::sleep(Duration::from_millis(1)).await; - } + barrier.wait().await; state.push_lp(Sequence::new(0, 1), "mem foo=2 20"); From 17fbeaaadedf8eafe1a7561db17fd87ae7b7e178 Mon Sep 17 00:00:00 2001 From: Nga Tran Date: Mon, 7 Feb 2022 09:44:15 -0500 Subject: [PATCH 07/30] feat: insert the persisted info into the catalog in one transaction (#3636) * feat: add ProcessedTombstoneRepo * feat: add function add_parquet_file_with_tombstones * fix: remove unecessary use * feat: handling transaction when adding parquet file and its processed tombstones * feat: tests update catalog for parquet file and processed tombstones * fix: make add parquet file & its processed tombstones fully transactional * chore: cleanup * test: add integration tests for new catalog update functions * chore: remove catalog_update.rs * chore: cleanup * fix: assert the right values * fix: create unique namespace * fix: support non transaction create_many * test: remove tests that do not work in a transaction * fix: one more case with unique namespace * chore: more verification around for better understanding why certain tests fail * fix: compare difference rather than absolute becasue the DB already has data * fix: fix the argument provided to SQL * fix: return non-empty processed tombstones * fix: insert the right parquet file * chore: remove unsed file Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- iox_catalog/src/interface.rs | 289 +++++++++++++++++++++++++++++++++++ iox_catalog/src/mem.rs | 161 ++++++++++++++++++- iox_catalog/src/postgres.rs | 215 +++++++++++++++++++++++--- parquet_file/src/metadata.rs | 24 ++- 4 files changed, 666 insertions(+), 23 deletions(-) diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 043855e05d..2ca9d3e4a0 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -4,6 +4,7 @@ use async_trait::async_trait; use influxdb_line_protocol::FieldValue; use schema::{InfluxColumnType, InfluxFieldType}; use snafu::{OptionExt, Snafu}; +use sqlx::{Postgres, Transaction}; use std::convert::TryFrom; use std::fmt::Formatter; use std::{collections::BTreeMap, fmt::Debug}; @@ -41,6 +42,12 @@ pub enum Error { #[snafu(display("parquet file with object_store_id {} already exists", object_store_id))] FileExists { object_store_id: Uuid }, + #[snafu(display("parquet file with id {} does not exist. Foreign key violation", id))] + FileNotFound { id: i64 }, + + #[snafu(display("tombstone with id {} does not exist. 
Foreign key violation", id))] + TombstoneNotFound { id: i64 }, + #[snafu(display("parquet_file record {} not found", id))] ParquetRecordNotFound { id: ParquetFileId }, @@ -49,6 +56,25 @@ pub enum Error { source: Box, name: String, }, + + #[snafu(display("Cannot start a transaction: {}", source))] + StartTransaction { source: sqlx::Error }, + + #[snafu(display("No transaction provided"))] + NoTransaction, + + #[snafu(display( + "the tombstone {} already processed for parquet file {}", + tombstone_id, + parquet_file_id + ))] + ProcessTombstoneExists { + tombstone_id: i64, + parquet_file_id: i64, + }, + + #[snafu(display("Error while converting usize {} to i64", value))] + InvalidValue { value: usize }, } /// A specialized `Error` for Catalog errors @@ -309,6 +335,16 @@ pub trait Catalog: Send + Sync + Debug { /// repo for parquet_files fn parquet_files(&self) -> &dyn ParquetFileRepo; + + /// repo for processed_tombstones + fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo; + + /// Insert the conpacted parquet file and its tombstones into the catalog in one transaction + async fn add_parquet_file_with_tombstones( + &self, + parquet_file: &ParquetFile, + tombstones: &[Tombstone], + ) -> Result<(ParquetFile, Vec), Error>; } /// Functions for working with Kafka topics in the catalog. @@ -443,6 +479,8 @@ pub trait ParquetFileRepo: Send + Sync { #[allow(clippy::too_many_arguments)] async fn create( &self, + // this transaction is only provided when this record is inserted in a transaction + txt: Option<&mut Transaction<'_, Postgres>>, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -465,6 +503,34 @@ pub trait ParquetFileRepo: Send + Sync { sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result>; + + /// Verify if the parquet file exists by selecting its id + async fn exist(&self, id: ParquetFileId) -> Result; + + /// Return count + async fn count(&self) -> Result; +} + +/// Functions for working with processed tombstone pointers in the catalog +#[async_trait] +pub trait ProcessedTombstoneRepo: Send + Sync { + /// create processed tombstones + async fn create_many( + &self, + txt: Option<&mut Transaction<'_, Postgres>>, + parquet_file_id: ParquetFileId, + tombstones: &[Tombstone], + ) -> Result>; + + /// Verify if a processed tombstone exists in the catalog + async fn exist( + &self, + parquet_file_id: ParquetFileId, + tombstone_id: TombstoneId, + ) -> Result; + + /// Return count + async fn count(&self) -> Result; } /// Data object for a kafka topic @@ -864,6 +930,15 @@ pub struct ParquetFile { pub to_delete: bool, } +/// Data for a processed tombstone reference in the catalog. 
+#[derive(Debug, Copy, Clone, PartialEq, sqlx::FromRow)] +pub struct ProcessedTombstone { + /// the id of the tombstone applied to the parquet file + pub tombstone_id: TombstoneId, + /// the id of the parquet file the tombstone was applied + pub parquet_file_id: ParquetFileId, +} + #[cfg(test)] pub(crate) mod test_helpers { use super::*; @@ -881,6 +956,7 @@ pub(crate) mod test_helpers { test_partition(Arc::clone(&catalog)).await; test_tombstone(Arc::clone(&catalog)).await; test_parquet_file(Arc::clone(&catalog)).await; + test_add_parquet_file_with_tombstones(Arc::clone(&catalog)).await; } async fn test_setup(catalog: Arc) { @@ -1275,8 +1351,14 @@ pub(crate) mod test_helpers { let max_time = Timestamp::new(10); let parquet_repo = catalog.parquet_files(); + + // Must have no rows + let row_count = parquet_repo.count().await.unwrap(); + assert_eq!(row_count, 0); + let parquet_file = parquet_repo .create( + None, sequencer.id, partition.table_id, partition.id, @@ -1292,6 +1374,7 @@ pub(crate) mod test_helpers { // verify that trying to create a file with the same UUID throws an error let err = parquet_repo .create( + None, sequencer.id, partition.table_id, partition.id, @@ -1307,6 +1390,7 @@ pub(crate) mod test_helpers { let other_file = parquet_repo .create( + None, sequencer.id, other_partition.table_id, other_partition.id, @@ -1319,6 +1403,17 @@ pub(crate) mod test_helpers { .await .unwrap(); + // Must have 2 rows + let row_count = parquet_repo.count().await.unwrap(); + assert_eq!(row_count, 2); + + let exist_id = parquet_file.id; + let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); + // make sure exists_id != non_exist_id + assert_ne!(exist_id, non_exist_id); + assert!(parquet_repo.exist(exist_id).await.unwrap()); + assert!(!parquet_repo.exist(non_exist_id).await.unwrap()); + let files = parquet_repo .list_by_sequencer_greater_than(sequencer.id, SequenceNumber::new(1)) .await @@ -1339,4 +1434,198 @@ pub(crate) mod test_helpers { .unwrap(); assert!(files.first().unwrap().to_delete); } + + async fn test_add_parquet_file_with_tombstones(catalog: Arc) { + let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); + let namespace = catalog + .namespaces() + .create( + "namespace_parquet_file_with_tombstones_test", + "inf", + kafka.id, + pool.id, + ) + .await + .unwrap(); + let table = catalog + .tables() + .create_or_get("test_table", namespace.id) + .await + .unwrap(); + let sequencer = catalog + .sequencers() + .create_or_get(&kafka, KafkaPartition::new(1)) + .await + .unwrap(); + let partition = catalog + .partitions() + .create_or_get("one", sequencer.id, table.id) + .await + .unwrap(); + + // Add tombstones + let min_time = Timestamp::new(1); + let max_time = Timestamp::new(10); + let t1 = catalog + .tombstones() + .create_or_get( + table.id, + sequencer.id, + SequenceNumber::new(1), + min_time, + max_time, + "whatevs", + ) + .await + .unwrap(); + let t2 = catalog + .tombstones() + .create_or_get( + table.id, + sequencer.id, + SequenceNumber::new(2), + min_time, + max_time, + "bleh", + ) + .await + .unwrap(); + let t3 = catalog + .tombstones() + .create_or_get( + table.id, + sequencer.id, + SequenceNumber::new(3), + min_time, + max_time, + "meh", + ) + .await + .unwrap(); + + // Prepare metadata in form of ParquetFile to get added with tombstone + let parquet = ParquetFile { + id: ParquetFileId::new(0), //fake id that will never be used + sequencer_id: sequencer.id, + table_id: table.id, + 
partition_id: partition.id, + object_store_id: Uuid::new_v4(), + min_sequence_number: SequenceNumber::new(4), + max_sequence_number: SequenceNumber::new(10), + min_time, + max_time, + to_delete: false, + }; + let other_parquet = ParquetFile { + id: ParquetFileId::new(0), //fake id that will never be used + sequencer_id: sequencer.id, + table_id: table.id, + partition_id: partition.id, + object_store_id: Uuid::new_v4(), + min_sequence_number: SequenceNumber::new(11), + max_sequence_number: SequenceNumber::new(20), + min_time, + max_time, + to_delete: false, + }; + let another_parquet = ParquetFile { + id: ParquetFileId::new(0), //fake id that will never be used + sequencer_id: sequencer.id, + table_id: table.id, + partition_id: partition.id, + object_store_id: Uuid::new_v4(), + min_sequence_number: SequenceNumber::new(21), + max_sequence_number: SequenceNumber::new(30), + min_time, + max_time, + to_delete: false, + }; + + let parquet_file_count_before = catalog.parquet_files().count().await.unwrap(); + let pt_count_before = catalog.processed_tombstones().count().await.unwrap(); + + // Add parquet and processed tombstone in one transaction + let (parquet_file, p_tombstones) = catalog + .add_parquet_file_with_tombstones(&parquet, &[t1.clone(), t2.clone()]) + .await + .unwrap(); + assert_eq!(p_tombstones.len(), 2); + assert_eq!(t1.id, p_tombstones[0].tombstone_id); + assert_eq!(t2.id, p_tombstones[1].tombstone_id); + + // verify the catalog + let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); + let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); + assert_eq!(pt_count_after - pt_count_before, 2); + assert_eq!(parquet_file_count_after - parquet_file_count_before, 1); + let pt_count_before = pt_count_after; + let parquet_file_count_before = parquet_file_count_after; + + assert!(catalog + .parquet_files() + .exist(parquet_file.id) + .await + .unwrap()); + assert!(catalog + .processed_tombstones() + .exist(parquet_file.id, t1.id) + .await + .unwrap()); + assert!(catalog + .processed_tombstones() + .exist(parquet_file.id, t1.id) + .await + .unwrap()); + + // Error due to duplicate parquet file + catalog + .add_parquet_file_with_tombstones(&parquet, &[t3.clone(), t1.clone()]) + .await + .unwrap_err(); + // Since the transaction is rollback, t3 is not yet added + assert!(!catalog + .processed_tombstones() + .exist(parquet_file.id, t3.id) + .await + .unwrap()); + + // Add new parquet and new tombstone. 
Should go trhough + let (parquet_file, p_tombstones) = catalog + .add_parquet_file_with_tombstones(&other_parquet, &[t3.clone()]) + .await + .unwrap(); + assert_eq!(p_tombstones.len(), 1); + assert_eq!(t3.id, p_tombstones[0].tombstone_id); + assert!(catalog + .processed_tombstones() + .exist(parquet_file.id, t3.id) + .await + .unwrap()); + assert!(catalog + .parquet_files() + .exist(parquet_file.id) + .await + .unwrap()); + + let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); + let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); + assert_eq!(pt_count_after - pt_count_before, 1); + assert_eq!(parquet_file_count_after - parquet_file_count_before, 1); + let pt_count_before = pt_count_after; + let parquet_file_count_before = parquet_file_count_after; + + // Add non-exist tombstone t4 and should fail + let mut t4 = t3.clone(); + t4.id = TombstoneId::new(t4.id.get() + 10); + catalog + .add_parquet_file_with_tombstones(&another_parquet, &[t4]) + .await + .unwrap_err(); + // Still same count as before + let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); + let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); + assert_eq!(pt_count_after - pt_count_before, 0); + assert_eq!(parquet_file_count_after - parquet_file_count_before, 0); + } } diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs index 9c9f75b2e1..94ea5602c6 100644 --- a/iox_catalog/src/mem.rs +++ b/iox_catalog/src/mem.rs @@ -4,11 +4,13 @@ use crate::interface::{ Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, - ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionRepo, QueryPool, QueryPoolId, - QueryPoolRepo, Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, - TableRepo, Timestamp, Tombstone, TombstoneId, TombstoneRepo, + ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionRepo, ProcessedTombstone, + ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, SequenceNumber, + Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, Tombstone, + TombstoneId, TombstoneRepo, }; use async_trait::async_trait; +use sqlx::{Postgres, Transaction}; use std::convert::TryFrom; use std::fmt::Formatter; use std::sync::Mutex; @@ -26,6 +28,15 @@ impl MemCatalog { pub fn new() -> Self { Self::default() } + + // Since this is test catalog that do not handle transaction + // this is a help function to fake `rollback` work + fn remove_parquet_file(&self, object_store_id: Uuid) { + let mut collections = self.collections.lock().expect("mutex poisoned"); + collections + .parquet_files + .retain(|f| f.object_store_id != object_store_id); + } } impl std::fmt::Debug for MemCatalog { @@ -46,6 +57,7 @@ struct MemCollections { partitions: Vec, tombstones: Vec, parquet_files: Vec, + processed_tombstones: Vec, } #[async_trait] @@ -90,6 +102,50 @@ impl Catalog for MemCatalog { fn parquet_files(&self) -> &dyn ParquetFileRepo { self } + + fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo { + self + } + + async fn add_parquet_file_with_tombstones( + &self, + parquet_file: &ParquetFile, + tombstones: &[Tombstone], + ) -> Result<(ParquetFile, Vec), Error> { + // The activities in this file must either be all succeed or all fail + + // Create a parquet file in the catalog first + let parquet = self + .parquet_files() + .create( + None, + 
parquet_file.sequencer_id, + parquet_file.table_id, + parquet_file.partition_id, + parquet_file.object_store_id, + parquet_file.min_sequence_number, + parquet_file.max_sequence_number, + parquet_file.min_time, + parquet_file.max_time, + ) + .await?; + + // Now the parquet available, let create its dependent processed tombstones + let processed_tombstones = self + .processed_tombstones() + .create_many(None, parquet.id, tombstones) + .await; + + if let Err(error) = processed_tombstones { + // failed to insert processed tombstone, remove the above + // inserted parquet file from the catalog + self.remove_parquet_file(parquet.object_store_id); + return Err(error); + } + let processed_tombstones = processed_tombstones.unwrap(); + + Ok((parquet, processed_tombstones)) + } } #[async_trait] @@ -438,6 +494,7 @@ impl TombstoneRepo for MemCatalog { impl ParquetFileRepo for MemCatalog { async fn create( &self, + _txt: Option<&mut Transaction<'_, Postgres>>, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -497,6 +554,104 @@ impl ParquetFileRepo for MemCatalog { .collect(); Ok(files) } + + async fn exist(&self, id: ParquetFileId) -> Result { + let collections = self.collections.lock().expect("mutex poisoned"); + Ok(collections.parquet_files.iter().any(|f| f.id == id)) + } + + async fn count(&self) -> Result { + let collections = self.collections.lock().expect("mutex poisoned"); + let count = collections.parquet_files.len(); + let count_i64 = i64::try_from(count); + if count_i64.is_err() { + return Err(Error::InvalidValue { value: count }); + } + Ok(count_i64.unwrap()) + } +} + +#[async_trait] +impl ProcessedTombstoneRepo for MemCatalog { + async fn create_many( + &self, + _txt: Option<&mut Transaction<'_, Postgres>>, + parquet_file_id: ParquetFileId, + tombstones: &[Tombstone], + ) -> Result> { + let mut collections = self.collections.lock().expect("mutex poisoned"); + + // check if the parquet file available + if !collections + .parquet_files + .iter() + .any(|f| f.id == parquet_file_id) + { + return Err(Error::FileNotFound { + id: parquet_file_id.get(), + }); + } + + let mut processed_tombstones = vec![]; + for tombstone in tombstones { + // check if tomstone exists + if !collections.tombstones.iter().any(|f| f.id == tombstone.id) { + return Err(Error::TombstoneNotFound { + id: tombstone.id.get(), + }); + } + + if collections + .processed_tombstones + .iter() + .any(|pt| pt.tombstone_id == tombstone.id && pt.parquet_file_id == parquet_file_id) + { + // The tombstone was already proccessed for this file + return Err(Error::ProcessTombstoneExists { + parquet_file_id: parquet_file_id.get(), + tombstone_id: tombstone.id.get(), + }); + } + + let processed_tombstone = ProcessedTombstone { + tombstone_id: tombstone.id, + parquet_file_id, + }; + processed_tombstones.push(processed_tombstone); + } + + // save for returning + let return_processed_tombstones = processed_tombstones.clone(); + + // Add to the catalog + collections + .processed_tombstones + .append(&mut processed_tombstones); + + Ok(return_processed_tombstones) + } + + async fn exist( + &self, + parquet_file_id: ParquetFileId, + tombstone_id: TombstoneId, + ) -> Result { + let collections = self.collections.lock().expect("mutex poisoned"); + Ok(collections + .processed_tombstones + .iter() + .any(|f| f.parquet_file_id == parquet_file_id && f.tombstone_id == tombstone_id)) + } + + async fn count(&self) -> Result { + let collections = self.collections.lock().expect("mutex poisoned"); + let count = 
collections.processed_tombstones.len(); + let count_i64 = i64::try_from(count); + if count_i64.is_err() { + return Err(Error::InvalidValue { value: count }); + } + Ok(count_i64.unwrap()) + } } #[cfg(test)] diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 862f4addac..990e7d604e 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -3,13 +3,14 @@ use crate::interface::{ Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, ParquetFileId, - ParquetFileRepo, Partition, PartitionId, PartitionRepo, QueryPool, QueryPoolId, QueryPoolRepo, - Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, - Timestamp, Tombstone, TombstoneRepo, + ParquetFileRepo, Partition, PartitionId, PartitionRepo, ProcessedTombstone, + ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, SequenceNumber, + Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, Tombstone, + TombstoneId, TombstoneRepo, }; use async_trait::async_trait; -use observability_deps::tracing::info; -use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres}; +use observability_deps::tracing::{info, warn}; +use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Transaction}; use std::time::Duration; use uuid::Uuid; @@ -27,6 +28,12 @@ pub struct PostgresCatalog { pool: Pool, } +// struct to get return value from "select count(*) ..." wuery" +#[derive(sqlx::FromRow)] +struct Count { + count: i64, +} + impl PostgresCatalog { /// Connect to the catalog store. pub async fn connect( @@ -109,6 +116,71 @@ impl Catalog for PostgresCatalog { fn parquet_files(&self) -> &dyn ParquetFileRepo { self } + + fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo { + self + } + + async fn add_parquet_file_with_tombstones( + &self, + parquet_file: &ParquetFile, + tombstones: &[Tombstone], + ) -> Result<(ParquetFile, Vec), Error> { + // Start a transaction + let txt = self.pool.begin().await; + if let Err(error) = txt { + return Err(Error::StartTransaction { source: error }); + } + let mut txt = txt.unwrap(); + + // create a parquet file in the catalog first + let parquet = self + .parquet_files() + .create( + Some(&mut txt), + parquet_file.sequencer_id, + parquet_file.table_id, + parquet_file.partition_id, + parquet_file.object_store_id, + parquet_file.min_sequence_number, + parquet_file.max_sequence_number, + parquet_file.min_time, + parquet_file.max_time, + ) + .await; + + if let Err(error) = parquet { + // Error while adding parquet file into the catalog, stop the transaction + warn!(object_store_id=?parquet_file.object_store_id.to_string(), "{}", error.to_string()); + let _rollback = txt.rollback().await; + return Err(error); + } + let parquet = parquet.unwrap(); + + // Now the parquet available, create its processed tombstones + let processed_tombstones = self + .processed_tombstones() + .create_many(Some(&mut txt), parquet.id, tombstones) + .await; + + let processed_tombstones = match processed_tombstones { + Ok(processed_tombstones) => processed_tombstones, + Err(e) => { + // Error while adding processed tombstones + warn!( + "Error while adding processed tombstone: {}. 
Transaction stops.", + e.to_string() + ); + let _rollback = txt.rollback().await; + return Err(e); + } + }; + + // Commit the transaction + let _commit = txt.commit().await; + + Ok((parquet, processed_tombstones)) + } } #[async_trait] @@ -495,6 +567,7 @@ impl TombstoneRepo for PostgresCatalog { impl ParquetFileRepo for PostgresCatalog { async fn create( &self, + txt: Option<&mut Transaction<'_, Postgres>>, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -518,20 +591,22 @@ RETURNING * .bind(min_sequence_number) // $5 .bind(max_sequence_number) // $6 .bind(min_time) // $7 - .bind(max_time) // $8 - .fetch_one(&self.pool) - .await - .map_err(|e| { - if is_unique_violation(&e) { - Error::FileExists { - object_store_id, - } - } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } - } else { - Error::SqlxError { source: e } - } - })?; + .bind(max_time); // $8 + + let rec = match txt { + Some(txt) => rec.fetch_one(txt).await, + None => rec.fetch_one(&self.pool).await, + }; + + let rec = rec.map_err(|e| { + if is_unique_violation(&e) { + Error::FileExists { object_store_id } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; Ok(rec) } @@ -558,6 +633,104 @@ RETURNING * .await .map_err(|e| Error::SqlxError { source: e }) } + + async fn exist(&self, id: ParquetFileId) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#"SELECT count(*) as count FROM parquet_file WHERE id = $1;"#, + ) + .bind(&id) // $1 + .fetch_one(&self.pool) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count > 0) + } + + async fn count(&self) -> Result { + let read_result = + sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM parquet_file;"#) + .fetch_one(&self.pool) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } +} + +#[async_trait] +impl ProcessedTombstoneRepo for PostgresCatalog { + async fn create_many( + &self, + txt: Option<&mut Transaction<'_, Postgres>>, + parquet_file_id: ParquetFileId, + tombstones: &[Tombstone], + ) -> Result> { + if txt.is_none() { + return Err(Error::NoTransaction); + } + let txt = txt.unwrap(); + + // no transaction provided + // todo: we should never needs this but since right now we implement 2 catalogs, + // postgres (for production) and mem (for testing only) that does not need to provide txt + // this will be refactor when Marco has his new abstraction done + let mut processed_tombstones = vec![]; + for tombstone in tombstones { + let processed_tombstone = sqlx::query_as::<_, ProcessedTombstone>( + r#" + INSERT INTO processed_tombstone ( tombstone_id, parquet_file_id ) + VALUES ( $1, $2 ) + RETURNING * + "#, + ) + .bind(tombstone.id) // $1 + .bind(parquet_file_id) // $2 + .fetch_one(&mut *txt) + .await + .map_err(|e| { + if is_unique_violation(&e) { + Error::ProcessTombstoneExists { + tombstone_id: tombstone.id.get(), + parquet_file_id: parquet_file_id.get(), + } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + processed_tombstones.push(processed_tombstone); + } + + Ok(processed_tombstones) + } + + async fn exist( + &self, + parquet_file_id: ParquetFileId, + tombstone_id: TombstoneId, + ) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#"SELECT count(*) as count FROM processed_tombstone WHERE parquet_file_id = $1 AND tombstone_id = $2;"#) + .bind(&parquet_file_id) // $1 + 
.bind(&tombstone_id) // $2 + .fetch_one(&self.pool) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count > 0) + } + + async fn count(&self) -> Result { + let read_result = + sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM processed_tombstone;"#) + .fetch_one(&self.pool) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } } /// The error code returned by Postgres for a unique constraint violation. @@ -659,6 +832,10 @@ mod tests { } async fn clear_schema(pool: &Pool) { + sqlx::query("delete from processed_tombstone;") + .execute(pool) + .await + .unwrap(); sqlx::query("delete from tombstone;") .execute(pool) .await diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index fd89c55d37..02f3d22863 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -92,7 +92,9 @@ use data_types::{ }; use generated_types::influxdata::iox::ingest::v1 as proto; use generated_types::influxdata::iox::preserved_catalog::v1 as preserved_catalog; -use iox_catalog::interface::{NamespaceId, PartitionId, SequenceNumber, SequencerId, TableId}; +use iox_catalog::interface::{ + NamespaceId, ParquetFile, ParquetFileId, PartitionId, SequenceNumber, SequencerId, TableId, +}; use parquet::{ arrow::parquet_to_arrow_schema, file::{ @@ -589,6 +591,26 @@ impl IoxMetadata { pub fn match_object_store_id(&self, uuid: Uuid) -> bool { uuid == self.object_store_id } + + // create a corresponding iox catalog's ParquetFile + pub fn to_parquet_file(&self) -> ParquetFile { + ParquetFile { + id: ParquetFileId::new(0), // this will be created in the DB. This 0 won't be used anywhere + sequencer_id: self.sequencer_id, + table_id: self.table_id, + partition_id: self.partition_id, + object_store_id: self.object_store_id, + min_sequence_number: self.min_sequence_number, + max_sequence_number: self.max_sequence_number, + min_time: iox_catalog::interface::Timestamp::new( + self.time_of_first_write.timestamp_nanos(), + ), + max_time: iox_catalog::interface::Timestamp::new( + self.time_of_last_write.timestamp_nanos(), + ), + to_delete: false, + } + } } /// Parse big-endian UUID from protobuf. 
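
A minimal sketch (not part of the patch above) of how the two additions in this commit are meant to compose: `IoxMetadata::to_parquet_file` builds the catalog row for a persisted file, and `Catalog::add_parquet_file_with_tombstones` inserts that row together with its processed tombstones in one transaction. The `persist_to_catalog` helper and its signature are assumptions made for illustration; only the two called methods come from this commit.

    use iox_catalog::interface::{Catalog, Error, ProcessedTombstone, Tombstone};
    use parquet_file::metadata::IoxMetadata;

    /// Hypothetical caller: record one persisted parquet file and mark the
    /// tombstones that were applied to it as processed.
    async fn persist_to_catalog(
        catalog: &dyn Catalog,
        metadata: &IoxMetadata,
        tombstones: &[Tombstone],
    ) -> Result<Vec<ProcessedTombstone>, Error> {
        // Build the catalog row from the file's IOx metadata; the placeholder
        // id of 0 is replaced by the catalog when the row is inserted.
        let parquet = metadata.to_parquet_file();

        // For Postgres this runs inside a single transaction and rolls back on
        // failure; the in-memory catalog emulates the rollback by removing the
        // parquet file again if inserting the processed tombstones fails.
        let (_file, processed) = catalog
            .add_parquet_file_with_tombstones(&parquet, tombstones)
            .await?;

        Ok(processed)
    }
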
From 2e30483f1f852f881fc5b0ad1c8ceb0486a61e22 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Mon, 7 Feb 2022 09:54:07 -0500 Subject: [PATCH 08/30] refactor: Remove predicate module from predicate crate (#3648) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/access.rs | 4 +- db/src/chunk.rs | 2 +- db/src/lib.rs | 2 +- db/src/pred.rs | 4 +- .../server_type/database/rpc/storage/expr.rs | 4 +- .../database/rpc/storage/service.rs | 4 +- ingester/src/query.rs | 5 +- parquet_file/src/chunk.rs | 2 +- parquet_file/src/storage.rs | 2 +- predicate/src/delete_predicate.rs | 2 +- predicate/src/lib.rs | 643 ++++++++++++++++- predicate/src/predicate.rs | 648 ------------------ predicate/src/rpc_predicate.rs | 3 +- query/src/frontend/influxrpc.rs | 4 +- query/src/lib.rs | 5 +- query/src/provider.rs | 2 +- query/src/provider/physical.rs | 2 +- query/src/pruning.rs | 4 +- query_tests/src/influxrpc/field_columns.rs | 2 +- query_tests/src/influxrpc/read_filter.rs | 2 +- query_tests/src/influxrpc/read_group.rs | 2 +- .../src/influxrpc/read_window_aggregate.rs | 2 +- query_tests/src/influxrpc/table_names.rs | 2 +- query_tests/src/influxrpc/tag_keys.rs | 2 +- query_tests/src/influxrpc/tag_values.rs | 2 +- query_tests/src/pruning.rs | 2 +- server_benchmarks/benches/read_filter.rs | 2 +- server_benchmarks/benches/read_group.rs | 2 +- server_benchmarks/benches/tag_values.rs | 2 +- 29 files changed, 675 insertions(+), 689 deletions(-) delete mode 100644 predicate/src/predicate.rs diff --git a/db/src/access.rs b/db/src/access.rs index 3e05728d62..b3d9519a22 100644 --- a/db/src/access.rs +++ b/db/src/access.rs @@ -14,7 +14,7 @@ use job_registry::JobRegistry; use metric::{Attributes, DurationCounter, Metric, U64Counter}; use observability_deps::tracing::debug; use parking_lot::Mutex; -use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta}; +use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; use query::{ provider::{ChunkPruner, ProviderBuilder}, pruning::{prune_chunks, PruningObserver}, @@ -398,7 +398,7 @@ mod tests { use super::*; use crate::test_helpers::write_lp; use crate::utils::make_db; - use predicate::predicate::PredicateBuilder; + use predicate::PredicateBuilder; #[tokio::test] async fn test_filtered_chunks() { diff --git a/db/src/chunk.rs b/db/src/chunk.rs index 8ab333ec7e..e1c8e7824c 100644 --- a/db/src/chunk.rs +++ b/db/src/chunk.rs @@ -15,7 +15,7 @@ use mutable_buffer::snapshot::ChunkSnapshot; use observability_deps::tracing::debug; use parquet_file::chunk::ParquetChunk; use partition_metadata::TableSummary; -use predicate::predicate::{Predicate, PredicateMatch}; +use predicate::{Predicate, PredicateMatch}; use query::{exec::stringset::StringSet, QueryChunk, QueryChunkMeta}; use read_buffer::RBChunk; use schema::InfluxColumnType; diff --git a/db/src/lib.rs b/db/src/lib.rs index 018d028156..15f82d5899 100644 --- a/db/src/lib.rs +++ b/db/src/lib.rs @@ -42,7 +42,7 @@ use parquet_catalog::{ prune::prune_history as prune_catalog_transaction_history, }; use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::PersistenceWindows}; -use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta}; +use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; use query::{ exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext}, QueryCompletedToken, QueryDatabase, diff --git a/db/src/pred.rs b/db/src/pred.rs index 0b8f456a2f..2af8dd2820 
100644 --- a/db/src/pred.rs +++ b/db/src/pred.rs @@ -3,7 +3,7 @@ use std::convert::TryFrom; -use predicate::predicate::Predicate; +use predicate::Predicate; use snafu::Snafu; #[derive(Debug, Snafu)] @@ -55,7 +55,7 @@ pub mod test { use datafusion::logical_plan::{col, lit, Expr}; use datafusion::scalar::ScalarValue; - use predicate::predicate::PredicateBuilder; + use predicate::PredicateBuilder; use read_buffer::BinaryExpr as RBBinaryExpr; use read_buffer::Predicate as RBPredicate; diff --git a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/expr.rs b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/expr.rs index a05fe82ebd..e311d88b09 100644 --- a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/expr.rs +++ b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/expr.rs @@ -26,9 +26,9 @@ use super::{TAG_KEY_FIELD, TAG_KEY_MEASUREMENT}; use observability_deps::tracing::warn; use predicate::rpc_predicate::InfluxRpcPredicate; use predicate::{ - predicate::PredicateBuilder, regex::regex_match_expr, rpc_predicate::{FIELD_COLUMN_NAME, MEASUREMENT_COLUMN_NAME}, + PredicateBuilder, }; use query::group_by::{Aggregate as QueryAggregate, WindowDuration}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -867,7 +867,7 @@ fn format_comparison(v: i32, f: &mut fmt::Formatter<'_>) -> fmt::Result { #[cfg(test)] mod tests { use generated_types::node::Type as RPCNodeType; - use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta}; + use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; use std::{collections::BTreeSet, sync::Arc}; use super::*; diff --git a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs index 7aa9fb5cc1..b1faba9b0a 100644 --- a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs +++ b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs @@ -1351,7 +1351,7 @@ mod tests { Client as StorageClient, OrgAndBucket, }; use panic_logging::SendPanicsToTracing; - use predicate::predicate::{PredicateBuilder, PredicateMatch}; + use predicate::{PredicateBuilder, PredicateMatch}; use query::{ exec::Executor, test::{TestChunk, TestDatabase, TestError}, @@ -2971,7 +2971,7 @@ mod tests { db_name: &str, partition_key: &str, chunk_id: u128, - expected_predicate: &predicate::predicate::Predicate, + expected_predicate: &predicate::Predicate, ) { let actual_predicates = self .test_storage diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 0c783669f6..f63393cb22 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -15,10 +15,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, }; use iox_catalog::interface::{SequenceNumber, Tombstone}; -use predicate::{ - delete_predicate::parse_delete_predicate, - predicate::{Predicate, PredicateMatch}, -}; +use predicate::{delete_predicate::parse_delete_predicate, Predicate, PredicateMatch}; use query::{exec::stringset::StringSet, QueryChunk, QueryChunkMeta}; use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema}; use snafu::{ResultExt, Snafu}; diff --git a/parquet_file/src/chunk.rs b/parquet_file/src/chunk.rs index 6b8c149444..0fe2f328d4 100644 --- a/parquet_file/src/chunk.rs +++ b/parquet_file/src/chunk.rs @@ -5,7 +5,7 @@ use data_types::{ }; use datafusion::physical_plan::SendableRecordBatchStream; use iox_object_store::{IoxObjectStore, ParquetFilePath}; -use 
predicate::predicate::Predicate; +use predicate::Predicate; use schema::selection::Selection; use schema::{Schema, TIME_COLUMN_NAME}; use snafu::{ResultExt, Snafu}; diff --git a/parquet_file/src/storage.rs b/parquet_file/src/storage.rs index 1d231892a6..90600d6809 100644 --- a/parquet_file/src/storage.rs +++ b/parquet_file/src/storage.rs @@ -22,7 +22,7 @@ use parquet::{ basic::Compression, file::{metadata::KeyValue, properties::WriterProperties, writer::TryClone}, }; -use predicate::predicate::Predicate; +use predicate::Predicate; use schema::selection::Selection; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ diff --git a/predicate/src/delete_predicate.rs b/predicate/src/delete_predicate.rs index cedaddf34a..95a6114ff5 100644 --- a/predicate/src/delete_predicate.rs +++ b/predicate/src/delete_predicate.rs @@ -68,7 +68,7 @@ pub enum Error { /// Result type for Parser Cient pub type Result = std::result::Result; -impl From for crate::predicate::Predicate { +impl From for crate::Predicate { fn from(pred: DeletePredicate) -> Self { Self { field_columns: None, diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index da8da2042e..4b6728d1a3 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -10,7 +10,648 @@ pub mod delete_expr; pub mod delete_predicate; -pub mod predicate; pub mod regex; pub mod rewrite; pub mod rpc_predicate; + +use data_types::timestamp::{TimestampRange, MAX_NANO_TIME, MIN_NANO_TIME}; +use datafusion::{ + error::DataFusionError, + logical_plan::{col, lit_timestamp_nano, Column, Expr, Operator}, + optimizer::utils, +}; +use datafusion_util::{make_range_expr, AndExprBuilder}; +use observability_deps::tracing::debug; +use schema::TIME_COLUMN_NAME; +use std::{ + collections::{BTreeSet, HashSet}, + fmt, +}; + +/// This `Predicate` represents the empty predicate (aka that evaluates to true for all rows). +pub const EMPTY_PREDICATE: Predicate = Predicate { + field_columns: None, + exprs: vec![], + range: None, + partition_key: None, + value_expr: vec![], +}; + +/// A unified Predicate structure for IOx queries that can select and filter Fields and Tags from +/// the InfluxDB data mode, as well as for arbitrary other predicates that are expressed by +/// DataFusion's [`Expr`] type. +/// +/// Note that the InfluxDB data model (e.g. ParsedLine's) distinguishes between some types of +/// columns (tags and fields), and likewise the semantics of this structure can express some types +/// of restrictions that only apply to certain types of columns. +#[derive(Clone, Debug, Default, PartialEq, PartialOrd)] +pub struct Predicate { + /// Optional field restriction. If present, restricts the results to only + /// tables which have *at least one* of the fields in field_columns. + pub field_columns: Option>, + + /// Optional partition key filter + pub partition_key: Option, + + /// Optional timestamp range: only rows within this range are included in + /// results. Other rows are excluded + pub range: Option, + + /// Optional arbitrary predicates, represented as list of + /// DataFusion expressions applied a logical conjunction (aka they + /// are 'AND'ed together). Only rows that evaluate to TRUE for all + /// these expressions should be returned. Other rows are excluded + /// from the results. + pub exprs: Vec, + + /// Optional arbitrary predicates on the special `_value` column. These + /// expressions are applied to `field_columns` projections in the form of + /// `CASE` statement conditions. 
+ pub value_expr: Vec, +} + +impl Predicate { + /// Return true if this predicate has any general purpose predicates + pub fn has_exprs(&self) -> bool { + !self.exprs.is_empty() + } + + /// Return a DataFusion `Expr` predicate representing the + /// combination of all predicate (`exprs`) and timestamp + /// restriction in this Predicate. Returns None if there are no + /// `Expr`'s restricting the data + pub fn filter_expr(&self) -> Option { + let mut builder = + AndExprBuilder::default().append_opt(self.make_timestamp_predicate_expr()); + + for expr in &self.exprs { + builder = builder.append_expr(expr.clone()); + } + + builder.build() + } + + /// Return true if the field should be included in results + pub fn should_include_field(&self, field_name: &str) -> bool { + match &self.field_columns { + None => true, // No field restriction on predicate + Some(field_names) => field_names.contains(field_name), + } + } + + /// Creates a DataFusion predicate for appliying a timestamp range: + /// + /// `range.start <= time and time < range.end` + fn make_timestamp_predicate_expr(&self) -> Option { + self.range + .map(|range| make_range_expr(range.start(), range.end(), TIME_COLUMN_NAME)) + } + + /// Returns true if ths predicate evaluates to true for all rows + pub fn is_empty(&self) -> bool { + self == &EMPTY_PREDICATE + } + + /// Return a negated DF logical expression for the given delete predicates + pub fn negated_expr(delete_predicates: &[S]) -> Option + where + S: AsRef, + { + if delete_predicates.is_empty() { + return None; + } + + let mut pred = PredicateBuilder::default().build(); + pred.merge_delete_predicates(delete_predicates); + + // Make a conjunctive expression of the pred.exprs + let mut val = None; + for e in pred.exprs { + match val { + None => val = Some(e), + Some(expr) => val = Some(expr.and(e)), + } + } + + val + } + + /// Merge the given delete predicates into this select predicate. + /// Since we want to eliminate data filtered by the delete predicates, + /// they are first converted into their negated form: NOT(delete_predicate) + /// then added/merged into the selection one + pub fn merge_delete_predicates(&mut self, delete_predicates: &[S]) + where + S: AsRef, + { + // Create a list of disjunctive negated expressions. + // Example: there are two deletes as follows (note that time_range is stored separated in the Predicate + // but we need to put it together with the exprs here) + // . Delete_1: WHERE city != "Boston" AND temp = 70 AND time_range in [10, 30) + // . 
Delete 2: WHERE state = "NY" AND route != "I90" AND time_range in [20, 50) + // The negated list will be "NOT(Delete_1)", NOT(Delete_2)" which means + // NOT(city != "Boston" AND temp = 70 AND time_range in [10, 30]), NOT(state = "NY" AND route != "I90" AND time_range in [20, 50]) which means + // [NOT(city = Boston") OR NOT(temp = 70) OR NOT(time_range in [10, 30])], [NOT(state = "NY") OR NOT(route != "I90") OR NOT(time_range in [20, 50])] + // Note that the "NOT(time_range in [20, 50])]" or "NOT(20 <= time <= 50)"" is replaced with "time < 20 OR time > 50" + + for pred in delete_predicates { + let pred = pred.as_ref(); + + let mut expr: Option = None; + + // Time range + if let Some(range) = pred.range { + // time_expr = NOT(start <= time_range <= end) + // Equivalent to: (time < start OR time > end) + let time_expr = col(TIME_COLUMN_NAME) + .lt(lit_timestamp_nano(range.start())) + .or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end()))); + + match expr { + None => expr = Some(time_expr), + Some(e) => expr = Some(e.or(time_expr)), + } + } + + // Exprs + for exp in &pred.exprs { + match expr { + None => expr = Some(exp.clone().not()), + Some(e) => expr = Some(e.or(exp.clone().not())), + } + } + + // Push the negated expression of the delete predicate into the list exprs of the select predicate + if let Some(e) = expr { + self.exprs.push(e); + } + } + } + + /// Removes the timestamp range from this predicate, if the range + /// is for the entire min/max valid range. + /// + /// This is used in certain cases to retain compatibility with the + /// existing storage engine + pub(crate) fn clear_timestamp_if_max_range(mut self) -> Self { + self.range = self.range.take().and_then(|range| { + if range.start() <= MIN_NANO_TIME && range.end() >= MAX_NANO_TIME { + None + } else { + Some(range) + } + }); + + self + } +} + +impl fmt::Display for Predicate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn iter_to_str(s: impl IntoIterator) -> String + where + S: ToString, + { + s.into_iter() + .map(|v| v.to_string()) + .collect::>() + .join(", ") + } + + write!(f, "Predicate")?; + + if let Some(field_columns) = &self.field_columns { + write!(f, " field_columns: {{{}}}", iter_to_str(field_columns))?; + } + + if let Some(partition_key) = &self.partition_key { + write!(f, " partition_key: '{}'", partition_key)?; + } + + if let Some(range) = &self.range { + // TODO: could be nice to show this as actual timestamps (not just numbers)? 
+ write!(f, " range: [{} - {}]", range.start(), range.end())?; + } + + if !self.exprs.is_empty() { + write!(f, " exprs: [")?; + for (i, expr) in self.exprs.iter().enumerate() { + write!(f, "{}", expr)?; + if i < self.exprs.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "]")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, Copy)] +/// The result of evaluating a predicate on a set of rows +pub enum PredicateMatch { + /// There is at least one row that matches the predicate that has + /// at least one non null value in each field of the predicate + AtLeastOneNonNullField, + + /// There are exactly zero rows that match the predicate + Zero, + + /// There *may* be rows that match, OR there *may* be no rows that + /// match + Unknown, +} + +/// Structure for building [`Predicate`]s +/// +/// Example: +/// ``` +/// use predicate::PredicateBuilder; +/// use datafusion::logical_plan::{col, lit}; +/// +/// let p = PredicateBuilder::new() +/// .timestamp_range(1, 100) +/// .add_expr(col("foo").eq(lit(42))) +/// .build(); +/// +/// assert_eq!( +/// p.to_string(), +/// "Predicate range: [1 - 100] exprs: [#foo = Int32(42)]" +/// ); +/// ``` +#[derive(Debug, Default)] +pub struct PredicateBuilder { + inner: Predicate, +} + +impl From for PredicateBuilder { + fn from(inner: Predicate) -> Self { + Self { inner } + } +} + +impl PredicateBuilder { + pub fn new() -> Self { + Self::default() + } + + /// Sets the timestamp range + pub fn timestamp_range(mut self, start: i64, end: i64) -> Self { + // Without more thought, redefining the timestamp range would + // lose the old range. Asser that that cannot happen. + assert!( + self.inner.range.is_none(), + "Unexpected re-definition of timestamp range" + ); + + self.inner.range = Some(TimestampRange::new(start, end)); + self + } + + /// sets the optional timestamp range, if any + pub fn timestamp_range_option(mut self, range: Option) -> Self { + // Without more thought, redefining the timestamp range would + // lose the old range. Asser that that cannot happen. + assert!( + range.is_none() || self.inner.range.is_none(), + "Unexpected re-definition of timestamp range" + ); + self.inner.range = range; + self + } + + /// Adds an expression to the list of general purpose predicates + pub fn add_expr(mut self, expr: Expr) -> Self { + self.inner.exprs.push(expr); + self + } + + /// Builds a regex matching expression from the provided column name and + /// pattern. Values not matching the regex will be filtered out. + pub fn build_regex_match_expr(self, column: &str, pattern: impl Into) -> Self { + self.regex_match_expr(column, pattern, true) + } + + /// Builds a regex "not matching" expression from the provided column name + /// and pattern. Values *matching* the regex will be filtered out. 
+ pub fn build_regex_not_match_expr(self, column: &str, pattern: impl Into) -> Self { + self.regex_match_expr(column, pattern, false) + } + + fn regex_match_expr(mut self, column: &str, pattern: impl Into, matches: bool) -> Self { + let expr = crate::regex::regex_match_expr(col(column), pattern.into(), matches); + self.inner.exprs.push(expr); + self + } + + /// Sets field_column restriction + pub fn field_columns(mut self, columns: Vec>) -> Self { + // We need to distinguish predicates like `column_name In + // (foo, bar)` and `column_name = foo and column_name = bar` in order to handle + // this + if self.inner.field_columns.is_some() { + unimplemented!("Complex/Multi field predicates are not yet supported"); + } + + let column_names = columns + .into_iter() + .map(|s| s.into()) + .collect::>(); + + self.inner.field_columns = Some(column_names); + self + } + + /// Set the partition key restriction + pub fn partition_key(mut self, partition_key: impl Into) -> Self { + assert!( + self.inner.partition_key.is_none(), + "multiple partition key predicates not suported" + ); + self.inner.partition_key = Some(partition_key.into()); + self + } + + /// Create a predicate, consuming this builder + pub fn build(self) -> Predicate { + self.inner + } + + /// Adds only the expressions from `filters` that can be pushed down to + /// execution engines. + pub fn add_pushdown_exprs(mut self, filters: &[Expr]) -> Self { + // For each expression of the filters, recursively split it, if it is is an AND conjunction + // For example, expression (x AND y) will be split into a vector of 2 expressions [x, y] + let mut exprs = vec![]; + filters + .iter() + .for_each(|expr| Self::split_members(expr, &mut exprs)); + + // Only keep single_column and primitive binary expressions + let mut pushdown_exprs: Vec = vec![]; + let exprs_result = exprs + .into_iter() + .try_for_each::<_, Result<_, DataFusionError>>(|expr| { + let mut columns = HashSet::new(); + utils::expr_to_columns(&expr, &mut columns)?; + + if columns.len() == 1 && Self::primitive_binary_expr(&expr) { + pushdown_exprs.push(expr); + } + Ok(()) + }); + + match exprs_result { + Ok(()) => { + // Return the builder with only the pushdownable expressions on it. + self.inner.exprs.append(&mut pushdown_exprs); + } + Err(e) => { + debug!("Error, {}, building push-down predicates for filters: {:#?}. No predicates are pushed down", e, filters); + } + } + + self + } + + /// Recursively split all "AND" expressions into smaller one + /// Example: "A AND B AND C" => [A, B, C] + pub fn split_members(predicate: &Expr, predicates: &mut Vec) { + match predicate { + Expr::BinaryExpr { + right, + op: Operator::And, + left, + } => { + Self::split_members(left, predicates); + Self::split_members(right, predicates); + } + other => predicates.push(other.clone()), + } + } + + /// Return true if the given expression is in a primitive binary in the form: `column op constant` + // and op must be a comparison one + pub fn primitive_binary_expr(expr: &Expr) -> bool { + match expr { + Expr::BinaryExpr { left, op, right } => { + matches!( + (&**left, &**right), + (Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_)) + ) && matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) + } + _ => false, + } + } +} + +// A representation of the `BinaryExpr` variant of a Datafusion expression. 
+#[derive(Clone, Debug, PartialEq, PartialOrd)] +pub struct BinaryExpr { + pub left: Column, + pub op: Operator, + pub right: Expr, +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::logical_plan::{col, lit}; + + #[test] + fn test_default_predicate_is_empty() { + let p = Predicate::default(); + assert!(p.is_empty()); + } + + #[test] + fn test_non_default_predicate_is_not_empty() { + let p = PredicateBuilder::new().timestamp_range(1, 100).build(); + + assert!(!p.is_empty()); + } + + #[test] + fn test_pushdown_predicates() { + let mut filters = vec![]; + + // state = CA + let expr1 = col("state").eq(lit("CA")); + filters.push(expr1); + + // "price > 10" + let expr2 = col("price").gt(lit(10)); + filters.push(expr2); + + // a < 10 AND b >= 50 --> will be split to [a < 10, b >= 50] + let expr3 = col("a").lt(lit(10)).and(col("b").gt_eq(lit(50))); + filters.push(expr3); + + // c != 3 OR d = 8 --> won't be pushed down + let expr4 = col("c").not_eq(lit(3)).or(col("d").eq(lit(8))); + filters.push(expr4); + + // e is null --> won't be pushed down + let expr5 = col("e").is_null(); + filters.push(expr5); + + // f <= 60 + let expr6 = col("f").lt_eq(lit(60)); + filters.push(expr6); + + // g is not null --> won't be pushed down + let expr7 = col("g").is_not_null(); + filters.push(expr7); + + // h + i --> won't be pushed down + let expr8 = col("h") + col("i"); + filters.push(expr8); + + // city = Boston + let expr9 = col("city").eq(lit("Boston")); + filters.push(expr9); + + // city != Braintree + let expr9 = col("city").not_eq(lit("Braintree")); + filters.push(expr9); + + // city != state --> won't be pushed down + let expr10 = col("city").not_eq(col("state")); + filters.push(expr10); + + // city = state --> won't be pushed down + let expr11 = col("city").eq(col("state")); + filters.push(expr11); + + // city_state = city + state --> won't be pushed down + let expr12 = col("city_sate").eq(col("city") + col("state")); + filters.push(expr12); + + // city = city + 5 --> won't be pushed down + let expr13 = col("city").eq(col("city") + lit(5)); + filters.push(expr13); + + // city = city --> won't be pushed down + let expr14 = col("city").eq(col("city")); + filters.push(expr14); + + // city + 5 = city --> won't be pushed down + let expr15 = (col("city") + lit(5)).eq(col("city")); + filters.push(expr15); + + // 5 = city + let expr16 = lit(5).eq(col("city")); + filters.push(expr16); + + println!(" --------------- Filters: {:#?}", filters); + + // Expected pushdown predicates: [state = CA, price > 10, a < 10, b >= 50, f <= 60, city = Boston, city != Braintree, 5 = city] + let predicate = PredicateBuilder::default() + .add_pushdown_exprs(&filters) + .build(); + + println!(" ------------- Predicates: {:#?}", predicate); + assert_eq!(predicate.exprs.len(), 8); + assert_eq!(predicate.exprs[0], col("state").eq(lit("CA"))); + assert_eq!(predicate.exprs[1], col("price").gt(lit(10))); + assert_eq!(predicate.exprs[2], col("a").lt(lit(10))); + assert_eq!(predicate.exprs[3], col("b").gt_eq(lit(50))); + assert_eq!(predicate.exprs[4], col("f").lt_eq(lit(60))); + assert_eq!(predicate.exprs[5], col("city").eq(lit("Boston"))); + assert_eq!(predicate.exprs[6], col("city").not_eq(lit("Braintree"))); + assert_eq!(predicate.exprs[7], lit(5).eq(col("city"))); + } + #[test] + fn predicate_display_ts() { + // TODO make this a doc example? 
+ let p = PredicateBuilder::new().timestamp_range(1, 100).build(); + + assert_eq!(p.to_string(), "Predicate range: [1 - 100]"); + } + + #[test] + fn predicate_display_ts_and_expr() { + let p = PredicateBuilder::new() + .timestamp_range(1, 100) + .add_expr(col("foo").eq(lit(42)).and(col("bar").lt(lit(11)))) + .build(); + + assert_eq!( + p.to_string(), + "Predicate range: [1 - 100] exprs: [#foo = Int32(42) AND #bar < Int32(11)]" + ); + } + + #[test] + fn predicate_display_full() { + let p = PredicateBuilder::new() + .timestamp_range(1, 100) + .add_expr(col("foo").eq(lit(42))) + .field_columns(vec!["f1", "f2"]) + .partition_key("the_key") + .build(); + + assert_eq!(p.to_string(), "Predicate field_columns: {f1, f2} partition_key: 'the_key' range: [1 - 100] exprs: [#foo = Int32(42)]"); + } + + #[test] + fn test_clear_timestamp_if_max_range_out_of_range() { + let p = PredicateBuilder::new() + .timestamp_range(1, 100) + .add_expr(col("foo").eq(lit(42))) + .build(); + + let expected = p.clone(); + + // no rewrite + assert_eq!(p.clear_timestamp_if_max_range(), expected); + } + + #[test] + fn test_clear_timestamp_if_max_range_out_of_range_low() { + let p = PredicateBuilder::new() + .timestamp_range(MIN_NANO_TIME, 100) + .add_expr(col("foo").eq(lit(42))) + .build(); + + let expected = p.clone(); + + // no rewrite + assert_eq!(p.clear_timestamp_if_max_range(), expected); + } + + #[test] + fn test_clear_timestamp_if_max_range_out_of_range_high() { + let p = PredicateBuilder::new() + .timestamp_range(0, MAX_NANO_TIME) + .add_expr(col("foo").eq(lit(42))) + .build(); + + let expected = p.clone(); + + // no rewrite + assert_eq!(p.clear_timestamp_if_max_range(), expected); + } + + #[test] + fn test_clear_timestamp_if_max_range_in_range() { + let p = PredicateBuilder::new() + .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME) + .add_expr(col("foo").eq(lit(42))) + .build(); + + let expected = PredicateBuilder::new() + .add_expr(col("foo").eq(lit(42))) + .build(); + // rewrite + assert_eq!(p.clear_timestamp_if_max_range(), expected); + } +} diff --git a/predicate/src/predicate.rs b/predicate/src/predicate.rs deleted file mode 100644 index 246c6888f0..0000000000 --- a/predicate/src/predicate.rs +++ /dev/null @@ -1,648 +0,0 @@ -//! This module contains a unified Predicate structure for IOx qieries -//! that can select and filter Fields and Tags from the InfluxDB data -//! mode as well as for arbitrary other predicates that are expressed -//! by DataFusion's `Expr` type. - -use std::{ - collections::{BTreeSet, HashSet}, - fmt, -}; - -use data_types::timestamp::{TimestampRange, MAX_NANO_TIME, MIN_NANO_TIME}; -use datafusion::{ - error::DataFusionError, - logical_plan::{col, lit_timestamp_nano, Column, Expr, Operator}, - optimizer::utils, -}; -use datafusion_util::{make_range_expr, AndExprBuilder}; -use observability_deps::tracing::debug; -use schema::TIME_COLUMN_NAME; - -/// This `Predicate` represents the empty predicate (aka that -/// evaluates to true for all rows). 
-pub const EMPTY_PREDICATE: Predicate = Predicate { - field_columns: None, - exprs: vec![], - range: None, - partition_key: None, - value_expr: vec![], -}; - -#[derive(Debug, Clone, Copy)] -/// The result of evaluating a predicate on a set of rows -pub enum PredicateMatch { - /// There is at least one row that matches the predicate that has - /// at least one non null value in each field of the predicate - AtLeastOneNonNullField, - - /// There are exactly zero rows that match the predicate - Zero, - - /// There *may* be rows that match, OR there *may* be no rows that - /// match - Unknown, -} - -/// Represents a parsed predicate for evaluation by the InfluxDB IOx -/// query engine. -/// -/// Note that the InfluxDB data model (e.g. ParsedLine's) -/// distinguishes between some types of columns (tags and fields), and -/// likewise the semantics of this structure can express some types of -/// restrictions that only apply to certain types of columns. -#[derive(Clone, Debug, Default, PartialEq, PartialOrd)] -pub struct Predicate { - /// Optional field restriction. If present, restricts the results to only - /// tables which have *at least one* of the fields in field_columns. - pub field_columns: Option>, - - /// Optional partition key filter - pub partition_key: Option, - - /// Optional timestamp range: only rows within this range are included in - /// results. Other rows are excluded - pub range: Option, - - /// Optional arbitrary predicates, represented as list of - /// DataFusion expressions applied a logical conjunction (aka they - /// are 'AND'ed together). Only rows that evaluate to TRUE for all - /// these expressions should be returned. Other rows are excluded - /// from the results. - pub exprs: Vec, - - /// Optional arbitrary predicates on the special `_value` column. These - /// expressions are applied to `field_columns` projections in the form of - /// `CASE` statement conditions. - pub value_expr: Vec, -} - -impl Predicate { - /// Return true if this predicate has any general purpose predicates - pub fn has_exprs(&self) -> bool { - !self.exprs.is_empty() - } - - /// Return a DataFusion `Expr` predicate representing the - /// combination of all predicate (`exprs`) and timestamp - /// restriction in this Predicate. 
Returns None if there are no - /// `Expr`'s restricting the data - pub fn filter_expr(&self) -> Option { - let mut builder = - AndExprBuilder::default().append_opt(self.make_timestamp_predicate_expr()); - - for expr in &self.exprs { - builder = builder.append_expr(expr.clone()); - } - - builder.build() - } - - /// Return true if the field should be included in results - pub fn should_include_field(&self, field_name: &str) -> bool { - match &self.field_columns { - None => true, // No field restriction on predicate - Some(field_names) => field_names.contains(field_name), - } - } - - /// Creates a DataFusion predicate for appliying a timestamp range: - /// - /// `range.start <= time and time < range.end` - fn make_timestamp_predicate_expr(&self) -> Option { - self.range - .map(|range| make_range_expr(range.start(), range.end(), TIME_COLUMN_NAME)) - } - - /// Returns true if ths predicate evaluates to true for all rows - pub fn is_empty(&self) -> bool { - self == &EMPTY_PREDICATE - } - - /// Return a negated DF logical expression for the given delete predicates - pub fn negated_expr(delete_predicates: &[S]) -> Option - where - S: AsRef, - { - if delete_predicates.is_empty() { - return None; - } - - let mut pred = PredicateBuilder::default().build(); - pred.merge_delete_predicates(delete_predicates); - - // Make a conjunctive expression of the pred.exprs - let mut val = None; - for e in pred.exprs { - match val { - None => val = Some(e), - Some(expr) => val = Some(expr.and(e)), - } - } - - val - } - - /// Merge the given delete predicates into this select predicate. - /// Since we want to eliminate data filtered by the delete predicates, - /// they are first converted into their negated form: NOT(delete_predicate) - /// then added/merged into the selection one - pub fn merge_delete_predicates(&mut self, delete_predicates: &[S]) - where - S: AsRef, - { - // Create a list of disjunctive negated expressions. - // Example: there are two deletes as follows (note that time_range is stored separated in the Predicate - // but we need to put it together with the exprs here) - // . Delete_1: WHERE city != "Boston" AND temp = 70 AND time_range in [10, 30) - // . 
Delete 2: WHERE state = "NY" AND route != "I90" AND time_range in [20, 50) - // The negated list will be "NOT(Delete_1)", NOT(Delete_2)" which means - // NOT(city != "Boston" AND temp = 70 AND time_range in [10, 30]), NOT(state = "NY" AND route != "I90" AND time_range in [20, 50]) which means - // [NOT(city = Boston") OR NOT(temp = 70) OR NOT(time_range in [10, 30])], [NOT(state = "NY") OR NOT(route != "I90") OR NOT(time_range in [20, 50])] - // Note that the "NOT(time_range in [20, 50])]" or "NOT(20 <= time <= 50)"" is replaced with "time < 20 OR time > 50" - - for pred in delete_predicates { - let pred = pred.as_ref(); - - let mut expr: Option = None; - - // Time range - if let Some(range) = pred.range { - // time_expr = NOT(start <= time_range <= end) - // Equivalent to: (time < start OR time > end) - let time_expr = col(TIME_COLUMN_NAME) - .lt(lit_timestamp_nano(range.start())) - .or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end()))); - - match expr { - None => expr = Some(time_expr), - Some(e) => expr = Some(e.or(time_expr)), - } - } - - // Exprs - for exp in &pred.exprs { - match expr { - None => expr = Some(exp.clone().not()), - Some(e) => expr = Some(e.or(exp.clone().not())), - } - } - - // Push the negated expression of the delete predicate into the list exprs of the select predicate - if let Some(e) = expr { - self.exprs.push(e); - } - } - } - - /// Removes the timestamp range from this predicate, if the range - /// is for the entire min/max valid range. - /// - /// This is used in certain cases to retain compatibility with the - /// existing storage engine - pub(crate) fn clear_timestamp_if_max_range(mut self) -> Self { - self.range = self.range.take().and_then(|range| { - if range.start() <= MIN_NANO_TIME && range.end() >= MAX_NANO_TIME { - None - } else { - Some(range) - } - }); - - self - } -} - -impl fmt::Display for Predicate { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn iter_to_str(s: impl IntoIterator) -> String - where - S: ToString, - { - s.into_iter() - .map(|v| v.to_string()) - .collect::>() - .join(", ") - } - - write!(f, "Predicate")?; - - if let Some(field_columns) = &self.field_columns { - write!(f, " field_columns: {{{}}}", iter_to_str(field_columns))?; - } - - if let Some(partition_key) = &self.partition_key { - write!(f, " partition_key: '{}'", partition_key)?; - } - - if let Some(range) = &self.range { - // TODO: could be nice to show this as actual timestamps (not just numbers)? 
- write!(f, " range: [{} - {}]", range.start(), range.end())?; - } - - if !self.exprs.is_empty() { - write!(f, " exprs: [")?; - for (i, expr) in self.exprs.iter().enumerate() { - write!(f, "{}", expr)?; - if i < self.exprs.len() - 1 { - write!(f, ", ")?; - } - } - write!(f, "]")?; - } - Ok(()) - } -} - -#[derive(Debug, Default)] -/// Structure for building [`Predicate`]s -/// -/// Example: -/// ``` -/// use predicate::predicate::PredicateBuilder; -/// use datafusion::logical_plan::{col, lit}; -/// -/// let p = PredicateBuilder::new() -/// .timestamp_range(1, 100) -/// .add_expr(col("foo").eq(lit(42))) -/// .build(); -/// -/// assert_eq!( -/// p.to_string(), -/// "Predicate range: [1 - 100] exprs: [#foo = Int32(42)]" -/// ); -/// ``` -pub struct PredicateBuilder { - inner: Predicate, -} - -impl From for PredicateBuilder { - fn from(inner: Predicate) -> Self { - Self { inner } - } -} - -impl PredicateBuilder { - pub fn new() -> Self { - Self::default() - } - - /// Sets the timestamp range - pub fn timestamp_range(mut self, start: i64, end: i64) -> Self { - // Without more thought, redefining the timestamp range would - // lose the old range. Asser that that cannot happen. - assert!( - self.inner.range.is_none(), - "Unexpected re-definition of timestamp range" - ); - - self.inner.range = Some(TimestampRange::new(start, end)); - self - } - - /// sets the optional timestamp range, if any - pub fn timestamp_range_option(mut self, range: Option) -> Self { - // Without more thought, redefining the timestamp range would - // lose the old range. Asser that that cannot happen. - assert!( - range.is_none() || self.inner.range.is_none(), - "Unexpected re-definition of timestamp range" - ); - self.inner.range = range; - self - } - - /// Adds an expression to the list of general purpose predicates - pub fn add_expr(mut self, expr: Expr) -> Self { - self.inner.exprs.push(expr); - self - } - - /// Builds a regex matching expression from the provided column name and - /// pattern. Values not matching the regex will be filtered out. - pub fn build_regex_match_expr(self, column: &str, pattern: impl Into) -> Self { - self.regex_match_expr(column, pattern, true) - } - - /// Builds a regex "not matching" expression from the provided column name - /// and pattern. Values *matching* the regex will be filtered out. 
- pub fn build_regex_not_match_expr(self, column: &str, pattern: impl Into) -> Self { - self.regex_match_expr(column, pattern, false) - } - - fn regex_match_expr(mut self, column: &str, pattern: impl Into, matches: bool) -> Self { - let expr = crate::regex::regex_match_expr(col(column), pattern.into(), matches); - self.inner.exprs.push(expr); - self - } - - /// Sets field_column restriction - pub fn field_columns(mut self, columns: Vec>) -> Self { - // We need to distinguish predicates like `column_name In - // (foo, bar)` and `column_name = foo and column_name = bar` in order to handle - // this - if self.inner.field_columns.is_some() { - unimplemented!("Complex/Multi field predicates are not yet supported"); - } - - let column_names = columns - .into_iter() - .map(|s| s.into()) - .collect::>(); - - self.inner.field_columns = Some(column_names); - self - } - - /// Set the partition key restriction - pub fn partition_key(mut self, partition_key: impl Into) -> Self { - assert!( - self.inner.partition_key.is_none(), - "multiple partition key predicates not suported" - ); - self.inner.partition_key = Some(partition_key.into()); - self - } - - /// Create a predicate, consuming this builder - pub fn build(self) -> Predicate { - self.inner - } - - /// Adds only the expressions from `filters` that can be pushed down to - /// execution engines. - pub fn add_pushdown_exprs(mut self, filters: &[Expr]) -> Self { - // For each expression of the filters, recursively split it, if it is is an AND conjunction - // For example, expression (x AND y) will be split into a vector of 2 expressions [x, y] - let mut exprs = vec![]; - filters - .iter() - .for_each(|expr| Self::split_members(expr, &mut exprs)); - - // Only keep single_column and primitive binary expressions - let mut pushdown_exprs: Vec = vec![]; - let exprs_result = exprs - .into_iter() - .try_for_each::<_, Result<_, DataFusionError>>(|expr| { - let mut columns = HashSet::new(); - utils::expr_to_columns(&expr, &mut columns)?; - - if columns.len() == 1 && Self::primitive_binary_expr(&expr) { - pushdown_exprs.push(expr); - } - Ok(()) - }); - - match exprs_result { - Ok(()) => { - // Return the builder with only the pushdownable expressions on it. - self.inner.exprs.append(&mut pushdown_exprs); - } - Err(e) => { - debug!("Error, {}, building push-down predicates for filters: {:#?}. No predicates are pushed down", e, filters); - } - } - - self - } - - /// Recursively split all "AND" expressions into smaller one - /// Example: "A AND B AND C" => [A, B, C] - pub fn split_members(predicate: &Expr, predicates: &mut Vec) { - match predicate { - Expr::BinaryExpr { - right, - op: Operator::And, - left, - } => { - Self::split_members(left, predicates); - Self::split_members(right, predicates); - } - other => predicates.push(other.clone()), - } - } - - /// Return true if the given expression is in a primitive binary in the form: `column op constant` - // and op must be a comparison one - pub fn primitive_binary_expr(expr: &Expr) -> bool { - match expr { - Expr::BinaryExpr { left, op, right } => { - matches!( - (&**left, &**right), - (Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_)) - ) && matches!( - op, - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq - ) - } - _ => false, - } - } -} - -// A representation of the `BinaryExpr` variant of a Datafusion expression. 
-#[derive(Clone, Debug, PartialEq, PartialOrd)] -pub struct BinaryExpr { - pub left: Column, - pub op: Operator, - pub right: Expr, -} - -#[cfg(test)] -mod tests { - use super::*; - use datafusion::logical_plan::{col, lit}; - - #[test] - fn test_default_predicate_is_empty() { - let p = Predicate::default(); - assert!(p.is_empty()); - } - - #[test] - fn test_non_default_predicate_is_not_empty() { - let p = PredicateBuilder::new().timestamp_range(1, 100).build(); - - assert!(!p.is_empty()); - } - - #[test] - fn test_pushdown_predicates() { - let mut filters = vec![]; - - // state = CA - let expr1 = col("state").eq(lit("CA")); - filters.push(expr1); - - // "price > 10" - let expr2 = col("price").gt(lit(10)); - filters.push(expr2); - - // a < 10 AND b >= 50 --> will be split to [a < 10, b >= 50] - let expr3 = col("a").lt(lit(10)).and(col("b").gt_eq(lit(50))); - filters.push(expr3); - - // c != 3 OR d = 8 --> won't be pushed down - let expr4 = col("c").not_eq(lit(3)).or(col("d").eq(lit(8))); - filters.push(expr4); - - // e is null --> won't be pushed down - let expr5 = col("e").is_null(); - filters.push(expr5); - - // f <= 60 - let expr6 = col("f").lt_eq(lit(60)); - filters.push(expr6); - - // g is not null --> won't be pushed down - let expr7 = col("g").is_not_null(); - filters.push(expr7); - - // h + i --> won't be pushed down - let expr8 = col("h") + col("i"); - filters.push(expr8); - - // city = Boston - let expr9 = col("city").eq(lit("Boston")); - filters.push(expr9); - - // city != Braintree - let expr9 = col("city").not_eq(lit("Braintree")); - filters.push(expr9); - - // city != state --> won't be pushed down - let expr10 = col("city").not_eq(col("state")); - filters.push(expr10); - - // city = state --> won't be pushed down - let expr11 = col("city").eq(col("state")); - filters.push(expr11); - - // city_state = city + state --> won't be pushed down - let expr12 = col("city_sate").eq(col("city") + col("state")); - filters.push(expr12); - - // city = city + 5 --> won't be pushed down - let expr13 = col("city").eq(col("city") + lit(5)); - filters.push(expr13); - - // city = city --> won't be pushed down - let expr14 = col("city").eq(col("city")); - filters.push(expr14); - - // city + 5 = city --> won't be pushed down - let expr15 = (col("city") + lit(5)).eq(col("city")); - filters.push(expr15); - - // 5 = city - let expr16 = lit(5).eq(col("city")); - filters.push(expr16); - - println!(" --------------- Filters: {:#?}", filters); - - // Expected pushdown predicates: [state = CA, price > 10, a < 10, b >= 50, f <= 60, city = Boston, city != Braintree, 5 = city] - let predicate = PredicateBuilder::default() - .add_pushdown_exprs(&filters) - .build(); - - println!(" ------------- Predicates: {:#?}", predicate); - assert_eq!(predicate.exprs.len(), 8); - assert_eq!(predicate.exprs[0], col("state").eq(lit("CA"))); - assert_eq!(predicate.exprs[1], col("price").gt(lit(10))); - assert_eq!(predicate.exprs[2], col("a").lt(lit(10))); - assert_eq!(predicate.exprs[3], col("b").gt_eq(lit(50))); - assert_eq!(predicate.exprs[4], col("f").lt_eq(lit(60))); - assert_eq!(predicate.exprs[5], col("city").eq(lit("Boston"))); - assert_eq!(predicate.exprs[6], col("city").not_eq(lit("Braintree"))); - assert_eq!(predicate.exprs[7], lit(5).eq(col("city"))); - } - #[test] - fn predicate_display_ts() { - // TODO make this a doc example? 
- let p = PredicateBuilder::new().timestamp_range(1, 100).build(); - - assert_eq!(p.to_string(), "Predicate range: [1 - 100]"); - } - - #[test] - fn predicate_display_ts_and_expr() { - let p = PredicateBuilder::new() - .timestamp_range(1, 100) - .add_expr(col("foo").eq(lit(42)).and(col("bar").lt(lit(11)))) - .build(); - - assert_eq!( - p.to_string(), - "Predicate range: [1 - 100] exprs: [#foo = Int32(42) AND #bar < Int32(11)]" - ); - } - - #[test] - fn predicate_display_full() { - let p = PredicateBuilder::new() - .timestamp_range(1, 100) - .add_expr(col("foo").eq(lit(42))) - .field_columns(vec!["f1", "f2"]) - .partition_key("the_key") - .build(); - - assert_eq!(p.to_string(), "Predicate field_columns: {f1, f2} partition_key: 'the_key' range: [1 - 100] exprs: [#foo = Int32(42)]"); - } - - #[test] - fn test_clear_timestamp_if_max_range_out_of_range() { - let p = PredicateBuilder::new() - .timestamp_range(1, 100) - .add_expr(col("foo").eq(lit(42))) - .build(); - - let expected = p.clone(); - - // no rewrite - assert_eq!(p.clear_timestamp_if_max_range(), expected); - } - - #[test] - fn test_clear_timestamp_if_max_range_out_of_range_low() { - let p = PredicateBuilder::new() - .timestamp_range(MIN_NANO_TIME, 100) - .add_expr(col("foo").eq(lit(42))) - .build(); - - let expected = p.clone(); - - // no rewrite - assert_eq!(p.clear_timestamp_if_max_range(), expected); - } - - #[test] - fn test_clear_timestamp_if_max_range_out_of_range_high() { - let p = PredicateBuilder::new() - .timestamp_range(0, MAX_NANO_TIME) - .add_expr(col("foo").eq(lit(42))) - .build(); - - let expected = p.clone(); - - // no rewrite - assert_eq!(p.clear_timestamp_if_max_range(), expected); - } - - #[test] - fn test_clear_timestamp_if_max_range_in_range() { - let p = PredicateBuilder::new() - .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME) - .add_expr(col("foo").eq(lit(42))) - .build(); - - let expected = PredicateBuilder::new() - .add_expr(col("foo").eq(lit(42))) - .build(); - // rewrite - assert_eq!(p.clear_timestamp_if_max_range(), expected); - } -} diff --git a/predicate/src/rpc_predicate.rs b/predicate/src/rpc_predicate.rs index 2004d8e1c4..e888d27e24 100644 --- a/predicate/src/rpc_predicate.rs +++ b/predicate/src/rpc_predicate.rs @@ -1,7 +1,6 @@ //! Interface logic between IOx ['Predicate`] and predicates used by the //! 
InfluxDB Storage gRPC API -use crate::predicate::{BinaryExpr, Predicate}; -use crate::rewrite; +use crate::{rewrite, BinaryExpr, Predicate}; use datafusion::error::Result as DataFusionResult; use datafusion::execution::context::ExecutionProps; diff --git a/query/src/frontend/influxrpc.rs b/query/src/frontend/influxrpc.rs index 5800c73d06..0f4f4da8ce 100644 --- a/query/src/frontend/influxrpc.rs +++ b/query/src/frontend/influxrpc.rs @@ -18,8 +18,8 @@ use datafusion_util::AsExpr; use hashbrown::HashSet; use observability_deps::tracing::{debug, trace}; -use predicate::predicate::{BinaryExpr, Predicate, PredicateMatch}; use predicate::rpc_predicate::{InfluxRpcPredicate, FIELD_COLUMN_NAME, MEASUREMENT_COLUMN_NAME}; +use predicate::{BinaryExpr, Predicate, PredicateMatch}; use schema::selection::Selection; use schema::{InfluxColumnType, Schema, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; @@ -1834,7 +1834,7 @@ impl<'a> ExprRewriter for MissingColumnsToNull<'a> { #[cfg(test)] mod tests { use datafusion::logical_plan::lit; - use predicate::predicate::PredicateBuilder; + use predicate::PredicateBuilder; use schema::builder::SchemaBuilder; use crate::{ diff --git a/query/src/lib.rs b/query/src/lib.rs index 7404bb3e4f..ac37c7d758 100644 --- a/query/src/lib.rs +++ b/query/src/lib.rs @@ -16,10 +16,7 @@ use data_types::{ use datafusion::physical_plan::SendableRecordBatchStream; use exec::stringset::StringSet; use observability_deps::tracing::{debug, trace}; -use predicate::{ - predicate::{Predicate, PredicateMatch}, - rpc_predicate::QueryDatabaseMeta, -}; +use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate, PredicateMatch}; use schema::selection::Selection; use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME}; diff --git a/query/src/provider.rs b/query/src/provider.rs index 966f661cd1..b0c094f3ab 100644 --- a/query/src/provider.rs +++ b/query/src/provider.rs @@ -18,7 +18,7 @@ use datafusion::{ }, }; use observability_deps::tracing::{debug, trace}; -use predicate::predicate::{Predicate, PredicateBuilder}; +use predicate::{Predicate, PredicateBuilder}; use schema::{merge::SchemaMerger, sort::SortKey, Schema}; use crate::{ diff --git a/query/src/provider/physical.rs b/query/src/provider/physical.rs index 5f8f6ae9b6..7025697f16 100644 --- a/query/src/provider/physical.rs +++ b/query/src/provider/physical.rs @@ -16,7 +16,7 @@ use schema::selection::Selection; use schema::Schema; use crate::QueryChunk; -use predicate::predicate::Predicate; +use predicate::Predicate; use async_trait::async_trait; diff --git a/query/src/pruning.rs b/query/src/pruning.rs index 0bb4d1c9ce..bf4f09003d 100644 --- a/query/src/pruning.rs +++ b/query/src/pruning.rs @@ -13,7 +13,7 @@ use datafusion::{ physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, }; use observability_deps::tracing::{debug, trace}; -use predicate::predicate::Predicate; +use predicate::Predicate; use schema::Schema; use crate::{group_by::Aggregate, QueryChunkMeta}; @@ -228,7 +228,7 @@ mod test { use std::{cell::RefCell, sync::Arc}; use datafusion::logical_plan::{col, lit}; - use predicate::predicate::PredicateBuilder; + use predicate::PredicateBuilder; use schema::merge::SchemaMerger; use crate::{test::TestChunk, QueryChunk}; diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index 908c1288f3..ad91ca3a59 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -1,7 +1,7 @@ use arrow::datatypes::DataType; use 
datafusion::logical_plan::{col, lit}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::fieldlist::{Field, FieldList}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/query_tests/src/influxrpc/read_filter.rs b/query_tests/src/influxrpc/read_filter.rs index 01b00e99c1..4eaae5de73 100644 --- a/query_tests/src/influxrpc/read_filter.rs +++ b/query_tests/src/influxrpc/read_filter.rs @@ -13,8 +13,8 @@ use crate::{ }, }; use datafusion::logical_plan::{col, lit}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::frontend::influxrpc::InfluxRpcPlanner; /// runs read_filter(predicate) and compares it to the expected diff --git a/query_tests/src/influxrpc/read_group.rs b/query_tests/src/influxrpc/read_group.rs index 127f300154..bed008d936 100644 --- a/query_tests/src/influxrpc/read_group.rs +++ b/query_tests/src/influxrpc/read_group.rs @@ -14,8 +14,8 @@ use datafusion::{ logical_plan::{binary_expr, Operator}, prelude::*, }; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{frontend::influxrpc::InfluxRpcPlanner, group_by::Aggregate}; /// runs read_group(predicate) and compares it to the expected diff --git a/query_tests/src/influxrpc/read_window_aggregate.rs b/query_tests/src/influxrpc/read_window_aggregate.rs index 47f0c944e2..e7c43ddcfb 100644 --- a/query_tests/src/influxrpc/read_window_aggregate.rs +++ b/query_tests/src/influxrpc/read_window_aggregate.rs @@ -10,8 +10,8 @@ use async_trait::async_trait; use data_types::{delete_predicate::DeletePredicate, timestamp::TimestampRange}; use datafusion::prelude::*; use db::{test_helpers::write_lp, utils::make_db}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ frontend::influxrpc::InfluxRpcPlanner, group_by::{Aggregate, WindowDuration}, diff --git a/query_tests/src/influxrpc/table_names.rs b/query_tests/src/influxrpc/table_names.rs index beef62f3e0..2fe5421d0a 100644 --- a/query_tests/src/influxrpc/table_names.rs +++ b/query_tests/src/influxrpc/table_names.rs @@ -1,7 +1,7 @@ //! 
Tests for the Influx gRPC queries use datafusion::logical_plan::{col, lit}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::stringset::{IntoStringSet, StringSetRef}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/query_tests/src/influxrpc/tag_keys.rs b/query_tests/src/influxrpc/tag_keys.rs index 55c4497882..5dcf1f0007 100644 --- a/query_tests/src/influxrpc/tag_keys.rs +++ b/query_tests/src/influxrpc/tag_keys.rs @@ -1,6 +1,6 @@ use datafusion::logical_plan::{col, lit}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::stringset::{IntoStringSet, StringSetRef}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/query_tests/src/influxrpc/tag_values.rs b/query_tests/src/influxrpc/tag_values.rs index e64b596fd1..4f6a46299c 100644 --- a/query_tests/src/influxrpc/tag_values.rs +++ b/query_tests/src/influxrpc/tag_values.rs @@ -1,6 +1,6 @@ use datafusion::logical_plan::{col, lit}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::stringset::{IntoStringSet, StringSetRef}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/query_tests/src/pruning.rs b/query_tests/src/pruning.rs index e22c9f7a3e..9f36af7c46 100644 --- a/query_tests/src/pruning.rs +++ b/query_tests/src/pruning.rs @@ -5,8 +5,8 @@ use db::{ utils::{make_db, TestDb}, }; use metric::{Attributes, Metric, U64Counter}; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::{stringset::StringSet, ExecutionContextProvider}, frontend::{influxrpc::InfluxRpcPlanner, sql::SqlQueryPlanner}, diff --git a/server_benchmarks/benches/read_filter.rs b/server_benchmarks/benches/read_filter.rs index 7cec9b7713..f95dd83e02 100644 --- a/server_benchmarks/benches/read_filter.rs +++ b/server_benchmarks/benches/read_filter.rs @@ -5,8 +5,8 @@ use std::io::Read; // current-thread executor use db::Db; use flate2::read::GzDecoder; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::{Executor, ExecutorType}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/server_benchmarks/benches/read_group.rs b/server_benchmarks/benches/read_group.rs index 4ff62e4570..d606472b36 100644 --- a/server_benchmarks/benches/read_group.rs +++ b/server_benchmarks/benches/read_group.rs @@ -5,8 +5,8 @@ use std::io::Read; // current-thread executor use db::Db; use flate2::read::GzDecoder; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::{Executor, ExecutorType}, frontend::influxrpc::InfluxRpcPlanner, diff --git a/server_benchmarks/benches/tag_values.rs b/server_benchmarks/benches/tag_values.rs index 1028d1e45d..252159a71f 100644 --- a/server_benchmarks/benches/tag_values.rs +++ b/server_benchmarks/benches/tag_values.rs @@ -5,8 +5,8 @@ use std::io::Read; // current-thread executor use db::Db; use flate2::read::GzDecoder; -use predicate::predicate::PredicateBuilder; use predicate::rpc_predicate::InfluxRpcPredicate; +use predicate::PredicateBuilder; use query::{ exec::{Executor, ExecutorType}, frontend::influxrpc::InfluxRpcPlanner, From e6ec8ef5f345a9e3b91a414da154e256b6d35fe3 Mon Sep 17 00:00:00 2001 From: Andrew 
Lamb Date: Mon, 7 Feb 2022 10:04:20 -0500 Subject: [PATCH 09/30] test: tests to show predicate simplification on chunks (#3649) * test: tests to show predicate simplification on chunks * fix: clippy * refactor: less Box * refactor: make typealias + add comments, hopefully to improve clarity Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- query/src/frontend/influxrpc.rs | 100 ++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 25 deletions(-) diff --git a/query/src/frontend/influxrpc.rs b/query/src/frontend/influxrpc.rs index 0f4f4da8ce..8727850615 100644 --- a/query/src/frontend/influxrpc.rs +++ b/query/src/frontend/influxrpc.rs @@ -1953,7 +1953,7 @@ mod tests { #[tokio::test] async fn test_predicate_rewrite_table_names() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { InfluxRpcPlanner::new() .table_names(test_db, rpc_predicate) .expect("creating plan"); @@ -1963,7 +1963,7 @@ mod tests { #[tokio::test] async fn test_predicate_rewrite_tag_keys() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { InfluxRpcPlanner::new() .tag_keys(test_db, rpc_predicate) .expect("creating plan"); @@ -1973,7 +1973,7 @@ mod tests { #[tokio::test] async fn test_predicate_rewrite_tag_values() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { InfluxRpcPlanner::new() .tag_values(test_db, "foo", rpc_predicate) .expect("creating plan"); @@ -1983,7 +1983,7 @@ mod tests { #[tokio::test] async fn test_predicate_rewrite_field_columns() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { InfluxRpcPlanner::new() .field_columns(test_db, rpc_predicate) .expect("creating plan"); @@ -1993,7 +1993,7 @@ mod tests { #[tokio::test] async fn test_predicate_rewrite_read_filter() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { InfluxRpcPlanner::new() .read_filter(test_db, rpc_predicate) .expect("creating plan"); @@ -2003,7 +2003,7 @@ mod tests { #[tokio::test] async fn test_predicate_read_group() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { let agg = Aggregate::None; let group_columns = &["foo"]; InfluxRpcPlanner::new() @@ -2015,7 +2015,7 @@ mod tests { #[tokio::test] async fn test_predicate_read_window_aggregate() { - run_test::<_, TestDatabase>(|test_db, rpc_predicate| { + run_test(&|test_db, rpc_predicate| { let agg = Aggregate::First; let every = WindowDuration::from_months(1, false); let offset = WindowDuration::from_months(1, false); @@ -2026,17 +2026,15 @@ mod tests { .await } - /// Runs func() and checks that predicates are simplified prior to sending them off - async fn run_test(f: F) - where - F: FnOnce(&TestDatabase, InfluxRpcPredicate) + Send, - { - let chunk0 = Arc::new( - TestChunk::new("h2o") - .with_id(0) - .with_tag_column("foo") - .with_time_column(), - ); + /// Given a `TestDatabase` plans a InfluxRPC query + /// (e.g. read_filter, read_window_aggregate, etc). The test below + /// ensures that predicates are simplified during query planning. + type PlanRPCFunc = dyn Fn(&TestDatabase, InfluxRpcPredicate) + Send + Sync; + + /// Runs func() and checks that predicates are simplified prior to + /// sending them down to the chunks for processing. 
+ async fn run_test(func: &'static PlanRPCFunc) { + // ------------- Test 1 ---------------- // this is what happens with a grpc predicate on a tag // @@ -2053,22 +2051,74 @@ mod tests { .add_expr(expr.eq(lit("bar"))) .build(); + // verify that the predicate was rewritten to `foo = 'bar'` + let expr = col("foo").eq(lit("bar")); + let expected_predicate = PredicateBuilder::new().add_expr(expr).build(); + + run_test_with_predicate(&func, silly_predicate, expected_predicate).await; + + // ------------- Test 2 ---------------- + // Validate that _measurement predicates are translated + // + // https://github.com/influxdata/influxdb_iox/issues/3601 + // _measurement = 'foo' + let silly_predicate = PredicateBuilder::new() + .add_expr(col("_measurement").eq(lit("foo"))) + .build(); + + // verify that the predicate was rewritten to `false` as the + // measurement name is `h20` + let expr = lit(false); + + let expected_predicate = PredicateBuilder::new().add_expr(expr).build(); + run_test_with_predicate(&func, silly_predicate, expected_predicate).await; + + // ------------- Test 3 ---------------- + // more complicated _measurement predicates are translated + // + // https://github.com/influxdata/influxdb_iox/issues/3601 + // (_measurement = 'foo' or measurement = 'h2o') AND time > 5 + let silly_predicate = PredicateBuilder::new() + .add_expr( + col("_measurement") + .eq(lit("foo")) + .or(col("_measurement").eq(lit("h2o"))) + .and(col("time").gt(lit(5))), + ) + .build(); + + // verify that the predicate was rewritten to time > 5 + let expr = col("time").gt(lit(5)); + + let expected_predicate = PredicateBuilder::new().add_expr(expr).build(); + run_test_with_predicate(&func, silly_predicate, expected_predicate).await; + } + + /// Runs func() with the specified predicate and verifies + /// `expected_predicate` is received by the chunk + async fn run_test_with_predicate( + func: &PlanRPCFunc, + predicate: Predicate, + expected_predicate: Predicate, + ) { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_time_column(), + ); + let executor = Arc::new(Executor::new(1)); let test_db = TestDatabase::new(Arc::clone(&executor)); test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); - let rpc_predicate = InfluxRpcPredicate::new(None, silly_predicate); + let rpc_predicate = InfluxRpcPredicate::new(None, predicate); // run the function - f(&test_db, rpc_predicate); + func(&test_db, rpc_predicate); let actual_predicate = test_db.get_chunks_predicate(); - // verify that the predicate was rewritten to `foo = 'bar'` - let expr = col("foo").eq(lit("bar")); - - let expected_predicate = PredicateBuilder::new().add_expr(expr).build(); - assert_eq!( actual_predicate, expected_predicate, "\nActual: {:?}\nExpected: {:?}", From 8d7865496d0625b2e4f81a31e992b1e235296d21 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Feb 2022 10:14:34 -0500 Subject: [PATCH 10/30] refactor: remove out of date comments (#3653) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- query_tests/src/influxrpc/read_group.rs | 4 ---- query_tests/src/influxrpc/read_window_aggregate.rs | 4 ---- query_tests/src/scenarios/library.rs | 10 ---------- 3 files changed, 18 deletions(-) diff --git a/query_tests/src/influxrpc/read_group.rs b/query_tests/src/influxrpc/read_group.rs index bed008d936..c2937548aa 100644 --- a/query_tests/src/influxrpc/read_group.rs +++ b/query_tests/src/influxrpc/read_group.rs @@ -360,8 +360,6 @@ async fn 
test_grouped_series_set_plan_count_measurement_pred() { .await; } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - #[tokio::test] async fn test_grouped_series_set_plan_first() { let predicate = PredicateBuilder::default() @@ -486,8 +484,6 @@ async fn test_grouped_series_set_plan_last_with_nulls() { .await; } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - #[tokio::test] async fn test_grouped_series_set_plan_min() { let predicate = PredicateBuilder::default() diff --git a/query_tests/src/influxrpc/read_window_aggregate.rs b/query_tests/src/influxrpc/read_window_aggregate.rs index e7c43ddcfb..df909d1df6 100644 --- a/query_tests/src/influxrpc/read_window_aggregate.rs +++ b/query_tests/src/influxrpc/read_window_aggregate.rs @@ -108,8 +108,6 @@ impl DbSetup for MeasurementForWindowAggregate { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - #[tokio::test] async fn test_read_window_aggregate_nanoseconds() { let predicate = PredicateBuilder::default() @@ -256,8 +254,6 @@ impl DbSetup for MeasurementForWindowAggregateMonths { } } -// NGA todo: add delete DbSetup - #[tokio::test] async fn test_read_window_aggregate_months() { let agg = Aggregate::Mean; diff --git a/query_tests/src/scenarios/library.rs b/query_tests/src/scenarios/library.rs index 90b773d943..11b1a73054 100644 --- a/query_tests/src/scenarios/library.rs +++ b/query_tests/src/scenarios/library.rs @@ -1308,8 +1308,6 @@ impl DbSetup for OneMeasurementForAggs { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - pub struct AnotherMeasurementForAggs {} #[async_trait] impl DbSetup for AnotherMeasurementForAggs { @@ -1332,8 +1330,6 @@ impl DbSetup for AnotherMeasurementForAggs { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - pub struct TwoMeasurementForAggs {} #[async_trait] impl DbSetup for TwoMeasurementForAggs { @@ -1353,8 +1349,6 @@ impl DbSetup for TwoMeasurementForAggs { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - pub struct MeasurementForSelectors {} #[async_trait] impl DbSetup for MeasurementForSelectors { @@ -1408,8 +1402,6 @@ impl DbSetup for MeasurementForMax { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - pub struct MeasurementForGroupKeys {} #[async_trait] impl DbSetup for MeasurementForGroupKeys { @@ -1432,8 +1424,6 @@ impl DbSetup for MeasurementForGroupKeys { } } -// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks - pub struct MeasurementForGroupByField {} #[async_trait] impl DbSetup for MeasurementForGroupByField { From 87ac926e06270e212d93d53568d7296adf4601d4 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 7 Feb 2022 15:26:06 +0000 Subject: [PATCH 11/30] feat: add queries system table (#3655) Co-authored-by: Andrew Lamb --- influxdb_iox/src/commands/sql/observer.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/influxdb_iox/src/commands/sql/observer.rs b/influxdb_iox/src/commands/sql/observer.rs index 6f510b33dc..53d66613a7 100644 --- a/influxdb_iox/src/commands/sql/observer.rs +++ b/influxdb_iox/src/commands/sql/observer.rs @@ -102,7 +102,13 @@ async fn load_remote_system_tables( connection: Connection, ) -> Result<()> { // all prefixed with "system." 
- let table_names = vec!["chunks", "chunk_columns", "columns", "operations"]; + let table_names = vec![ + "chunks", + "chunk_columns", + "columns", + "operations", + "queries", + ]; let start = Instant::now(); From 977ccc1989ee85314ca285b4916ade14f54ad89d Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 7 Feb 2022 15:56:54 +0000 Subject: [PATCH 12/30] fix: use a single metric registry for ingester (#3652) With this change write buffer ingestion metrics are showing up under `/metrics` Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- influxdb_iox/src/commands/run/ingester.rs | 2 +- ingester/src/server.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index a263bb2e13..bbca6354c7 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -167,7 +167,7 @@ pub async fn command(config: Config) -> Result<()> { let http = HttpDelegate::new(Arc::clone(&ingest_handler)); let grpc = GrpcDelegate::new(ingest_handler); - let ingester = IngesterServer::new(http, grpc); + let ingester = IngesterServer::new(metric_registry, http, grpc); let server_type = Arc::new(IngesterServerType::new(ingester, &common_state)); info!("starting ingester"); diff --git a/ingester/src/server.rs b/ingester/src/server.rs index 324b8cbf5b..7e0680fdd9 100644 --- a/ingester/src/server.rs +++ b/ingester/src/server.rs @@ -22,9 +22,13 @@ pub struct IngesterServer { impl IngesterServer { /// Initialise a new [`IngesterServer`] using the provided HTTP and gRPC /// handlers. - pub fn new(http: HttpDelegate, grpc: GrpcDelegate) -> Self { + pub fn new( + metrics: Arc, + http: HttpDelegate, + grpc: GrpcDelegate, + ) -> Self { Self { - metrics: Default::default(), + metrics, http, grpc, } From d9cc9f5a2a7da0792276e31cca4d53d85392e1d2 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 7 Feb 2022 16:24:28 +0000 Subject: [PATCH 13/30] feat: expose write buffer connection config via CLI (#3651) * feat: improve rskafka config error messages * feat: expose write buffer connection config via CLI --- influxdb_iox/src/clap_blocks/write_buffer.rs | 113 ++++++++++++-- influxdb_iox/src/commands/run/ingester.rs | 23 +-- influxdb_iox/src/commands/run/router2.rs | 2 +- write_buffer/src/kafka/config.rs | 150 ++++++++++++++----- 4 files changed, 220 insertions(+), 68 deletions(-) diff --git a/influxdb_iox/src/clap_blocks/write_buffer.rs b/influxdb_iox/src/clap_blocks/write_buffer.rs index 4c556befaf..b370cb380a 100644 --- a/influxdb_iox/src/clap_blocks/write_buffer.rs +++ b/influxdb_iox/src/clap_blocks/write_buffer.rs @@ -1,11 +1,11 @@ -use std::sync::Arc; +use std::{collections::BTreeMap, sync::Arc}; use data_types::write_buffer::WriteBufferConnection; use time::SystemProvider; use trace::TraceCollector; use write_buffer::{ config::WriteBufferConfigFactory, - core::{WriteBufferError, WriteBufferWriting}, + core::{WriteBufferError, WriteBufferReading, WriteBufferWriting}, }; #[derive(Debug, clap::Parser)] @@ -27,27 +27,114 @@ pub struct WriteBufferConfig { default_value = "iox-shared" )] pub(crate) topic: String, + + /// Write buffer connection config. + /// + /// The concrete options depend on the write buffer type. + /// + /// Command line arguments are passed as `--write-buffer-connection-config key1=value1 key2=value2` or + /// `--write-buffer-connection-config key1=value1,key2=value2`. 
+ /// + /// Environment variables are passed as `key1=value1,key2=value2,...`. + #[clap( + long = "--write-buffer-connection-config", + env = "INFLUXDB_IOX_WRITE_BUFFER_CONNECTION_CONFIG", + default_value = "", + multiple_values = true, + use_delimiter = true + )] + pub(crate) connection_config: Vec, } impl WriteBufferConfig { - /// Initialize the [`WriteBufferWriting`]. - pub async fn init_write_buffer( + /// Initialize a [`WriteBufferWriting`]. + pub async fn writing( &self, metrics: Arc, trace_collector: Option>, ) -> Result, WriteBufferError> { - let write_buffer_config = WriteBufferConnection { + let conn = self.conn(); + let factory = Self::factory(metrics); + factory + .new_config_write(&self.topic, trace_collector.as_ref(), &conn) + .await + } + + /// Initialize a [`WriteBufferReading`]. + pub async fn reading( + &self, + metrics: Arc, + trace_collector: Option>, + ) -> Result, WriteBufferError> { + let conn = self.conn(); + let factory = Self::factory(metrics); + factory + .new_config_read(&self.topic, trace_collector.as_ref(), &conn) + .await + } + + fn connection_config(&self) -> BTreeMap { + let mut cfg = BTreeMap::new(); + + for s in &self.connection_config { + if s.is_empty() { + continue; + } + + if let Some((k, v)) = s.split_once('=') { + cfg.insert(k.to_owned(), v.to_owned()); + } else { + cfg.insert(s.clone(), String::from("")); + } + } + + cfg + } + + fn conn(&self) -> WriteBufferConnection { + WriteBufferConnection { type_: self.type_.clone(), connection: self.connection_string.clone(), - connection_config: Default::default(), + connection_config: self.connection_config(), creation_config: None, - }; + } + } - let write_buffer = - WriteBufferConfigFactory::new(Arc::new(SystemProvider::default()), metrics); - let write_buffer = write_buffer - .new_config_write(&self.topic, trace_collector.as_ref(), &write_buffer_config) - .await?; - Ok(write_buffer) + fn factory(metrics: Arc) -> WriteBufferConfigFactory { + WriteBufferConfigFactory::new(Arc::new(SystemProvider::default()), metrics) + } +} + +#[cfg(test)] +mod tests { + use clap::StructOpt; + + use super::*; + + #[test] + fn test_connection_config() { + let cfg = WriteBufferConfig::try_parse_from([ + "my_binary", + "--write-buffer", + "kafka", + "--write-buffer-addr", + "localhost:1234", + "--write-buffer-connection-config", + "foo=bar", + "", + "x=", + "y", + "foo=baz", + "so=many=args", + ]) + .unwrap(); + let actual = cfg.connection_config(); + let expected = BTreeMap::from([ + (String::from("foo"), String::from("baz")), + (String::from("x"), String::from("")), + (String::from("y"), String::from("")), + (String::from("so"), String::from("many=args")), + ]); + assert_eq!(actual, expected); } } diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index bbca6354c7..cd207a6949 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -12,7 +12,6 @@ use crate::{ }, }, }; -use data_types::write_buffer::WriteBufferConnection; use ingester::{ handler::IngestHandlerImpl, server::{grpc::GrpcDelegate, http::HttpDelegate, IngesterServer}, @@ -24,8 +23,6 @@ use std::collections::BTreeMap; use std::convert::TryFrom; use std::sync::Arc; use thiserror::Error; -use time::TimeProvider; -use write_buffer::config::WriteBufferConfigFactory; #[derive(Debug, Error)] pub enum Error { @@ -107,7 +104,7 @@ pub async fn command(config: Config) -> Result<()> { .kafka_topics() .get_by_name(&config.write_buffer_config.topic) .await? 
- .ok_or(Error::KafkaTopicNotFound(config.write_buffer_config.topic))?; + .ok_or_else(|| Error::KafkaTopicNotFound(config.write_buffer_config.topic.clone()))?; if config.write_buffer_partition_range_start > config.write_buffer_partition_range_end { return Err(Error::KafkaRange); @@ -135,22 +132,10 @@ pub async fn command(config: Config) -> Result<()> { let metric_registry: Arc = Default::default(); let trace_collector = common_state.trace_collector(); - let time_provider: Arc = Arc::new(time::SystemProvider::new()); - let write_buffer_factory = - WriteBufferConfigFactory::new(Arc::clone(&time_provider), Arc::clone(&metric_registry)); - let write_buffer_cfg = WriteBufferConnection { - type_: config.write_buffer_config.type_, - connection: config.write_buffer_config.connection_string, - connection_config: Default::default(), - creation_config: None, - }; - let write_buffer = write_buffer_factory - .new_config_read( - &kafka_topic.name, - trace_collector.as_ref(), - &write_buffer_cfg, - ) + let write_buffer = config + .write_buffer_config + .reading(Arc::clone(&metric_registry), trace_collector.clone()) .await?; let ingest_handler = Arc::new( diff --git a/influxdb_iox/src/commands/run/router2.rs b/influxdb_iox/src/commands/run/router2.rs index 05d282b668..be43edf92a 100644 --- a/influxdb_iox/src/commands/run/router2.rs +++ b/influxdb_iox/src/commands/run/router2.rs @@ -167,7 +167,7 @@ async fn init_write_buffer( let write_buffer = Arc::new( config .write_buffer_config - .init_write_buffer(metrics, trace_collector) + .writing(metrics, trace_collector) .await?, ); diff --git a/write_buffer/src/kafka/config.rs b/write_buffer/src/kafka/config.rs index 61ea8c0c54..f828e4b207 100644 --- a/write_buffer/src/kafka/config.rs +++ b/write_buffer/src/kafka/config.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeMap, time::Duration}; +use std::{collections::BTreeMap, fmt::Display, str::FromStr, time::Duration}; use data_types::write_buffer::WriteBufferCreationConfig; @@ -18,7 +18,7 @@ impl TryFrom<&BTreeMap> for ClientConfig { fn try_from(cfg: &BTreeMap) -> Result { Ok(Self { - max_message_size: cfg.get("max_message_size").map(|s| s.parse()).transpose()?, + max_message_size: parse_key(cfg, "max_message_size")?, }) } } @@ -46,18 +46,8 @@ impl TryFrom<&WriteBufferCreationConfig> for TopicCreationConfig { fn try_from(cfg: &WriteBufferCreationConfig) -> Result { Ok(Self { num_partitions: i32::try_from(cfg.n_sequencers.get())?, - replication_factor: cfg - .options - .get("replication_factor") - .map(|s| s.parse()) - .transpose()? - .unwrap_or(1), - timeout_ms: cfg - .options - .get("timeout_ms") - .map(|s| s.parse()) - .transpose()? 
- .unwrap_or(5_000), + replication_factor: parse_key(&cfg.options, "replication_factor")?.unwrap_or(1), + timeout_ms: parse_key(&cfg.options, "timeout_ms")?.unwrap_or(5_000), }) } } @@ -86,18 +76,9 @@ impl TryFrom<&BTreeMap> for ConsumerConfig { fn try_from(cfg: &BTreeMap) -> Result { Ok(Self { - max_wait_ms: cfg - .get("consumer_max_wait_ms") - .map(|s| s.parse()) - .transpose()?, - min_batch_size: cfg - .get("consumer_min_batch_size") - .map(|s| s.parse()) - .transpose()?, - max_batch_size: cfg - .get("consumer_max_batch_size") - .map(|s| s.parse()) - .transpose()?, + max_wait_ms: parse_key(cfg, "consumer_max_wait_ms")?, + min_batch_size: parse_key(cfg, "consumer_min_batch_size")?, + max_batch_size: parse_key(cfg, "consumer_max_batch_size")?, }) } } @@ -120,22 +101,29 @@ impl TryFrom<&BTreeMap> for ProducerConfig { type Error = WriteBufferError; fn try_from(cfg: &BTreeMap) -> Result { - let linger_ms: Option = cfg - .get("producer_linger_ms") - .map(|s| s.parse()) - .transpose()?; + let linger_ms: Option = parse_key(cfg, "producer_linger_ms")?; Ok(Self { linger: linger_ms.map(Duration::from_millis), - max_batch_size: cfg - .get("producer_max_batch_size") - .map(|s| s.parse()) - .transpose()? - .unwrap_or(100 * 1024), + max_batch_size: parse_key(cfg, "producer_max_batch_size")?.unwrap_or(100 * 1024), }) } } +fn parse_key(cfg: &BTreeMap, key: &str) -> Result, WriteBufferError> +where + T: FromStr, + T::Err: Display, +{ + if let Some(s) = cfg.get(key) { + s.parse() + .map(Some) + .map_err(|e| format!("Cannot parse `{key}` from '{s}': {e}").into()) + } else { + Ok(None) + } +} + #[cfg(test)] mod tests { use std::{collections::BTreeMap, num::NonZeroU32}; @@ -164,6 +152,19 @@ mod tests { assert_eq!(actual, expected); } + #[test] + fn test_client_config_error() { + let err = ClientConfig::try_from(&BTreeMap::from([( + String::from("max_message_size"), + String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `max_message_size` from 'xyz': invalid digit found in string" + ); + } + #[test] fn test_topic_creation_config_default() { let actual = TopicCreationConfig::try_from(&WriteBufferCreationConfig { @@ -198,6 +199,29 @@ mod tests { assert_eq!(actual, expected); } + #[test] + fn test_topic_creation_config_err() { + let err = TopicCreationConfig::try_from(&WriteBufferCreationConfig { + n_sequencers: NonZeroU32::new(2).unwrap(), + options: BTreeMap::from([(String::from("replication_factor"), String::from("xyz"))]), + }) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `replication_factor` from 'xyz': invalid digit found in string" + ); + + let err = TopicCreationConfig::try_from(&WriteBufferCreationConfig { + n_sequencers: NonZeroU32::new(2).unwrap(), + options: BTreeMap::from([(String::from("timeout_ms"), String::from("xyz"))]), + }) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `timeout_ms` from 'xyz': invalid digit found in string" + ); + } + #[test] fn test_consumer_config_default() { let actual = ConsumerConfig::try_from(&BTreeMap::default()).unwrap(); @@ -226,6 +250,39 @@ mod tests { assert_eq!(actual, expected); } + #[test] + fn test_consumer_config_err() { + let err = ConsumerConfig::try_from(&BTreeMap::from([( + String::from("consumer_max_wait_ms"), + String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `consumer_max_wait_ms` from 'xyz': invalid digit found in string" + ); + + let err = ConsumerConfig::try_from(&BTreeMap::from([( + String::from("consumer_min_batch_size"), + 
String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `consumer_min_batch_size` from 'xyz': invalid digit found in string" + ); + + let err = ConsumerConfig::try_from(&BTreeMap::from([( + String::from("consumer_max_batch_size"), + String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `consumer_max_batch_size` from 'xyz': invalid digit found in string" + ); + } + #[test] fn test_producer_config_default() { let actual = ProducerConfig::try_from(&BTreeMap::default()).unwrap(); @@ -253,4 +310,27 @@ mod tests { }; assert_eq!(actual, expected); } + + #[test] + fn test_producer_config_err() { + let err = ProducerConfig::try_from(&BTreeMap::from([( + String::from("producer_linger_ms"), + String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `producer_linger_ms` from 'xyz': invalid digit found in string" + ); + + let err = ProducerConfig::try_from(&BTreeMap::from([( + String::from("producer_max_batch_size"), + String::from("xyz"), + )])) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Cannot parse `producer_max_batch_size` from 'xyz': invalid digit found in string" + ); + } } From 245676ff5d0c210d9e580c76f35b0d50be0dafb8 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Mon, 7 Feb 2022 15:57:25 -0500 Subject: [PATCH 14/30] feat: add method to catalog to get its partition info by id (#3650) This is a little bit specific for how things are structured in IngesterData right now. Easy enough to take back out later if/when we restructure. --- iox_catalog/src/interface.rs | 28 ++++++++++++++++++++++- iox_catalog/src/mem.rs | 44 ++++++++++++++++++++++++++++++++---- iox_catalog/src/postgres.rs | 40 ++++++++++++++++++++++++++++++-- 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 2ca9d3e4a0..16bb28913b 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -446,6 +446,21 @@ pub trait PartitionRepo: Send + Sync { /// return partitions for a given sequencer async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result>; + + /// return the partition record, the namespace name it belongs to, and the table name it is under + async fn partition_info_by_id( + &self, + partition_id: PartitionId, + ) -> Result>; +} + +/// Information for a partition from the catalog. 
+#[derive(Debug)] +#[allow(missing_docs)] +pub struct PartitionInfo { + pub partition: Partition, + pub namespace_name: String, + pub table_name: String, } /// Functions for working with tombstones in the catalog @@ -1217,7 +1232,7 @@ pub(crate) mod test_helpers { }) .collect::>() .await; - let _ = catalog + let other_partition = catalog .partitions() .create_or_get("asdf", other_sequencer.id, table.id) .await @@ -1234,6 +1249,17 @@ pub(crate) mod test_helpers { .collect::>(); assert_eq!(created, listed); + + // test get_partition_info_by_id + let info = catalog + .partitions() + .partition_info_by_id(other_partition.id) + .await + .unwrap() + .unwrap(); + assert_eq!(info.partition, other_partition); + assert_eq!(info.table_name, "test_table"); + assert_eq!(info.namespace_name, "namespace_partition_test"); } async fn test_tombstone(catalog: Arc) { diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs index 94ea5602c6..58852cdc19 100644 --- a/iox_catalog/src/mem.rs +++ b/iox_catalog/src/mem.rs @@ -4,10 +4,10 @@ use crate::interface::{ Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, - ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionRepo, ProcessedTombstone, - ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, SequenceNumber, - Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, Tombstone, - TombstoneId, TombstoneRepo, + ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo, + ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, + SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, + Tombstone, TombstoneId, TombstoneRepo, }; use async_trait::async_trait; use sqlx::{Postgres, Transaction}; @@ -436,6 +436,42 @@ impl PartitionRepo for MemCatalog { .collect(); Ok(partitions) } + + async fn partition_info_by_id( + &self, + partition_id: PartitionId, + ) -> Result> { + let collections = self.collections.lock().expect("mutex poisoned"); + let partition = collections + .partitions + .iter() + .find(|p| p.id == partition_id) + .cloned(); + + if let Some(partition) = partition { + let table = collections + .tables + .iter() + .find(|t| t.id == partition.table_id) + .cloned(); + if let Some(table) = table { + let namespace = collections + .namespaces + .iter() + .find(|n| n.id == table.namespace_id) + .cloned(); + if let Some(namespace) = namespace { + return Ok(Some(PartitionInfo { + namespace_name: namespace.name, + table_name: table.name, + partition, + })); + } + } + } + + Ok(None) + } } #[async_trait] diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 990e7d604e..078bdd8f8f 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -3,14 +3,16 @@ use crate::interface::{ Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, ParquetFileId, - ParquetFileRepo, Partition, PartitionId, PartitionRepo, ProcessedTombstone, + ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo, ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, Tombstone, TombstoneId, TombstoneRepo, }; use async_trait::async_trait; use 
observability_deps::tracing::{info, warn}; -use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Transaction}; +use sqlx::{ + migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Row, Transaction, +}; use std::time::Duration; use uuid::Uuid; @@ -509,6 +511,40 @@ impl PartitionRepo for PostgresCatalog { .await .map_err(|e| Error::SqlxError { source: e }) } + + async fn partition_info_by_id( + &self, + partition_id: PartitionId, + ) -> Result> { + let info = sqlx::query( + r#" + SELECT namespace.name as namespace_name, table_name.name as table_name, partition.id, + partition.sequencer_id, partition.table_id, partition.partition_key + FROM partition + INNER JOIN table_name on table_name.id = partition.table_id + INNER JOIN namespace on namespace.id = table_name.namespace_id + WHERE partition.id = $1;"#, + ) + .bind(&partition_id) // $1 + .fetch_one(&self.pool) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + let namespace_name = info.get("namespace_name"); + let table_name = info.get("table_name"); + let partition = Partition { + id: info.get("id"), + sequencer_id: info.get("sequencer_id"), + table_id: info.get("table_id"), + partition_key: info.get("partition_key"), + }; + + Ok(Some(PartitionInfo { + namespace_name, + table_name, + partition, + })) + } } #[async_trait] From 4e0b7a20fa1d4d14896d98a2c0126d9cfb142d1a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 7 Feb 2022 21:07:43 +0000 Subject: [PATCH 15/30] feat: add timeouts to write chunk (#3662) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/lifecycle/error.rs | 3 +++ db/src/lifecycle/write.rs | 46 +++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/db/src/lifecycle/error.rs b/db/src/lifecycle/error.rs index 1063dbaf80..3010cbb044 100644 --- a/db/src/lifecycle/error.rs +++ b/db/src/lifecycle/error.rs @@ -30,6 +30,9 @@ pub enum Error { #[snafu(context(false))] Aborted { source: futures::future::Aborted }, + #[snafu(context(false))] + Timeout { source: tokio::time::error::Elapsed }, + #[snafu(display("Read Buffer Error in chunk {}{} : {}", chunk_id, table_name, source))] ReadBufferChunkError { source: read_buffer::Error, diff --git a/db/src/lifecycle/write.rs b/db/src/lifecycle/write.rs index 82360f6f78..b36da80005 100644 --- a/db/src/lifecycle/write.rs +++ b/db/src/lifecycle/write.rs @@ -14,6 +14,7 @@ use crate::{ DbChunk, }; use ::lifecycle::LifecycleWriteGuard; +use data_types::error::ErrorLogger; use data_types::{chunk_metadata::ChunkLifecycleAction, job::Job}; use observability_deps::tracing::{debug, warn}; use parquet_catalog::interface::CatalogParquetInfo; @@ -29,9 +30,13 @@ use persistence_windows::{ use query::QueryChunk; use schema::selection::Selection; use snafu::ResultExt; +use std::time::Duration; use std::{future::Future, sync::Arc}; +use tokio::time::timeout; use tracker::{TaskTracker, TrackedFuture, TrackedFutureExt}; +const TIMEOUT: Duration = Duration::from_secs(300); + /// The implementation for writing a chunk to the object store /// /// `flush_handle` describes both what to persist and also acts as a transaction @@ -111,7 +116,9 @@ pub(super) fn write_chunk_to_object_store( // catalog-level transaction for preservation layer { // fetch shared (= read) guard preventing the cleanup job from deleting our files - let _guard = db.cleanup_lock.read().await; + let _guard = timeout(TIMEOUT, db.cleanup_lock.read()) 
+ .await + .log_if_error("write chunk cleanup lock")?; // Write this table data into the object store // @@ -128,10 +135,14 @@ pub(super) fn write_chunk_to_object_store( time_of_last_write, chunk_order, }; - let written_result = storage - .write_to_object_store(addr.clone(), stream, metadata) - .await - .context(WritingToObjectStoreSnafu)?; + + let written_result = timeout( + TIMEOUT, + storage.write_to_object_store(addr.clone(), stream, metadata), + ) + .await + .log_if_error("write chunk to object store")? + .context(WritingToObjectStoreSnafu)?; // the stream was empty if written_result.is_none() { @@ -160,13 +171,17 @@ pub(super) fn write_chunk_to_object_store( // // This ensures that any deletes encountered during or prior to the replay window // must have been made durable within the catalog for any persisted chunks - let delete_handle = db.delete_predicates_mailbox.consume().await; + let delete_handle = timeout(TIMEOUT, db.delete_predicates_mailbox.consume()) + .await + .log_if_error("delete handle")?; // IMPORTANT: Start transaction AFTER writing the actual parquet file so we do not hold // the transaction lock (that is part of the PreservedCatalog) for too long. // By using the cleanup lock (see above) it is ensured that the file that we // have written is not deleted in between. - let mut transaction = db.preserved_catalog.open_transaction().await; + let mut transaction = timeout(TIMEOUT, db.preserved_catalog.open_transaction()) + .await + .log_if_error("preserved catalog transaction")?; // add parquet file let info = CatalogParquetInfo { @@ -194,7 +209,10 @@ pub(super) fn write_chunk_to_object_store( } // preserved commit - let ckpt_handle = transaction.commit().await.context(CommitSnafu)?; + let ckpt_handle = timeout(TIMEOUT, transaction.commit()) + .await + .log_if_error("preserved catalog commit")? + .context(CommitSnafu)?; // Deletes persisted correctly delete_handle.flush(); @@ -216,10 +234,14 @@ pub(super) fn write_chunk_to_object_store( // NOTE: There can only be a single transaction in this section because the checkpoint handle holds // transaction lock. Therefore we don't need to worry about concurrent modifications of // preserved chunks. - if let Err(e) = ckpt_handle - .create_checkpoint(checkpoint_data_from_catalog(&db.catalog)) - .await - { + let checkpoint_result = timeout( + TIMEOUT, + ckpt_handle.create_checkpoint(checkpoint_data_from_catalog(&db.catalog)), + ) + .await + .log_if_error("create checkpoint")?; + + if let Err(e) = checkpoint_result { warn!(%e, "cannot create catalog checkpoint"); // That's somewhat OK. 
Don't fail the entire task, because the actual preservation was completed From 38a889ecf640b01e2af0f06c731b24274509c569 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 7 Feb 2022 21:15:34 +0000 Subject: [PATCH 16/30] refactor: remove unnecessary struct --- influxdb_iox/src/commands/storage.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index c3ed69135a..19e3f14215 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -110,12 +110,11 @@ fn parse_db_name(db_name: &str) -> Result { #[derive(Debug, clap::Parser)] enum Command { /// Issue a read_filter request - ReadFilter(ReadFilter), + ReadFilter, } -/// Create a new database -#[derive(Debug, clap::Parser)] -struct ReadFilter {} +// #[derive(Debug, clap::Parser)] +// struct ReadFilter {} /// Create and issue read request pub async fn command(connection: Connection, config: Config) -> Result<()> { @@ -126,7 +125,7 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { let source = Client::read_source(&config.db_name, 0); let result = match config.command { - Command::ReadFilter(_) => { + Command::ReadFilter => { client .read_filter(request::read_filter( source, From eb733042ca9682cbf21cb612c1ea024ecece2e0e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 7 Feb 2022 21:51:47 +0000 Subject: [PATCH 17/30] feat: add support for tag_values cli --- influxdb_iox/src/commands/storage.rs | 30 ++++++++++++----- influxdb_iox/src/commands/storage/request.rs | 16 ++++++++- influxdb_iox/src/commands/storage/response.rs | 33 +++++++++++++++---- 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index 19e3f14215..2169997199 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -109,12 +109,14 @@ fn parse_db_name(db_name: &str) -> Result { /// All possible subcommands for storage #[derive(Debug, clap::Parser)] enum Command { - /// Issue a read_filter request ReadFilter, + TagValues(TagValues), } -// #[derive(Debug, clap::Parser)] -// struct ReadFilter {} +#[derive(Debug, clap::Parser)] +struct TagValues { + tag_key: String, +} /// Create and issue read request pub async fn command(connection: Connection, config: Config) -> Result<()> { @@ -124,9 +126,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { let predicate = config.predicate.root.is_some().then(|| config.predicate); let source = Client::read_source(&config.db_name, 0); - let result = match config.command { + match config.command { Command::ReadFilter => { - client + let result = client .read_filter(request::read_filter( source, config.start, @@ -134,11 +136,23 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { predicate, )) .await + .context(ServerSnafu)?; + response::pretty_print_frames(&result).context(ResponseSnafu) + } + Command::TagValues(tv) => { + let result = client + .tag_values(request::tag_values( + source, + config.start, + config.stop, + predicate, + tv.tag_key, + )) + .await + .context(ServerSnafu)?; + response::pretty_print_strings(result).context(ResponseSnafu) } } - .context(ServerSnafu)?; - - response::pretty_print(&result).context(ResponseSnafu) } #[cfg(test)] diff --git a/influxdb_iox/src/commands/storage/request.rs b/influxdb_iox/src/commands/storage/request.rs index f0051b11e5..4dc9f09232 100644 --- 
a/influxdb_iox/src/commands/storage/request.rs +++ b/influxdb_iox/src/commands/storage/request.rs @@ -20,12 +20,26 @@ pub fn read_filter( } } +pub fn tag_values( + org_bucket: Any, + start: i64, + stop: i64, + predicate: std::option::Option, + tag_key: String, +) -> TagValuesRequest { + generated_types::TagValuesRequest { + predicate, + tags_source: Some(org_bucket), + range: Some(TimestampRange { start, end: stop }), + tag_key: tag_key.into(), + } +} + // TODO Add the following helpers for building requests: // // * read_group // * read_window_aggregate // * tag_keys -// * tag_values // * tag_values_with_measurement_and_key // * measurement_names // * measurement_tag_keys diff --git a/influxdb_iox/src/commands/storage/response.rs b/influxdb_iox/src/commands/storage/response.rs index 9f3c4a6463..15dbcd8003 100644 --- a/influxdb_iox/src/commands/storage/response.rs +++ b/influxdb_iox/src/commands/storage/response.rs @@ -39,20 +39,41 @@ pub type Result = std::result::Result; // Prints the provided data frames in a tabular format grouped into tables per // distinct measurement. -pub fn pretty_print(frames: &[Data]) -> Result<()> { - let rbs = into_record_batches(frames)?; +pub fn pretty_print_frames(frames: &[Data]) -> Result<()> { + let rbs = frames_to_record_batches(frames)?; for (k, rb) in rbs { - println!("_measurement: {}", k); + println!("\n_measurement: {}", k); println!("rows: {:?}", &rb.num_rows()); print_batches(&[rb]).context(ArrowSnafu)?; - println!("\n\n"); + println!("\n"); } Ok(()) } +// Prints the provided set of strings in a tabular format grouped. +pub fn pretty_print_strings(values: Vec) -> Result<()> { + let schema = SchemaBuilder::new() + .influx_field("values", InfluxFieldType::String) + .build() + .context(SchemaBuildingSnafu)?; + + let arrow_schema: arrow::datatypes::SchemaRef = schema.into(); + let rb_columns: Vec> = + vec![Arc::new(arrow::array::StringArray::from( + values.iter().map(|x| Some(x.as_str())).collect::>(), + ))]; + + let rb = RecordBatch::try_new(arrow_schema, rb_columns).context(ArrowSnafu)?; + + println!("\ntag values: {:?}", &rb.num_rows()); + print_batches(&[rb]).context(ArrowSnafu)?; + println!("\n"); + Ok(()) +} + // This function takes a set of InfluxRPC data frames and converts them into an // Arrow record batches, which are suitable for pretty printing. -fn into_record_batches(frames: &[Data]) -> Result> { +fn frames_to_record_batches(frames: &[Data]) -> Result> { // Run through all the frames once to build the schema of each table we need // to build as a record batch. 
let mut table_column_mapping = determine_tag_columns(frames); @@ -728,7 +749,7 @@ mod test_super { fn test_into_record_batches() { let frames = gen_frames(); - let rbs = into_record_batches(&frames); + let rbs = frames_to_record_batches(&frames); let exp = vec![ ( "another table", From 6c10e1e901e56e1c4f6ddc031e425135451b616e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 7 Feb 2022 22:01:35 +0000 Subject: [PATCH 18/30] feat: support _measurement/_field tag keys --- influxdb_iox/src/commands/storage.rs | 1 + influxdb_iox/src/commands/storage/request.rs | 13 ++++++++++++- influxdb_iox/src/commands/storage/response.rs | 12 ++++++------ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/influxdb_iox/src/commands/storage.rs b/influxdb_iox/src/commands/storage.rs index 2169997199..5dddb2efd0 100644 --- a/influxdb_iox/src/commands/storage.rs +++ b/influxdb_iox/src/commands/storage.rs @@ -115,6 +115,7 @@ enum Command { #[derive(Debug, clap::Parser)] struct TagValues { + // The tag key value to interrogate for tag values. tag_key: String, } diff --git a/influxdb_iox/src/commands/storage/request.rs b/influxdb_iox/src/commands/storage/request.rs index 4dc9f09232..4a994f44bc 100644 --- a/influxdb_iox/src/commands/storage/request.rs +++ b/influxdb_iox/src/commands/storage/request.rs @@ -3,6 +3,9 @@ pub mod generated_types { } use self::generated_types::*; +use super::response::{ + tag_key_is_field, tag_key_is_measurement, FIELD_TAG_KEY_BIN, MEASUREMENT_TAG_KEY_BIN, +}; use ::generated_types::google::protobuf::*; pub fn read_filter( @@ -27,11 +30,19 @@ pub fn tag_values( predicate: std::option::Option, tag_key: String, ) -> TagValuesRequest { + let tag_key = if tag_key_is_measurement(tag_key.as_bytes()) { + MEASUREMENT_TAG_KEY_BIN.to_vec() + } else if tag_key_is_field(tag_key.as_bytes()) { + FIELD_TAG_KEY_BIN.to_vec() + } else { + tag_key.as_bytes().to_vec() + }; + generated_types::TagValuesRequest { predicate, tags_source: Some(org_bucket), range: Some(TimestampRange { start, end: stop }), - tag_key: tag_key.into(), + tag_key, } } diff --git a/influxdb_iox/src/commands/storage/response.rs b/influxdb_iox/src/commands/storage/response.rs index 15dbcd8003..2e032b5bc7 100644 --- a/influxdb_iox/src/commands/storage/response.rs +++ b/influxdb_iox/src/commands/storage/response.rs @@ -483,9 +483,9 @@ impl TryFrom for RecordBatch { const MEASUREMENT_TAG_KEY_TEXT: [u8; 12] = [ b'_', b'm', b'e', b'a', b's', b'u', b'r', b'e', b'm', b'e', b'n', b't', ]; -const MEASUREMENT_TAG_KEY_BIN: [u8; 1] = [0_u8]; +pub(crate) const MEASUREMENT_TAG_KEY_BIN: [u8; 1] = [0_u8]; const FIELD_TAG_KEY_TEXT: [u8; 6] = [b'_', b'f', b'i', b'e', b'l', b'd']; -const FIELD_TAG_KEY: [u8; 1] = [255_u8]; +pub(crate) const FIELD_TAG_KEY_BIN: [u8; 1] = [255_u8]; // Store a collection of column names and types for a single table (measurement). 
#[derive(Debug, Default, PartialEq, Eq)] @@ -534,12 +534,12 @@ fn field_name(frame: &SeriesFrame) -> &Vec { &frame.tags[idx].value } -fn tag_key_is_measurement(key: &[u8]) -> bool { +pub(crate) fn tag_key_is_measurement(key: &[u8]) -> bool { (key == MEASUREMENT_TAG_KEY_TEXT) || (key == MEASUREMENT_TAG_KEY_BIN) } -fn tag_key_is_field(key: &[u8]) -> bool { - (key == FIELD_TAG_KEY_TEXT) || (key == FIELD_TAG_KEY) +pub(crate) fn tag_key_is_field(key: &[u8]) -> bool { + (key == FIELD_TAG_KEY_TEXT) || (key == FIELD_TAG_KEY_BIN) } #[cfg(test)] @@ -746,7 +746,7 @@ mod test_super { } #[test] - fn test_into_record_batches() { + fn test_frames_to_into_record_batches() { let frames = gen_frames(); let rbs = frames_to_record_batches(&frames); From be662ec7318e83fc54f8a6e942a3dc8abbc84bc4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Feb 2022 13:07:28 +0000 Subject: [PATCH 19/30] feat: lazy query log! (#3654) * feat: lazy query log * chore: fmt * chore: review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/access.rs | 6 ++--- db/src/lib.rs | 4 +-- db/src/query_log.rs | 27 ++++++++++++------- db/src/system_tables/queries.rs | 8 +++--- .../server_type/database/http.rs | 2 +- .../server_type/database/rpc/flight.rs | 2 +- .../database/rpc/storage/service.rs | 26 +++++++++--------- query/src/lib.rs | 7 ++++- query/src/test.rs | 4 +-- 9 files changed, 48 insertions(+), 38 deletions(-) diff --git a/db/src/access.rs b/db/src/access.rs index b3d9519a22..91ef0200bf 100644 --- a/db/src/access.rs +++ b/db/src/access.rs @@ -18,7 +18,7 @@ use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; use query::{ provider::{ChunkPruner, ProviderBuilder}, pruning::{prune_chunks, PruningObserver}, - QueryChunkMeta, QueryCompletedToken, QueryDatabase, DEFAULT_SCHEMA, + QueryChunkMeta, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA, }; use schema::Schema; use std::time::Instant; @@ -27,7 +27,7 @@ use system_tables::{SystemSchemaProvider, SYSTEM_SCHEMA}; use time::TimeProvider; /// The number of entries to store in the circular query buffer log -const QUERY_LOG_SIZE: usize = 100; +const QUERY_LOG_SIZE: usize = 10_000; /// Metrics related to chunk access (pruning specifically) #[derive(Debug)] @@ -290,7 +290,7 @@ impl QueryDatabase for QueryCatalogAccess { fn record_query( &self, query_type: impl Into, - query_text: impl Into, + query_text: QueryText, ) -> QueryCompletedToken<'_> { // When the query token is dropped the query entry's completion time // will be set. 
diff --git a/db/src/lib.rs b/db/src/lib.rs index 15f82d5899..3e3a6e59e1 100644 --- a/db/src/lib.rs +++ b/db/src/lib.rs @@ -45,7 +45,7 @@ use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::Persisten use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; use query::{ exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext}, - QueryCompletedToken, QueryDatabase, + QueryCompletedToken, QueryDatabase, QueryText, }; use rand_distr::{Distribution, Poisson}; use schema::selection::Selection; @@ -1230,7 +1230,7 @@ impl QueryDatabase for Db { fn record_query( &self, query_type: impl Into, - query_text: impl Into, + query_text: QueryText, ) -> QueryCompletedToken<'_> { self.catalog_access.record_query(query_type, query_text) } diff --git a/db/src/query_log.rs b/db/src/query_log.rs index 917b63a292..8d1d55fb66 100644 --- a/db/src/query_log.rs +++ b/db/src/query_log.rs @@ -7,19 +7,19 @@ use std::{ }; use parking_lot::Mutex; +use query::QueryText; use time::{Time, TimeProvider}; // The query duration used for queries still running. const UNCOMPLETED_DURATION: i64 = -1; /// Information about a single query that was executed -#[derive(Debug)] pub struct QueryLogEntry { /// The type of query pub query_type: String, /// The text of the query (SQL for sql queries, pbjson for storage rpc queries) - pub query_text: String, + pub query_text: QueryText, /// Time at which the query was run pub issue_time: Time, @@ -29,9 +29,20 @@ pub struct QueryLogEntry { query_completed_duration: atomic::AtomicI64, } +impl std::fmt::Debug for QueryLogEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLogEntry") + .field("query_type", &self.query_type) + .field("query_text", &self.query_text.to_string()) + .field("issue_time", &self.issue_time) + .field("query_completed_duration", &self.query_completed_duration) + .finish() + } +} + impl QueryLogEntry { /// Creates a new QueryLogEntry -- use `QueryLog::push` to add new entries to the log - fn new(query_type: String, query_text: String, issue_time: Time) -> Self { + fn new(query_type: String, query_text: QueryText, issue_time: Time) -> Self { Self { query_type, query_text, @@ -77,14 +88,10 @@ impl QueryLog { } } - pub fn push( - &self, - query_type: impl Into, - query_text: impl Into, - ) -> Arc { + pub fn push(&self, query_type: impl Into, query_text: QueryText) -> Arc { let entry = Arc::new(QueryLogEntry::new( query_type.into(), - query_text.into(), + query_text, self.time_provider.now(), )); @@ -126,7 +133,7 @@ mod test_super { let entry = Arc::new(QueryLogEntry::new( "sql".into(), - "SELECT 1".into(), + Box::new("SELECT 1"), time_provider.now(), )); // query has not completed diff --git a/db/src/system_tables/queries.rs b/db/src/system_tables/queries.rs index a4c18f1fd4..14428bf9f5 100644 --- a/db/src/system_tables/queries.rs +++ b/db/src/system_tables/queries.rs @@ -72,7 +72,7 @@ fn from_query_log_entries( let query_text = entries .iter() - .map(|e| Some(&e.query_text)) + .map(|e| Some(e.query_text.to_string())) .collect::(); let query_runtime = entries @@ -102,10 +102,10 @@ mod tests { let now = Time::from_rfc3339("1996-12-19T16:39:57+00:00").unwrap(); let time_provider = Arc::new(time::MockProvider::new(now)); let query_log = QueryLog::new(10, Arc::clone(&time_provider) as Arc); - query_log.push("sql", "select * from foo"); + query_log.push("sql", Box::new("select * from foo")); time_provider.inc(std::time::Duration::from_secs(24 * 60 * 60)); - query_log.push("sql", "select * 
from bar"); - let read_filter_entry = query_log.push("read_filter", "json goop"); + query_log.push("sql", Box::new("select * from bar")); + let read_filter_entry = query_log.push("read_filter", Box::new("json goop")); let expected = vec![ "+----------------------+-------------+-------------------+--------------------+", diff --git a/influxdb_iox/src/influxdb_ioxd/server_type/database/http.rs b/influxdb_iox/src/influxdb_ioxd/server_type/database/http.rs index 614e93df51..b7a423a4aa 100644 --- a/influxdb_iox/src/influxdb_ioxd/server_type/database/http.rs +++ b/influxdb_iox/src/influxdb_ioxd/server_type/database/http.rs @@ -252,7 +252,7 @@ async fn query( let db = server.db(&db_name)?; - let _query_completed_token = db.record_query("sql", &q); + let _query_completed_token = db.record_query("sql", Box::new(q.clone())); let ctx = db.new_query_context(req.extensions().get().cloned()); let physical_plan = Planner::new(&ctx).sql(&q).await.context(PlanningSnafu)?; diff --git a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/flight.rs b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/flight.rs index 287da93276..e6a9f528a7 100644 --- a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/flight.rs +++ b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/flight.rs @@ -172,7 +172,7 @@ impl Flight for FlightService { .db(&database) .map_err(default_server_error_handler)?; - let _query_completed_token = db.record_query("sql", &read_info.sql_query); + let _query_completed_token = db.record_query("sql", Box::new(read_info.sql_query.clone())); let ctx = db.new_query_context(span_ctx); diff --git a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs index b1faba9b0a..6d24628e74 100644 --- a/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs +++ b/influxdb_iox/src/influxdb_ioxd/server_type/database/rpc/storage/service.rs @@ -30,7 +30,7 @@ use query::{ fieldlist::FieldList, seriesset::converter::Error as SeriesSetError, ExecutionContextProvider, }, - QueryDatabase, + QueryDatabase, QueryText, }; use server::DatabaseStore; @@ -1303,31 +1303,29 @@ where /// Return something which can be formatted as json ("pbjson" /// specifically) -fn defer_json(s: &S) -> impl Into + '_ +fn defer_json(s: &S) -> QueryText where - S: serde::Serialize, + S: serde::Serialize + Send + Sync + Clone + 'static, { /// Defers conversion into a String - struct DeferredToJson<'a, S> + struct DeferredToJson where S: serde::Serialize, { - s: &'a S, + s: S, } - impl From> for String - where - S: serde::Serialize, - { - fn from(w: DeferredToJson<'_, S>) -> Self { - match serde_json::to_string_pretty(&w.s) { - Ok(json) => json, - Err(e) => e.to_string(), + impl std::fmt::Display for DeferredToJson { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // This buffering is unfortunate but `Formatter` doesn't implement `std::io::Write` + match serde_json::to_string_pretty(&self.s) { + Ok(s) => f.write_str(&s), + Err(e) => write!(f, "error formatting: {}", e), } } } - DeferredToJson { s } + Box::new(DeferredToJson { s: s.clone() }) } #[cfg(test)] diff --git a/query/src/lib.rs b/query/src/lib.rs index ac37c7d758..1192efb4f2 100644 --- a/query/src/lib.rs +++ b/query/src/lib.rs @@ -106,6 +106,11 @@ impl<'a> Drop for QueryCompletedToken<'a> { } } +/// Boxed description of a query that knows how to render to a string +/// +/// This avoids storing potentially large strings +pub type 
QueryText = Box; + /// A `Database` is the main trait implemented by the IOx subsystems /// that store actual data. /// @@ -129,7 +134,7 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { fn record_query( &self, query_type: impl Into, - query_text: impl Into, + query_text: QueryText, ) -> QueryCompletedToken<'_>; } diff --git a/query/src/test.rs b/query/src/test.rs index 7436f5e717..08b8154f39 100644 --- a/query/src/test.rs +++ b/query/src/test.rs @@ -4,11 +4,11 @@ //! AKA it is a Mock use crate::exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext}; -use crate::QueryCompletedToken; use crate::{ exec::stringset::{StringSet, StringSetRef}, Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryDatabase, }; +use crate::{QueryCompletedToken, QueryText}; use arrow::array::UInt64Array; use arrow::{ array::{ArrayRef, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray}, @@ -155,7 +155,7 @@ impl QueryDatabase for TestDatabase { fn record_query( &self, _query_type: impl Into, - _query_text: impl Into, + _query_text: QueryText, ) -> QueryCompletedToken<'_> { QueryCompletedToken::new(|| {}) } From de2a013786477d46be449f88e5493a2774422f0a Mon Sep 17 00:00:00 2001 From: Luke Bond Date: Tue, 8 Feb 2022 13:27:36 +0000 Subject: [PATCH 20/30] feat: gitops adapter (#3656) * feat: scaffolding of gitops adapter bin crate * chore: refactor gitops adapter; calls CLI now; status update fixes * feat: gitops adapter now calls out to CLI once per topic; improved tests * chore: add mock failure script for gitops adapter * chore: update workspace-hack * chore: refactor away unecessary to_string in gitops syncer Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 338 ++++++++++- Cargo.toml | 1 + gitops_adapter/Cargo.toml | 44 ++ gitops_adapter/build.rs | 25 + gitops_adapter/src/kafka_topic_list/api.rs | 49 ++ .../src/kafka_topic_list/mock_api.rs | 129 +++++ gitops_adapter/src/kafka_topic_list/mod.rs | 5 + .../src/kafka_topic_list/resources.rs | 108 ++++ gitops_adapter/src/main.rs | 537 ++++++++++++++++++ gitops_adapter/test/mock-iox-failure.sh | 2 + gitops_adapter/test/mock-iox-multi-topic.sh | 3 + gitops_adapter/test/mock-iox-single-topic.sh | 2 + workspace-hack/Cargo.toml | 10 +- 13 files changed, 1247 insertions(+), 6 deletions(-) create mode 100644 gitops_adapter/Cargo.toml create mode 100644 gitops_adapter/build.rs create mode 100644 gitops_adapter/src/kafka_topic_list/api.rs create mode 100644 gitops_adapter/src/kafka_topic_list/mock_api.rs create mode 100644 gitops_adapter/src/kafka_topic_list/mod.rs create mode 100644 gitops_adapter/src/kafka_topic_list/resources.rs create mode 100644 gitops_adapter/src/main.rs create mode 100755 gitops_adapter/test/mock-iox-failure.sh create mode 100755 gitops_adapter/test/mock-iox-multi-topic.sh create mode 100755 gitops_adapter/test/mock-iox-single-topic.sh diff --git a/Cargo.lock b/Cargo.lock index c65afd84a2..52315d9550 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -841,6 +841,51 @@ dependencies = [ "syn", ] +[[package]] +name = "darling" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" 
+dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dashmap" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" +dependencies = [ + "cfg-if", + "num_cpus", +] + [[package]] name = "data_types" version = "0.1.0" @@ -962,6 +1007,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "diff" version = "0.1.12" @@ -1428,6 +1484,36 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" +[[package]] +name = "gitops_adapter" +version = "0.1.0" +dependencies = [ + "assert_matches", + "async-trait", + "chrono", + "clap 3.0.13", + "dotenv", + "futures", + "glob", + "k8s-openapi", + "kube", + "kube-derive", + "kube-runtime", + "parking_lot 0.11.2", + "pbjson-build", + "prost", + "schemars", + "serde", + "serde_json", + "thiserror", + "tokio", + "tonic", + "tonic-build", + "tracing", + "trogging", + "workspace-hack", +] + [[package]] name = "glob" version = "0.3.0" @@ -1690,6 +1776,12 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.2.3" @@ -2129,6 +2221,28 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-patch" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f995a3c8f2bc3dd52a18a583e90f9ec109c047fa1603a853e46bcda14d2e279d" +dependencies = [ + "serde", + "serde_json", + "treediff", +] + +[[package]] +name = "jsonpath_lib" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaa63191d68230cccb81c5aa23abd53ed64d83337cacbb25a7b8c7979523774f" +dependencies = [ + "log", + "serde", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "7.2.0" @@ -2136,13 +2250,126 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" dependencies = [ "base64 0.12.3", - "pem", + "pem 0.8.3", "ring", "serde", "serde_json", "simple_asn1", ] +[[package]] +name = "k8s-openapi" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f8de9873b904e74b3533f77493731ee26742418077503683db44e1b3c54aa5c" +dependencies = [ + "base64 0.13.0", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kube" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84dcc2f8ca3f2427a72acc31fa9538159f6b33a97002e315a3fcd5323cf51a2b" +dependencies = [ + "k8s-openapi", + "kube-client", + "kube-core", +] + +[[package]] +name = "kube-client" +version = "0.64.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8957106140aa24a76de3f7d005966f381b30a4cd6a9c003b3bba6828e9617535" +dependencies = [ + "base64 0.13.0", + "bytes", + "chrono", + "dirs-next", + "either", + "futures", + "http", + "http-body", + "hyper", + "hyper-timeout", + "hyper-tls", + "jsonpath_lib", + "k8s-openapi", + "kube-core", + "openssl", + "pem 1.0.2", + "pin-project", + "serde", + "serde_json", + "serde_yaml", + "thiserror", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower", + "tower-http", + "tracing", +] + +[[package]] +name = "kube-core" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ec73e7d8e937dd055d962af06e635e262fdb6ed341c36ecf659d4fece0a8005" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "kube-derive" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6651bfae82bc23439da1099174b52bcbf68df065dc33317c912e3c5c5cea43c" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "serde_json", + "syn", +] + +[[package]] +name = "kube-runtime" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b090d3d7b43e2d60fa93ca51b19fe9f2e05a5252c97880fe834f8fa9f2de605" +dependencies = [ + "dashmap", + "derivative", + "futures", + "json-patch", + "k8s-openapi", + "kube-client", + "pin-project", + "serde", + "serde_json", + "smallvec", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2257,6 +2484,12 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "linked-hash-map" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" + [[package]] name = "linux-raw-sys" version = "0.0.37" @@ -3149,6 +3382,15 @@ dependencies = [ "regex", ] +[[package]] +name = "pem" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" +dependencies = [ + "base64 0.13.0", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -4083,6 +4325,30 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "schemars" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6b5a3c80cea1ab61f4260238409510e814e38b4b563c06044edf91e7dc070e3" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41ae4dce13e8614c46ac3c38ef1c0d668b101df6ac39817aebdaa26642ddae9b" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -4137,6 +4403,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.0", + "serde", +] + [[package]] name = "serde-xml-rs" version = "0.4.1" @@ -4170,6 +4446,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.25.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dbab34ca63057a1f15280bdf3c39f2b1eb1b54c17e98360e511637aef7418c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.78" @@ -4203,6 +4490,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a521f2940385c165a24ee286aa8599633d162077a54bdcae2a6fd5a7bfa7a0" +dependencies = [ + "indexmap", + "ryu", + "serde", + "yaml-rust", +] + [[package]] name = "server" version = "0.1.0" @@ -4905,6 +5204,7 @@ dependencies = [ "futures-sink", "log", "pin-project-lite", + "slab", "tokio", ] @@ -5011,6 +5311,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower-http" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81eca72647e58054bbfa41e6f297c23436f1c60aff6e5eb38455a0f9ca420bb5" +dependencies = [ + "base64 0.13.0", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "pin-project", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower-layer" version = "0.3.1" @@ -5171,6 +5489,15 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "treediff" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "761e8d5ad7ce14bb82b7e61ccc0ca961005a275a060b9644a2431aa11553c2ff" +dependencies = [ + "serde_json", +] + [[package]] name = "trogging" version = "0.1.0" @@ -5669,6 +5996,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + [[package]] name = "zeroize" version = "1.5.2" diff --git a/Cargo.toml b/Cargo.toml index fb419424e2..4eaa1c74ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "db", "dml", "generated_types", + "gitops_adapter", "grpc-router", "grpc-router-test-gen", "influxdb_iox", diff --git a/gitops_adapter/Cargo.toml b/gitops_adapter/Cargo.toml new file mode 100644 index 0000000000..e4573ea206 --- /dev/null +++ b/gitops_adapter/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "gitops_adapter" +version = "0.1.0" +authors = ["Luke Bond "] +edition = "2021" + +# Prevent this from being published to crates.io! 
+publish = false + +[[bin]] +name = "iox-gitops-adapter" +path = "src/main.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1" +chrono = "0.4.15" +clap = { version = "3", features = ["derive", "env"] } +dotenv = "0.15" +futures = "0.3" +k8s-openapi = { version = "0.13.1", features = ["v1_17", "schemars"], default-features = false } +kube = "0.64" +kube-derive = { version = "0.64", default-features = false } # only needed to opt out of schema +kube-runtime = "0.64" +prost = "0.9" +schemars = "0.8.3" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +thiserror = "1.0" +tokio = { version = "1.0", features = ["rt-multi-thread", "macros", "parking_lot"] } +tonic = "0.6" +tracing = { version = "0.1", features = ["release_max_level_debug"] } +workspace-hack = { path = "../workspace-hack"} +trogging = { path = "../trogging", default-features = false, features = ["clap"] } + +[build-dependencies] +glob = "0.3.0" +pbjson-build = "0.2" +tonic-build = "0.6" + +[dev-dependencies] +assert_matches = "1.5" +parking_lot = { version = "0.11.1" } diff --git a/gitops_adapter/build.rs b/gitops_adapter/build.rs new file mode 100644 index 0000000000..013fb8f714 --- /dev/null +++ b/gitops_adapter/build.rs @@ -0,0 +1,25 @@ +use std::process::Command; + +fn main() -> Result<(), Box> { + // Populate env!(GIT_HASH) with the current git commit + println!("cargo:rustc-env=GIT_HASH={}", get_git_hash()); + + Ok(()) +} + +fn get_git_hash() -> String { + let out = match std::env::var("VERSION_HASH") { + Ok(v) => v, + Err(_) => { + let output = Command::new("git") + .args(&["describe", "--always", "--dirty", "--abbrev=64"]) + .output() + .expect("failed to execute git rev-parse to read the current git hash"); + + String::from_utf8(output.stdout).expect("non-utf8 found in git hash") + } + }; + + assert!(!out.is_empty(), "attempting to embed empty git hash"); + out +} diff --git a/gitops_adapter/src/kafka_topic_list/api.rs b/gitops_adapter/src/kafka_topic_list/api.rs new file mode 100644 index 0000000000..b257930e9c --- /dev/null +++ b/gitops_adapter/src/kafka_topic_list/api.rs @@ -0,0 +1,49 @@ +use async_trait::async_trait; +use kube::{ + api::{Patch, PatchParams}, + Api, +}; +use serde_json::json; + +use crate::kafka_topic_list::resources::{KafkaTopicList, KafkaTopicListStatus}; + +#[async_trait] +pub trait KafkaTopicListApi: Send + Sync + Clone + 'static { + /// Gets a KafkaTopicList resource by name. + async fn get_kafka_topic_list( + &self, + kafka_topic_list_name: String, + ) -> Result; + + /// Patch status block, if it exists, with the given status. 
+ async fn patch_resource_status( + &self, + kafka_topic_list_name: String, + status: KafkaTopicListStatus, + ) -> Result; +} + +#[async_trait] +impl KafkaTopicListApi for Api { + async fn get_kafka_topic_list( + &self, + kafka_topic_list_name: String, + ) -> Result { + self.get(kafka_topic_list_name.as_str()).await + } + + async fn patch_resource_status( + &self, + kafka_topic_list_name: String, + status: KafkaTopicListStatus, + ) -> Result { + let patch_params = PatchParams::default(); + let s = json!({ "status": status }); + self.patch_status( + kafka_topic_list_name.as_str(), + &patch_params, + &Patch::Merge(&s), + ) + .await + } +} diff --git a/gitops_adapter/src/kafka_topic_list/mock_api.rs b/gitops_adapter/src/kafka_topic_list/mock_api.rs new file mode 100644 index 0000000000..002414c084 --- /dev/null +++ b/gitops_adapter/src/kafka_topic_list/mock_api.rs @@ -0,0 +1,129 @@ +#![allow(missing_docs)] + +use std::sync::{mpsc::SyncSender, Arc}; + +use async_trait::async_trait; +use parking_lot::Mutex; + +use crate::kafka_topic_list::{ + api::KafkaTopicListApi, + resources::{KafkaTopicList, KafkaTopicListStatus}, +}; + +#[derive(Debug, Clone, PartialEq)] +#[allow(clippy::large_enum_variant)] +pub enum MockKafkaTopicListApiCall { + Get(String), + PatchStatus { + kafka_topic_list_name: String, + status: KafkaTopicListStatus, + }, +} + +#[derive(Debug, Default)] +pub struct ClientInner { + /// A channel to push call notifications into as they occur. + pub notify: Option>, + + /// A vector of calls in call order for assertions. + pub calls: Vec, + + // Return values + pub get_ret: Vec>, + pub patch_status_ret: Vec>, +} + +impl ClientInner { + fn record_call(&mut self, c: MockKafkaTopicListApiCall) { + self.calls.push(c.clone()); + if let Some(ref n) = self.notify { + let _ = n.send(c); + } + } +} + +impl From for MockKafkaTopicListApi { + fn from(state: ClientInner) -> Self { + Self { + state: Arc::new(Mutex::new(state)), + } + } +} + +/// Mock helper to record a call and return the pre-configured value. +/// +/// Pushes `$call` to call record, popping `self.$return` and returning it to +/// the caller. If no value exists, the pop attempt causes a panic. +macro_rules! record_and_return { + ($self:ident, $call:expr, $return:ident) => {{ + let mut state = $self.state.lock(); + state.record_call($call); + state.$return.pop().expect("no mock result to return") + }}; +} + +#[derive(Debug, Default)] +pub struct MockKafkaTopicListApi { + pub state: Arc>, +} + +impl MockKafkaTopicListApi { + pub fn with_notify(self, s: SyncSender) -> Self { + self.state.lock().notify = Some(s); + self + } + + pub fn with_get_ret(self, ret: Vec>) -> Self { + self.state.lock().get_ret = ret; + self + } + + pub fn with_patch_status_ret(self, ret: Vec>) -> Self { + self.state.lock().patch_status_ret = ret; + self + } + + pub fn get_calls(&self) -> Vec { + self.state.lock().calls.clone() + } +} + +#[async_trait] +impl KafkaTopicListApi for Arc { + /// Gets a KafkaTopicList resource by name. + async fn get_kafka_topic_list( + &self, + kafka_topic_list_name: String, + ) -> Result { + record_and_return!( + self, + MockKafkaTopicListApiCall::Get(kafka_topic_list_name,), + get_ret + ) + } + + /// Patch status block, if it exists, with the given status. 
+ async fn patch_resource_status( + &self, + kafka_topic_list_name: String, + status: KafkaTopicListStatus, + ) -> Result { + record_and_return!( + self, + MockKafkaTopicListApiCall::PatchStatus { + kafka_topic_list_name, + status, + }, + patch_status_ret + ) + } +} + +/// Cloning a client shares the same mock state across both client instances. +impl Clone for MockKafkaTopicListApi { + fn clone(&self) -> Self { + Self { + state: Arc::clone(&self.state), + } + } +} diff --git a/gitops_adapter/src/kafka_topic_list/mod.rs b/gitops_adapter/src/kafka_topic_list/mod.rs new file mode 100644 index 0000000000..4579c8239f --- /dev/null +++ b/gitops_adapter/src/kafka_topic_list/mod.rs @@ -0,0 +1,5 @@ +pub mod api; +pub mod resources; + +#[cfg(test)] +pub mod mock_api; diff --git a/gitops_adapter/src/kafka_topic_list/resources.rs b/gitops_adapter/src/kafka_topic_list/resources.rs new file mode 100644 index 0000000000..4644a0d002 --- /dev/null +++ b/gitops_adapter/src/kafka_topic_list/resources.rs @@ -0,0 +1,108 @@ +use kube_derive::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] +#[kube( + group = "iox.influxdata.com", + version = "v1alpha1", + kind = "KafkaTopicList", + namespaced, + shortname = "topics" +)] +#[kube(status = "KafkaTopicListStatus")] +#[serde(rename_all = "camelCase")] +pub struct KafkaTopicListSpec { + topics: Vec, +} + +#[derive(Deserialize, Serialize, Clone, Debug, Default, JsonSchema, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct KafkaTopicListStatus { + conditions: Vec, + observed_generation: i64, // type matches that of metadata.generation +} + +#[derive(Deserialize, Serialize, Clone, Debug, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct KafkaTopicListStatusCondition { + type_: String, + status: String, + message: String, + last_transition_time: String, + last_update_time: String, +} + +impl KafkaTopicListSpec { + pub fn new(topics: Vec) -> Self { + Self { topics } + } + + pub fn topics(&self) -> &Vec { + &self.topics + } +} + +impl KafkaTopicListStatus { + pub fn conditions(&self) -> &Vec { + &self.conditions + } + + pub fn conditions_mut(&mut self) -> &mut Vec { + &mut self.conditions + } + + pub fn observed_generation(&self) -> i64 { + self.observed_generation + } + + pub fn set_observed_generation(&mut self, observed_generation: i64) { + self.observed_generation = observed_generation; + } +} + +impl KafkaTopicListStatusCondition { + pub fn new( + type_: String, + status: String, + message: String, + last_transition_time: String, + last_update_time: String, + ) -> Self { + Self { + type_, + status, + message, + last_transition_time, + last_update_time, + } + } + + pub fn type_(&self) -> &String { + &self.type_ + } + + pub fn status(&self) -> &String { + &self.status + } + + pub fn message(&self) -> &String { + &self.message + } + + pub fn last_transition_time(&self) -> &String { + &self.last_transition_time + } + + pub fn last_update_time(&self) -> &String { + &self.last_update_time + } +} + +impl PartialEq for KafkaTopicListStatusCondition { + // just for assertions in tests; too tedious to have to have the items the same + // too + fn eq(&self, other: &Self) -> bool { + self.type_ == other.type_ && self.status == other.status && self.message == other.message + } +} diff --git a/gitops_adapter/src/main.rs b/gitops_adapter/src/main.rs new file mode 100644 index 0000000000..65c63ff0f1 --- /dev/null +++ 
b/gitops_adapter/src/main.rs @@ -0,0 +1,537 @@ +use std::{ + io::ErrorKind, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use chrono::{DateTime, Utc}; +use dotenv::dotenv; +use futures::StreamExt; +use kube::{api::ListParams, Api, Client as K8sClient}; +use kube_runtime::controller::{Context, Controller, ReconcilerAction}; +use std::process::Command as Cmd; +use thiserror::Error; +use tracing::*; +use trogging::{cli::LoggingConfig, LogFormat}; + +use crate::kafka_topic_list::{ + api::KafkaTopicListApi, + resources::{KafkaTopicList, KafkaTopicListStatus, KafkaTopicListStatusCondition}, +}; + +pub mod kafka_topic_list; + +static CONDITION_TYPE_RECONCILED: &str = "Reconciled"; +static CONDITION_STATUS_TRUE: &str = "True"; +static CONDITION_STATUS_FALSE: &str = "False"; + +#[derive(Debug, Error)] +enum CatalogError { + #[error("Malformed KafkaTopicList resource: {message}")] + MalformedKafkaTopicListResource { message: String }, + + #[error("Request to patch status of k8s custom resource failed: {0}")] + PatchStatusError(#[from] kube::Error), + + #[error("Failed to execute iox binary to update catalog: {0}")] + IOxBinaryExecFailed(#[from] std::io::Error), + + #[error("Request to update catalog with topic failed: {stderr}")] + UpdateTopicError { stderr: String }, + + #[error("Failed to parse stdout of catalog update command to ID: {0}")] + TopicIdParseError(#[from] std::num::ParseIntError), +} + +// Config defines the runtime configuration variables settable on the command +// line. +// +// These fields are automatically converted into a [Clap] CLI. +// +// This has an `allow(missing_docs)` annotation as otherwise the comment is +// added to the CLI help text. +// +// [Clap]: https://github.com/clap-rs/clap +#[derive(Debug, clap::Parser)] +#[clap( + name = "iox-gitops-adapter", + about = "Adapter to configure IOx Catalog from Kubernetes Custom Resources", + long_about = r#"Kubernetes controller responsible for synchronising the IOx Catalog to cluster configuration in a Kubernetes Custom Resource. + +Examples: + # Run the gitops adapter server: + iox-gitops-adapter + + # See all configuration options + iox-gitops-adapter --help +"#, + version = concat!(env!("CARGO_PKG_VERSION"), " - ", env!("GIT_HASH")) +)] +#[allow(missing_docs)] +pub struct Config { + /// Configure the log level & filter. + /// + /// Example values: + /// iox_gitops_adapter=debug + #[clap(flatten)] + logging_config: LoggingConfig, + + /// Configure the Kubernetes namespace where custom resources are found. + /// + /// Example values: + /// namespace=conductor + #[clap(long = "--namespace", env = "GITOPS_ADAPTER_NAMESPACE")] + namespace: String, + + /// Configure the Catalog's Postgres DSN. + /// + /// Example values: + /// catalog-dsn=postgres://postgres:postgres@localhost:5432/iox_shared + #[clap(long = "--catalog-dsn", env = "GITOPS_ADAPTER_CATALOG_DSN")] + catalog_dsn: String, + + /// Configure the path to the IOx CLI. + /// + /// Example values: + /// iox-cli=/usr/bin/influxdb_iox + #[clap(long = "--iox-cli", env = "GITOPS_ADAPTER_IOX_CLI")] + iox_cli: String, +} + +#[derive(Debug, clap::Parser)] +enum Command { + Config, +} + +impl Config { + /// Returns the (possibly invalid) log filter string. + pub fn log_filter(&self) -> &Option { + &self.logging_config.log_filter + } + + /// Returns the (possibly invalid) log format string. + pub fn log_format(&self) -> &LogFormat { + &self.logging_config.log_format + } +} + +/// Load the config. 
+/// +/// This pulls in config from the following sources, in order of precedence: +/// +/// - command line arguments +/// - user set environment variables +/// - .env file contents +/// - pre-configured default values +pub fn load_config() -> Result> { + // Source the .env file before initialising the Config struct - this sets + // any envs in the file, which the Config struct then uses. + // + // Precedence is given to existing env variables. + match dotenv() { + Ok(_) => {} + Err(dotenv::Error::Io(err)) if err.kind() == ErrorKind::NotFound => { + // Ignore this - a missing env file is not an error, + // defaults will be applied when initialising the Config struct. + } + Err(e) => return Err(Box::new(e)), + }; + + // Load the Config struct - this pulls in any envs set by the user or + // sourced above, and applies any defaults. + Ok(clap::Parser::parse()) +} + +/// Initialise the tracing subscribers. +fn setup_tracing( + logging_config: &LoggingConfig, + log_env_var: Option, +) -> Result { + let drop_handle = logging_config + .to_builder() + .with_default_log_filter(log_env_var.unwrap_or_else(|| "info".to_string())) + .install_global()?; + + trace!("logging initialised!"); + + Ok(drop_handle) +} + +async fn reconcile_topics( + path_to_iox_binary: &str, + catalog_dsn: &str, + topics: &[String], +) -> Result, CatalogError> { + trace!( + "calling out to {} for topics {:?}", + path_to_iox_binary, + topics + ); + topics + .iter() + .map(|topic| { + match Cmd::new(path_to_iox_binary) + .arg("catalog") + .arg("topic") + .arg("update") + .arg("--catalog-dsn") + .arg(catalog_dsn) + .arg(topic) + .output() + { + Ok(output) => match output.status.success() { + true => { + trace!( + "Updated catalog with kafka topic {}. stdout: {}", + topic, + String::from_utf8_lossy(&output.stdout).trim() + ); + // The CLI returns an ID on success; try to parse it here to ensure it + // worked; not sure that return zero is enough? e.g. --help will return 0. + // also, we'd like to print the IDs out later + String::from_utf8_lossy(&output.stdout) + .trim() + .parse::() + .map_err(CatalogError::TopicIdParseError) + } + false => Err(CatalogError::UpdateTopicError { + stderr: String::from_utf8_lossy(&output.stderr).into(), + }), + }, + Err(e) => Err(CatalogError::IOxBinaryExecFailed(e)), + } + }) + .collect() +} + +/// Controller triggers this whenever our main object or our children changed +async fn reconcile( + topics: KafkaTopicList, + ctx: Context>, +) -> Result +where + T: KafkaTopicListApi, +{ + debug!( + "got a change to the kafka topic list custom resource: {:?}", + topics.spec + ); + let kafka_topic_list_api = ctx.get_ref().kafka_topic_list_api.clone(); + let topics = Arc::new(topics); + + // if CR doesn't contain status field, add it + let mut topics_status = match &topics.status { + Some(status) => status.clone(), + None => KafkaTopicListStatus::default(), + }; + let kafka_topic_list_name = match &topics.metadata.name { + Some(n) => n.clone(), + None => { + return Err(CatalogError::MalformedKafkaTopicListResource { + message: "Missing metadata.name field".to_string(), + }) + } + }; + + // have we seen this update before? + // NOTE: we may find that we'd prefer to do the reconcile anyway, if it's cheap. 
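As the inline comment in reconcile_topics notes, a zero exit status alone is not trusted; success is inferred from the stdout parsing as a numeric topic ID. A small self-contained sketch of that guard (the sample strings are illustrative):

fn parse_topic_id(stdout: &[u8]) -> Result<i64, std::num::ParseIntError> {
    // The `catalog topic update` subcommand prints the topic ID on success,
    // so the trimmed output must parse as an integer.
    String::from_utf8_lossy(stdout).trim().parse::<i64>()
}

#[test]
fn parse_topic_id_examples() {
    assert_eq!(parse_topic_id(b"42\n").unwrap(), 42);
    // Help/usage text also exits with 0 but fails the parse, so it is treated as an error.
    assert!(parse_topic_id(b"Usage: influxdb_iox [OPTIONS]").is_err());
}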
+ // for now this seems okay + let generation = match topics.metadata.generation { + Some(gen) => { + if topics_status.observed_generation() == gen { + info!("Nothing to reconcile; observedGeneration == generation"); + return Ok(ReconcilerAction { + requeue_after: None, + }); + } + gen + } + _ => { + return Err(CatalogError::MalformedKafkaTopicListResource { + message: "Missing metadata.generation field".to_string(), + }) + } + }; + // make a note that we've seen this update + topics_status.set_observed_generation(generation); + + // call out to the iox CLI to update the catalog for each topic name in the list + let reconcile_result = reconcile_topics( + &ctx.get_ref().path_to_iox_binary, + &ctx.get_ref().catalog_dsn, + topics.spec.topics(), + ) + .await; + + // update status subresource based on outcome of reconcile + let now: DateTime = SystemTime::now().into(); + let now_str = now.to_rfc3339(); + let prev_condition = topics_status.conditions().get(0); + let last_transition_time = match prev_condition { + Some(c) if c.status() == CONDITION_STATUS_TRUE => c.last_transition_time().clone(), + _ => now_str.clone(), + }; + let new_status = match &reconcile_result { + Ok(v) => { + debug!( + "Updated catalog with kafka topic list: {:?}. IDs returned: {:?}.", + topics.spec.topics(), + v + ); + KafkaTopicListStatusCondition::new( + CONDITION_TYPE_RECONCILED.to_string(), + CONDITION_STATUS_TRUE.to_string(), + "".to_string(), + last_transition_time, + now_str.clone(), + ) + } + Err(e) => KafkaTopicListStatusCondition::new( + CONDITION_TYPE_RECONCILED.to_string(), + CONDITION_STATUS_FALSE.to_string(), + e.to_string(), + last_transition_time, + now_str.clone(), + ), + }; + if topics_status.conditions().is_empty() { + topics_status.conditions_mut().insert(0, new_status); + } else { + topics_status.conditions_mut()[0] = new_status; + } + + // patch the status field with the updated condition and observed generation + match kafka_topic_list_api + .patch_resource_status(kafka_topic_list_name.clone(), topics_status) + .await + { + Ok(_) => {} + Err(e) => { + // Not great to silently swallow the error here but doesn't feel warranted to requeue + // just because the status wasn't updated + error!("Failed to patch KafkaTopicList status subresource: {}", e); + } + } + + reconcile_result.map(|_| ReconcilerAction { + requeue_after: None, + }) +} + +/// an error handler that will be called when the reconciler fails +fn error_policy(error: &CatalogError, _ctx: Context>) -> ReconcilerAction +where + T: KafkaTopicListApi, +{ + error!(%error, "reconciliation error"); + ReconcilerAction { + // if a sync fails we want to retry- it could simply be in the process of + // doing another redeploy. there may be a deeper problem, in which case it'll keep trying + // and we'll see errors and investigate. 
arbitrary duration chosen ¯\_(ツ)_/¯ + requeue_after: Some(Duration::from_secs(5)), + } +} + +// Data we want access to in error/reconcile calls +struct Data +where + T: KafkaTopicListApi, +{ + path_to_iox_binary: String, + catalog_dsn: String, + kafka_topic_list_api: T, +} + +#[tokio::main] +async fn main() { + let config = load_config().expect("failed to load config"); + let _drop_handle = setup_tracing(&config.logging_config, None).unwrap(); + debug!(?config, "loaded config"); + + info!(git_hash = env!("GIT_HASH"), "starting iox-gitops-adapter"); + + let k8s_client = K8sClient::try_default() + .await + .expect("couldn't create k8s client"); + let topics = Api::::namespaced(k8s_client.clone(), config.namespace.as_str()); + info!("initialised Kubernetes API client"); + + info!("starting IOx GitOps Adapter"); + Controller::new(topics.clone(), ListParams::default()) + .run( + reconcile, + error_policy, + Context::new(Data { + path_to_iox_binary: config.iox_cli.clone(), + catalog_dsn: config.catalog_dsn.clone(), + kafka_topic_list_api: topics, + }), + ) + .for_each(|res| async move { + match res { + Ok(o) => info!("reconciled {:?}", o), + Err(e) => info!("reconcile failed: {:?}", e), + } + }) + .await; // controller does nothing unless polled +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use kafka_topic_list::{ + mock_api::{MockKafkaTopicListApi, MockKafkaTopicListApiCall}, + resources::KafkaTopicListSpec, + }; + + use super::*; + + fn create_topics( + name: &str, + spec: KafkaTopicListSpec, + generation: i64, + status: KafkaTopicListStatus, + ) -> KafkaTopicList { + let mut c = KafkaTopicList::new(name, spec); + c.metadata.generation = Some(generation); + c.status = Some(status); + c + } + + fn create_topics_status( + observed_generation: i64, + reconciled: bool, + message: String, + t: SystemTime, + ) -> KafkaTopicListStatus { + let now: DateTime = t.into(); + let now_str = now.to_rfc3339(); + let mut status = KafkaTopicListStatus::default(); + status + .conditions_mut() + .push(KafkaTopicListStatusCondition::new( + CONDITION_TYPE_RECONCILED.to_string(), + if reconciled { + CONDITION_STATUS_TRUE.to_string() + } else { + CONDITION_STATUS_FALSE.to_string() + }, + message, + now_str.clone(), + now_str, + )); + status.set_observed_generation(observed_generation); + status + } + + #[tokio::test] + async fn test_single_topic_success() { + let now = SystemTime::now(); + let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret( + vec![Ok(create_topics( + "iox", + KafkaTopicListSpec::new(vec!["iox_shared".to_string()]), + 1, + create_topics_status(0, true, "".to_string(), now), + ))], + )); + let data = Data { + path_to_iox_binary: "test/mock-iox-single-topic.sh".to_string(), + catalog_dsn: "unused".to_string(), + kafka_topic_list_api: Arc::clone(&mock_topics_api), + }; + let c = create_topics( + "iox", + KafkaTopicListSpec::new(vec!["iox_shared".to_string()]), + 1, + create_topics_status(0, true, "".to_string(), now), + ); + let result = reconcile(c, Context::new(data)).await; + // whole operation returns a successful result. + assert_matches!(result, Ok(ReconcilerAction { .. })); + // ensure status was updated accordingly. 
+ // alas, we don't have a success patch result either, due to the above + assert_eq!( + mock_topics_api.get_calls(), + vec![MockKafkaTopicListApiCall::PatchStatus { + kafka_topic_list_name: "iox".to_string(), + status: create_topics_status(1, true, "".to_string(), now), + }] + ); + } + + #[tokio::test] + async fn test_multi_topic_success() { + let now = SystemTime::now(); + let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret( + vec![Ok(create_topics( + "iox", + KafkaTopicListSpec::new(vec!["one".to_string(), "two".to_string()]), + 1, + create_topics_status(0, true, "".to_string(), now), + ))], + )); + let data = Data { + path_to_iox_binary: "test/mock-iox-single-topic.sh".to_string(), + catalog_dsn: "unused".to_string(), + kafka_topic_list_api: Arc::clone(&mock_topics_api), + }; + let c = create_topics( + "iox", + KafkaTopicListSpec::new(vec!["one".to_string(), "two".to_string()]), + 1, + create_topics_status(0, true, "".to_string(), now), + ); + let result = reconcile(c, Context::new(data)).await; + // whole operation returns a successful result. + assert_matches!(result, Ok(ReconcilerAction { .. })); + // ensure status was updated accordingly. + assert_eq!( + mock_topics_api.get_calls(), + vec![MockKafkaTopicListApiCall::PatchStatus { + kafka_topic_list_name: "iox".to_string(), + status: create_topics_status(1, true, "".to_string(), now), + }] + ); + } + + #[tokio::test] + async fn test_single_topic_error() { + let now = SystemTime::now(); + let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret( + vec![Ok(create_topics( + "iox", + KafkaTopicListSpec::new(vec!["iox_shared".to_string()]), + 1, + create_topics_status(0, true, "".to_string(), now), + ))], + )); + let data = Data { + path_to_iox_binary: "test/mock-iox-failure.sh".to_string(), + catalog_dsn: "unused".to_string(), + kafka_topic_list_api: Arc::clone(&mock_topics_api), + }; + let c = create_topics( + "iox", + KafkaTopicListSpec::new(vec!["iox_shared".to_string()]), + 1, + create_topics_status(0, false, "".to_string(), now), + ); + let result = reconcile(c, Context::new(data)).await; + // whole operation returns a successful result + assert_matches!(result, Err(CatalogError::UpdateTopicError { .. 
})); + // Ensure status was updated accordingly + assert_eq!( + mock_topics_api.get_calls(), + vec![MockKafkaTopicListApiCall::PatchStatus { + kafka_topic_list_name: "iox".to_string(), + status: create_topics_status( + 1, + false, + "Request to update catalog with topic failed: ".to_string(), + now + ), + }] + ); + } +} diff --git a/gitops_adapter/test/mock-iox-failure.sh b/gitops_adapter/test/mock-iox-failure.sh new file mode 100755 index 0000000000..afe8ade3e2 --- /dev/null +++ b/gitops_adapter/test/mock-iox-failure.sh @@ -0,0 +1,2 @@ +#!/bin/bash +exit 1 diff --git a/gitops_adapter/test/mock-iox-multi-topic.sh b/gitops_adapter/test/mock-iox-multi-topic.sh new file mode 100755 index 0000000000..fefd83ce67 --- /dev/null +++ b/gitops_adapter/test/mock-iox-multi-topic.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo 42 +echo 93 diff --git a/gitops_adapter/test/mock-iox-single-topic.sh b/gitops_adapter/test/mock-iox-single-topic.sh new file mode 100755 index 0000000000..9feaaf9b1a --- /dev/null +++ b/gitops_adapter/test/mock-iox-single-topic.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo 42 diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 6c657dde01..b6c98e5fb8 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -19,7 +19,7 @@ base64 = { version = "0.13", features = ["std"] } bitflags = { version = "1" } byteorder = { version = "1", features = ["std"] } bytes = { version = "1", features = ["std"] } -chrono = { version = "0.4", default-features = false, features = ["alloc", "clock", "libc", "std", "winapi"] } +chrono = { version = "0.4", features = ["alloc", "clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } digest = { version = "0.9", default-features = false, features = ["alloc", "std"] } either = { version = "1", features = ["use_std"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } @@ -52,7 +52,7 @@ sha2 = { version = "0.9", features = ["std"] } smallvec = { version = "1", default-features = false, features = ["union"] } tokio = { version = "1", features = ["bytes", "fs", "full", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "parking_lot", "process", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "sync", "time", "tokio-macros", "winapi"] } tokio-stream = { version = "0.1", features = ["fs", "net", "time"] } -tokio-util = { version = "0.6", features = ["codec", "io"] } +tokio-util = { version = "0.6", features = ["codec", "io", "slab", "time"] } tower = { version = "0.4", features = ["balance", "buffer", "discover", "futures-util", "indexmap", "limit", "load", "log", "make", "rand", "ready-cache", "slab", "timeout", "tokio", "tokio-stream", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "max_level_trace", "release_max_level_debug", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std"] } @@ -66,6 +66,7 @@ base64 = { version = "0.13", features = ["std"] } bitflags = { version = "1" } byteorder = { version = "1", features = ["std"] } bytes = { version = "1", features = ["std"] } +cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] } digest = { version = "0.9", default-features = false, features = ["alloc", "std"] } either = { version = "1", features = ["use_std"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } @@ -98,7 +99,6 @@ uuid = { version = "0.8", features = ["getrandom", "std", 
"v4"] } libc = { version = "0.2", features = ["extra_traits", "std"] } [target.x86_64-unknown-linux-gnu.build-dependencies] -cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] } libc = { version = "0.2", features = ["extra_traits", "std"] } [target.x86_64-apple-darwin.dependencies] @@ -115,10 +115,10 @@ libc = { version = "0.2", features = ["extra_traits", "std"] } [target.x86_64-pc-windows-msvc.dependencies] scopeguard = { version = "1", features = ["use_std"] } -winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] } +winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "profileapi", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] } [target.x86_64-pc-windows-msvc.build-dependencies] scopeguard = { version = "1", features = ["use_std"] } -winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] } +winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "profileapi", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] } ### END HAKARI SECTION From 5de4d6203f68e99e9c03c3cccc7b47d3b32ca636 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 8 Feb 2022 13:38:33 +0000 
Subject: [PATCH 21/30] refactor: catalog transaction (#3660) * refactor: catalog Unit of Work (= transaction) Setup an inteface to handle Units of Work within our catalog. Previously both the Postgres and the in-mem backend used "mini-transactions on demand". Now the caller has a clear way to establish boundaries and gets read and write isolation. A single `Arc` can create as many `Box` as you like, but note that depending on the backend you may not scale infinitely (postgres will likely impose certain limits and the in-mem backend limits concurrency to 1 to keep things simple). * docs: improve wording Co-authored-by: Andrew Lamb * refactor: rename Unit of Work to Transaction * test: improve `test_txn_isolation` * feat: clearify transaction drop semantics Co-authored-by: Andrew Lamb Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 1 + influxdb_iox/src/commands/catalog/topic.rs | 5 +- influxdb_iox/src/commands/run/ingester.rs | 6 +- influxdb_iox/src/commands/run/router2.rs | 7 +- ingester/src/data.rs | 18 +- ingester/src/handler.rs | 25 +- iox_catalog/Cargo.toml | 1 + iox_catalog/src/interface.rs | 563 ++++++++++++------ iox_catalog/src/lib.rs | 92 +-- iox_catalog/src/mem.rs | 438 +++++++------- iox_catalog/src/postgres.rs | 371 ++++++------ router2/src/dml_handlers/ns_autocreation.rs | 26 +- router2/src/dml_handlers/schema_validation.rs | 24 +- 13 files changed, 889 insertions(+), 688 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52315d9550..723bb10d5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2113,6 +2113,7 @@ dependencies = [ "schema", "snafu", "sqlx", + "test_helpers", "tokio", "uuid", "workspace-hack", diff --git a/influxdb_iox/src/commands/catalog/topic.rs b/influxdb_iox/src/commands/catalog/topic.rs index a6e7573554..9ae4886d4f 100644 --- a/influxdb_iox/src/commands/catalog/topic.rs +++ b/influxdb_iox/src/commands/catalog/topic.rs @@ -44,8 +44,9 @@ pub async fn command(config: Config) -> Result<(), Error> { match config.command { Command::Update(update) => { let catalog = update.catalog_dsn.get_catalog("cli").await?; - let topics_repo = catalog.kafka_topics(); - let topic = topics_repo.create_or_get(&update.db_name).await?; + let mut txn = catalog.start_transaction().await?; + let topic = txn.kafka_topics().create_or_get(&update.db_name).await?; + txn.commit().await?; println!("{}", topic.id); Ok(()) } diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index cd207a6949..ce5595b46d 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -100,7 +100,8 @@ pub async fn command(config: Config) -> Result<()> { let catalog = config.catalog_dsn.get_catalog("ingester").await?; - let kafka_topic = catalog + let mut txn = catalog.start_transaction().await?; + let kafka_topic = txn .kafka_topics() .get_by_name(&config.write_buffer_config.topic) .await? @@ -122,13 +123,14 @@ pub async fn command(config: Config) -> Result<()> { let mut sequencers = BTreeMap::new(); for k in kafka_partitions { - let s = catalog + let s = txn .sequencers() .get_by_topic_id_and_partition(kafka_topic.id, k) .await? 
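As a reading aid for the interface described in the commit message, here is a minimal sketch of the caller-side pattern this patch introduces; it mirrors the `topic.rs` change above, and the topic name is illustrative:

use std::sync::Arc;
use iox_catalog::interface::{Catalog, Error, KafkaTopic};

async fn create_or_get_topic(catalog: Arc<dyn Catalog>, name: &str) -> Result<KafkaTopic, Error> {
    // Explicit transaction boundaries replace the previous "mini-transactions on demand".
    let mut txn = catalog.start_transaction().await?;
    let topic = txn.kafka_topics().create_or_get(name).await?;
    txn.commit().await?;
    Ok(topic)
}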
.ok_or(Error::SequencerNotFound(k))?; sequencers.insert(k, s); } + txn.commit().await?; let metric_registry: Arc = Default::default(); let trace_collector = common_state.trace_collector(); diff --git a/influxdb_iox/src/commands/run/router2.rs b/influxdb_iox/src/commands/run/router2.rs index be43edf92a..e2a57eba3b 100644 --- a/influxdb_iox/src/commands/run/router2.rs +++ b/influxdb_iox/src/commands/run/router2.rs @@ -109,7 +109,8 @@ pub async fn command(config: Config) -> Result<()> { // This code / auto-creation is for architecture testing purposes only - a // prod deployment would expect namespaces to be explicitly created and this // layer would be removed. - let topic_id = catalog + let mut txn = catalog.start_transaction().await?; + let topic_id = txn .kafka_topics() .get_by_name(&config.write_buffer_config.topic) .await? @@ -120,7 +121,7 @@ pub async fn command(config: Config) -> Result<()> { &config.write_buffer_config.topic ) }); - let query_id = catalog + let query_id = txn .query_pools() .create_or_get(&config.query_pool_name) .await @@ -131,6 +132,8 @@ pub async fn command(config: Config) -> Result<()> { &config.write_buffer_config.topic, e ) }); + txn.commit().await?; + let handler_stack = NamespaceAutocreation::new( catalog, ns_cache, diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 3b67147ebe..95af014bd7 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -144,12 +144,15 @@ impl SequencerData { namespace: &str, catalog: &dyn Catalog, ) -> Result> { - let namespace = catalog + let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?; + let namespace = txn .namespaces() .get_by_name(namespace) .await .context(CatalogSnafu)? .context(NamespaceNotFoundSnafu { namespace })?; + txn.commit().await.context(CatalogSnafu)?; + let mut n = self.namespaces.write(); let data = Arc::clone( n.entry(namespace.name) @@ -230,11 +233,14 @@ impl NamespaceData { table_name: &str, catalog: &dyn Catalog, ) -> Result> { - let table = catalog + let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?; + let table = txn .tables() .create_or_get(table_name, self.namespace_id) .await .context(CatalogSnafu)?; + txn.commit().await.context(CatalogSnafu)?; + let mut t = self.tables.write(); let data = Arc::clone( t.entry(table.name) @@ -306,7 +312,8 @@ impl TableData { let min_time = Timestamp::new(predicate.range.start()); let max_time = Timestamp::new(predicate.range.end()); - let tombstone = catalog + let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?; + let tombstone = txn .tombstones() .create_or_get( self.table_id, @@ -318,6 +325,7 @@ impl TableData { ) .await .context(CatalogSnafu)?; + txn.commit().await.context(CatalogSnafu)?; let partitions = self.partition_data.read(); for data in partitions.values() { @@ -339,11 +347,13 @@ impl TableData { sequencer_id: SequencerId, catalog: &dyn Catalog, ) -> Result> { - let partition = catalog + let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?; + let partition = txn .partitions() .create_or_get(partition_key, sequencer_id, self.table_id) .await .context(CatalogSnafu)?; + txn.commit().await.context(CatalogSnafu)?; let mut p = self.partition_data.write(); let data = Arc::new(PartitionData::new(partition.id)); p.insert(partition.partition_key, Arc::clone(&data)); diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index 4bf20f0736..18a100f132 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -234,34 +234,28 @@ mod tests { use 
iox_catalog::validate_or_insert_schema; use metric::{Attributes, Metric, U64Counter, U64Gauge}; use mutable_batch_lp::lines_to_batches; - use std::num::NonZeroU32; + use std::{num::NonZeroU32, ops::DerefMut}; use time::Time; use write_buffer::mock::{MockBufferForReading, MockBufferSharedState}; #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { let catalog = MemCatalog::new(); - let kafka_topic = catalog - .kafka_topics() - .create_or_get("whatevs") - .await - .unwrap(); - let query_pool = catalog - .query_pools() - .create_or_get("whatevs") - .await - .unwrap(); + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka_topic = txn.kafka_topics().create_or_get("whatevs").await.unwrap(); + let query_pool = txn.query_pools().create_or_get("whatevs").await.unwrap(); let kafka_partition = KafkaPartition::new(0); - let namespace = catalog + let namespace = txn .namespaces() .create("foo", "inf", kafka_topic.id, query_pool.id) .await .unwrap(); - let sequencer = catalog + let sequencer = txn .sequencers() .create_or_get(&kafka_topic, kafka_partition) .await .unwrap(); + let mut sequencer_states = BTreeMap::new(); sequencer_states.insert(kafka_partition, sequencer); @@ -276,7 +270,7 @@ mod tests { lines_to_batches("mem foo=1 10", 0).unwrap(), DmlMeta::sequenced(Sequence::new(0, 0), ingest_ts1, None, 50), ); - let schema = validate_or_insert_schema(w1.tables(), &schema, &catalog) + let schema = validate_or_insert_schema(w1.tables(), &schema, txn.deref_mut()) .await .unwrap() .unwrap(); @@ -286,10 +280,11 @@ mod tests { lines_to_batches("cpu bar=2 20\ncpu bar=3 30", 0).unwrap(), DmlMeta::sequenced(Sequence::new(0, 7), ingest_ts2, None, 150), ); - let _schema = validate_or_insert_schema(w2.tables(), &schema, &catalog) + let _schema = validate_or_insert_schema(w2.tables(), &schema, txn.deref_mut()) .await .unwrap() .unwrap(); + txn.commit().await.unwrap(); write_buffer_state.push_write(w2); let reading: Arc = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap()); diff --git a/iox_catalog/Cargo.toml b/iox_catalog/Cargo.toml index c420134848..ad28c7b4af 100644 --- a/iox_catalog/Cargo.toml +++ b/iox_catalog/Cargo.toml @@ -22,5 +22,6 @@ dotenv = "0.15.0" mutable_batch_lp = { path = "../mutable_batch_lp" } paste = "1.0.6" pretty_assertions = "1.0.0" +test_helpers = { path = "../test_helpers" } [features] diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 16bb28913b..bbe0f2f371 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -4,7 +4,6 @@ use async_trait::async_trait; use influxdb_line_protocol::FieldValue; use schema::{InfluxColumnType, InfluxFieldType}; use snafu::{OptionExt, Snafu}; -use sqlx::{Postgres, Transaction}; use std::convert::TryFrom; use std::fmt::Formatter; use std::{collections::BTreeMap, fmt::Debug}; @@ -309,59 +308,121 @@ pub trait Catalog: Send + Sync + Debug { /// Setup catalog for usage and apply possible migrations. async fn setup(&self) -> Result<(), Error>; + /// Create a new transaction. + /// + /// Creating transactions is potentially expensive. Holding one consumes resources. The number of parallel active + /// transactions might be limited per catalog, so you MUST NOT rely on the ability to create multiple transactions in + /// parallel for correctness but only for scaling. + async fn start_transaction(&self) -> Result, Error>; +} + +/// Secret module for [sealed traits]. 
+/// +/// [sealed traits]: https://rust-lang.github.io/api-guidelines/future-proofing.html#sealed-traits-protect-against-downstream-implementations-c-sealed +pub(crate) mod sealed { + use super::*; + + /// Helper trait to implement commit and abort of an transaction. + /// + /// The problem is that both methods cannot take `self` directly, otherwise the [`Transaction`] would not be object + /// safe. Therefore we can only take a reference. To avoid that a user uses a transaction after calling one of the + /// finalizers, we use a tiny trick and take `Box` in our public interface and use a sealed trait + /// for the actual implementation. + #[async_trait] + pub trait TransactionFinalize: Send + Sync + Debug { + async fn commit_inplace(&mut self) -> Result<(), Error>; + async fn abort_inplace(&mut self) -> Result<(), Error>; + } +} + +/// transaction of a [`Catalog`]. +/// +/// A transaction provides a consistent view on data and stages writes (this normally maps to a database transaction). +/// Repositories can cheaply be borrowed from it. To finalize a transaction, call [commit](Self::commit) or [abort](Self::abort). +/// +/// Note that after any method in this transaction (including all repositories derived from it) returned an error, the +/// transaction MIGHT be poisoned and will return errors for all operations, depending on the backend. +/// +/// +/// # Repositories +/// The methods (e.g. "get or create") for handling entities (e.g. namespaces, tombstones, ...) are grouped into +/// *repositories* with one *repository* per entity. A repository can be thought of a collection of a single entity. +/// Getting repositories from the transaction is cheap. +/// +/// Note that a repository might internally map to a wide range of different storage abstractions, ranging from one or +/// more SQL tables over key-value key spaces to simple in-memory vectors. The user should and must not care how these +/// are implemented. +/// +/// +/// # Drop +/// Dropping a transaction without calling [`commit`](Self::commit) or [`abort`](Self::abort) will abort the +/// transaction. However resources might not be released immediately, so it is adviced to always call +/// [`abort`](Self::abort) when you want to enforce that. Dropping w/o commiting/aborting will also log a warning. +#[async_trait] +pub trait Transaction: Send + Sync + Debug + sealed::TransactionFinalize { + /// Commit transaction. + /// + /// # Error Handling + /// If successfull, all changes will be visible to other transactions. + /// + /// If an error is returned, the transaction may or or not be committed. This might be due to IO errors after the + /// transaction was finished. However in either case, the transaction is atomic and can only succeed or fail + /// entirely. + async fn commit(mut self: Box) -> Result<(), Error> { + self.commit_inplace().await + } + + /// Abort transaction, throwing away all changes. 
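A stripped-down illustration of the sealed-trait and `Box<Self>` arrangement documented above (toy names, not part of this patch): the public finalizer consumes the box so a finished transaction cannot be reused, while the object-safe `&mut self` method lives in a module downstream crates cannot reach.

mod sealed_txn_example {
    // Crate-private module: external crates cannot name `sealed::Finalize`,
    // so they cannot implement or call the in-place finalizer directly.
    pub(crate) mod sealed {
        #[async_trait::async_trait]
        pub trait Finalize: Send {
            async fn commit_inplace(&mut self);
        }
    }

    #[async_trait::async_trait]
    pub trait Txn: sealed::Finalize {
        // Default method: taking `Box<Self>` moves the transaction,
        // so it cannot be used again after commit.
        async fn commit(mut self: Box<Self>) {
            self.commit_inplace().await;
        }
    }
}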
+ async fn abort(mut self: Box) -> Result<(), Error> { + self.abort_inplace().await + } + /// repo for kafka topics - fn kafka_topics(&self) -> &dyn KafkaTopicRepo; + fn kafka_topics(&mut self) -> &mut dyn KafkaTopicRepo; /// repo fo rquery pools - fn query_pools(&self) -> &dyn QueryPoolRepo; + fn query_pools(&mut self) -> &mut dyn QueryPoolRepo; /// repo for namespaces - fn namespaces(&self) -> &dyn NamespaceRepo; + fn namespaces(&mut self) -> &mut dyn NamespaceRepo; /// repo for tables - fn tables(&self) -> &dyn TableRepo; + fn tables(&mut self) -> &mut dyn TableRepo; /// repo for columns - fn columns(&self) -> &dyn ColumnRepo; + fn columns(&mut self) -> &mut dyn ColumnRepo; /// repo for sequencers - fn sequencers(&self) -> &dyn SequencerRepo; + fn sequencers(&mut self) -> &mut dyn SequencerRepo; /// repo for partitions - fn partitions(&self) -> &dyn PartitionRepo; + fn partitions(&mut self) -> &mut dyn PartitionRepo; /// repo for tombstones - fn tombstones(&self) -> &dyn TombstoneRepo; + fn tombstones(&mut self) -> &mut dyn TombstoneRepo; /// repo for parquet_files - fn parquet_files(&self) -> &dyn ParquetFileRepo; + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo; /// repo for processed_tombstones - fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo; - - /// Insert the conpacted parquet file and its tombstones into the catalog in one transaction - async fn add_parquet_file_with_tombstones( - &self, - parquet_file: &ParquetFile, - tombstones: &[Tombstone], - ) -> Result<(ParquetFile, Vec), Error>; + fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo; } /// Functions for working with Kafka topics in the catalog. #[async_trait] pub trait KafkaTopicRepo: Send + Sync { /// Creates the kafka topic in the catalog or gets the existing record by name. - async fn create_or_get(&self, name: &str) -> Result; + async fn create_or_get(&mut self, name: &str) -> Result; /// Gets the kafka topic by its unique name - async fn get_by_name(&self, name: &str) -> Result>; + async fn get_by_name(&mut self, name: &str) -> Result>; } /// Functions for working with query pools in the catalog. #[async_trait] pub trait QueryPoolRepo: Send + Sync { /// Creates the query pool in the catalog or gets the existing record by name. - async fn create_or_get(&self, name: &str) -> Result; + async fn create_or_get(&mut self, name: &str) -> Result; } /// Functions for working with namespaces in the catalog @@ -370,7 +431,7 @@ pub trait NamespaceRepo: Send + Sync { /// Creates the namespace in the catalog. If one by the same name already exists, an /// error is returned. async fn create( - &self, + &mut self, name: &str, retention_duration: &str, kafka_topic_id: KafkaTopicId, @@ -378,17 +439,17 @@ pub trait NamespaceRepo: Send + Sync { ) -> Result; /// Gets the namespace by its unique name. - async fn get_by_name(&self, name: &str) -> Result>; + async fn get_by_name(&mut self, name: &str) -> Result>; } /// Functions for working with tables in the catalog #[async_trait] pub trait TableRepo: Send + Sync { /// Creates the table in the catalog or get the existing record by name. - async fn create_or_get(&self, name: &str, namespace_id: NamespaceId) -> Result; + async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result
; /// Lists all tables in the catalog for the given namespace id. - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result>; + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; } /// Functions for working with columns in the catalog @@ -398,14 +459,14 @@ pub trait ColumnRepo: Send + Sync { /// `Error::ColumnTypeMismatch` if the existing column type doesn't match the type /// the caller is attempting to create. async fn create_or_get( - &self, + &mut self, name: &str, table_id: TableId, column_type: ColumnType, ) -> Result; /// Lists all columns in the passed in namespace id. - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result>; + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; } /// Functions for working with sequencers in the catalog @@ -413,23 +474,23 @@ pub trait ColumnRepo: Send + Sync { pub trait SequencerRepo: Send + Sync { /// create a sequencer record for the kafka topic and partition or return the existing record async fn create_or_get( - &self, + &mut self, topic: &KafkaTopic, partition: KafkaPartition, ) -> Result; /// get the sequencer record by `KafkaTopicId` and `KafkaPartition` async fn get_by_topic_id_and_partition( - &self, + &mut self, topic_id: KafkaTopicId, partition: KafkaPartition, ) -> Result>; /// list all sequencers - async fn list(&self) -> Result>; + async fn list(&mut self) -> Result>; /// list all sequencers for a given kafka topic - async fn list_by_kafka_topic(&self, topic: &KafkaTopic) -> Result>; + async fn list_by_kafka_topic(&mut self, topic: &KafkaTopic) -> Result>; } /// Functions for working with IOx partitions in the catalog. Note that these are how @@ -438,18 +499,18 @@ pub trait SequencerRepo: Send + Sync { pub trait PartitionRepo: Send + Sync { /// create or get a partition record for the given partition key, sequencer and table async fn create_or_get( - &self, + &mut self, key: &str, sequencer_id: SequencerId, table_id: TableId, ) -> Result; /// return partitions for a given sequencer - async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result>; + async fn list_by_sequencer(&mut self, sequencer_id: SequencerId) -> Result>; /// return the partition record, the namespace name it belongs to, and the table name it is under async fn partition_info_by_id( - &self, + &mut self, partition_id: PartitionId, ) -> Result>; } @@ -468,7 +529,7 @@ pub struct PartitionInfo { pub trait TombstoneRepo: Send + Sync { /// create or get a tombstone async fn create_or_get( - &self, + &mut self, table_id: TableId, sequencer_id: SequencerId, sequence_number: SequenceNumber, @@ -481,7 +542,7 @@ pub trait TombstoneRepo: Send + Sync { /// passed in. This will be used by the ingester on startup to see what tombstones /// might have to be applied to data that is read from the write buffer. 
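One practical consequence of moving the repositories and their accessors to `&mut self`, sketched below: within a single transaction the borrow checker serializes repository use, one handle at a time (the names passed in are illustrative).

use iox_catalog::interface::{Error, Transaction};

async fn one_repo_at_a_time(txn: &mut dyn Transaction) -> Result<(), Error> {
    // Each accessor mutably borrows the transaction; once the awaited call
    // returns an owned value the borrow ends and the next accessor is free.
    let _topic = txn.kafka_topics().create_or_get("example_topic").await?;
    let _pool = txn.query_pools().create_or_get("example_pool").await?;
    Ok(())
}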
async fn list_tombstones_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result>; @@ -493,9 +554,7 @@ pub trait ParquetFileRepo: Send + Sync { /// create the parquet file #[allow(clippy::too_many_arguments)] async fn create( - &self, - // this transaction is only provided when this record is inserted in a transaction - txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -507,23 +566,23 @@ pub trait ParquetFileRepo: Send + Sync { ) -> Result; /// Flag the parquet file for deletion - async fn flag_for_delete(&self, id: ParquetFileId) -> Result<()>; + async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()>; /// Get all parquet files for a sequencer with a max_sequence_number greater than the /// one passed in. The ingester will use this on startup to see which files were persisted /// that are greater than its min_unpersisted_number so that it can discard any data in /// these partitions on replay. async fn list_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result>; /// Verify if the parquet file exists by selecting its id - async fn exist(&self, id: ParquetFileId) -> Result; + async fn exist(&mut self, id: ParquetFileId) -> Result; /// Return count - async fn count(&self) -> Result; + async fn count(&mut self) -> Result; } /// Functions for working with processed tombstone pointers in the catalog @@ -531,21 +590,20 @@ pub trait ParquetFileRepo: Send + Sync { pub trait ProcessedTombstoneRepo: Send + Sync { /// create processed tombstones async fn create_many( - &self, - txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, parquet_file_id: ParquetFileId, tombstones: &[Tombstone], ) -> Result>; /// Verify if a processed tombstone exists in the catalog async fn exist( - &self, + &mut self, parquet_file_id: ParquetFileId, tombstone_id: TombstoneId, ) -> Result; /// Return count - async fn count(&self) -> Result; + async fn count(&mut self) -> Result; } /// Data object for a kafka topic @@ -609,16 +667,16 @@ impl NamespaceSchema { } /// Gets the namespace schema including all tables and columns. -pub async fn get_schema_by_name(name: &str, catalog: &dyn Catalog) -> Result { - let namespace = catalog +pub async fn get_schema_by_name(name: &str, txn: &mut dyn Transaction) -> Result { + let namespace = txn .namespaces() .get_by_name(name) .await? .context(NamespaceNotFoundSnafu { name })?; // get the columns first just in case someone else is creating schema while we're doing this. 
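Under the new signature the caller owns the transaction and lends it to the schema lookup, so the read shares the caller's isolation. A minimal sketch, with an illustrative namespace name:

use std::ops::DerefMut;
use iox_catalog::interface::{get_schema_by_name, Catalog, Error, NamespaceSchema};

async fn load_schema(catalog: &dyn Catalog) -> Result<NamespaceSchema, Error> {
    let mut txn = catalog.start_transaction().await?;
    // `txn` is a Box<dyn Transaction>; deref_mut() lends it as &mut dyn Transaction.
    let schema = get_schema_by_name("example_namespace", txn.deref_mut()).await?;
    txn.commit().await?;
    Ok(schema)
}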
- let columns = catalog.columns().list_by_namespace_id(namespace.id).await?; - let tables = catalog.tables().list_by_namespace_id(namespace.id).await?; + let columns = txn.columns().list_by_namespace_id(namespace.id).await?; + let tables = txn.tables().list_by_namespace_id(namespace.id).await?; let mut namespace = NamespaceSchema::new( namespace.id, @@ -956,9 +1014,12 @@ pub struct ProcessedTombstone { #[cfg(test)] pub(crate) mod test_helpers { + use ::test_helpers::{assert_contains, tracing::TracingCapture}; + + use crate::add_parquet_file_with_tombstones; + use super::*; - use futures::{stream::FuturesOrdered, StreamExt}; - use std::sync::Arc; + use std::{ops::DerefMut, sync::Arc, time::Duration}; pub(crate) async fn test_catalog(catalog: Arc) { test_setup(Arc::clone(&catalog)).await; @@ -972,6 +1033,8 @@ pub(crate) mod test_helpers { test_tombstone(Arc::clone(&catalog)).await; test_parquet_file(Arc::clone(&catalog)).await; test_add_parquet_file_with_tombstones(Arc::clone(&catalog)).await; + test_txn_isolation(Arc::clone(&catalog)).await; + test_txn_drop(Arc::clone(&catalog)).await; } async fn test_setup(catalog: Arc) { @@ -980,7 +1043,9 @@ pub(crate) mod test_helpers { } async fn test_kafka_topic(catalog: Arc) { - let kafka_repo = catalog.kafka_topics(); + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka_repo = txn.kafka_topics(); + let k = kafka_repo.create_or_get("foo").await.unwrap(); assert!(k.id > KafkaTopicId::new(0)); assert_eq!(k.name, "foo"); @@ -990,62 +1055,77 @@ pub(crate) mod test_helpers { assert_eq!(k3, k); let k3 = kafka_repo.get_by_name("asdf").await.unwrap(); assert!(k3.is_none()); + + txn.commit().await.unwrap(); } async fn test_query_pool(catalog: Arc) { - let query_repo = catalog.query_pools(); + let mut txn = catalog.start_transaction().await.unwrap(); + let query_repo = txn.query_pools(); + let q = query_repo.create_or_get("foo").await.unwrap(); assert!(q.id > QueryPoolId::new(0)); assert_eq!(q.name, "foo"); let q2 = query_repo.create_or_get("foo").await.unwrap(); assert_eq!(q, q2); + + txn.commit().await.unwrap(); } async fn test_namespace(catalog: Arc) { - let namespace_repo = catalog.namespaces(); - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); let namespace_name = "test_namespace"; - let namespace = namespace_repo + let namespace = txn + .namespaces() .create(namespace_name, "inf", kafka.id, pool.id) .await .unwrap(); assert!(namespace.id > NamespaceId::new(0)); assert_eq!(namespace.name, namespace_name); + txn.commit().await.unwrap(); - let conflict = namespace_repo + let mut txn = catalog.start_transaction().await.unwrap(); + let conflict = txn + .namespaces() .create(namespace_name, "inf", kafka.id, pool.id) .await; assert!(matches!( conflict.unwrap_err(), Error::NameExists { name: _ } )); + txn.abort().await.unwrap(); - let found = namespace_repo + let mut txn = catalog.start_transaction().await.unwrap(); + let found = txn + .namespaces() .get_by_name(namespace_name) .await .unwrap() .expect("namespace should be there"); assert_eq!(namespace, found); + txn.commit().await.unwrap(); } async fn test_table(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = 
catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create("namespace_table_test", "inf", kafka.id, pool.id) .await .unwrap(); // test we can create or get a table - let t = catalog + let t = txn .tables() .create_or_get("test_table", namespace.id) .await .unwrap(); - let tt = catalog + let tt = txn .tables() .create_or_get("test_table", namespace.id) .await @@ -1053,7 +1133,7 @@ pub(crate) mod test_helpers { assert!(t.id > TableId::new(0)); assert_eq!(t, tt); - let tables = catalog + let tables = txn .tables() .list_by_namespace_id(namespace.id) .await @@ -1061,30 +1141,33 @@ pub(crate) mod test_helpers { assert_eq!(vec![t], tables); // test we can create a table of the same name in a different namespace - let namespace2 = catalog + let namespace2 = txn .namespaces() .create("two", "inf", kafka.id, pool.id) .await .unwrap(); assert_ne!(namespace, namespace2); - let test_table = catalog + let test_table = txn .tables() .create_or_get("test_table", namespace2.id) .await .unwrap(); assert_ne!(tt, test_table); - assert_eq!(test_table.namespace_id, namespace2.id) + assert_eq!(test_table.namespace_id, namespace2.id); + + txn.commit().await.unwrap(); } async fn test_column(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create("namespace_column_test", "inf", kafka.id, pool.id) .await .unwrap(); - let table = catalog + let table = txn .tables() .create_or_get("test_table", namespace.id) .await @@ -1092,12 +1175,12 @@ pub(crate) mod test_helpers { assert_eq!(table.namespace_id, namespace.id); // test we can create or get a column - let c = catalog + let c = txn .columns() .create_or_get("column_test", table.id, ColumnType::Tag) .await .unwrap(); - let cc = catalog + let cc = txn .columns() .create_or_get("column_test", table.id, ColumnType::Tag) .await @@ -1106,7 +1189,7 @@ pub(crate) mod test_helpers { assert_eq!(c, cc); // test that attempting to create an already defined column of a different type returns error - let err = catalog + let err = txn .columns() .create_or_get("column_test", table.id, ColumnType::U64) .await @@ -1121,50 +1204,49 @@ pub(crate) mod test_helpers { )); // test that we can create a column of the same name under a different table - let table2 = catalog + let table2 = txn .tables() .create_or_get("test_table_2", namespace.id) .await .unwrap(); - let ccc = catalog + let ccc = txn .columns() .create_or_get("column_test", table2.id, ColumnType::U64) .await .unwrap(); assert_ne!(c, ccc); - let columns = catalog + let columns = txn .columns() .list_by_namespace_id(namespace.id) .await .unwrap(); assert_eq!(vec![c, ccc], columns); + + txn.commit().await.unwrap(); } async fn test_sequencer(catalog: Arc) { - let kafka = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn .kafka_topics() .create_or_get("sequencer_test") .await .unwrap(); // Create 10 sequencers - let created = (1..=10) - .map(|partition| { - catalog - 
.sequencers() - .create_or_get(&kafka, KafkaPartition::new(partition)) - }) - .collect::>() - .map(|v| { - let v = v.expect("failed to create sequencer"); - (v.id, v) - }) - .collect::>() - .await; + let mut created = BTreeMap::new(); + for partition in 1..=10 { + let sequencer = txn + .sequencers() + .create_or_get(&kafka, KafkaPartition::new(partition)) + .await + .expect("failed to create sequencer"); + created.insert(sequencer.id, sequencer); + } // List them and assert they match - let listed = catalog + let listed = txn .sequencers() .list_by_kafka_topic(&kafka) .await @@ -1177,7 +1259,7 @@ pub(crate) mod test_helpers { // get by the sequencer id and partition let kafka_partition = KafkaPartition::new(1); - let sequencer = catalog + let sequencer = txn .sequencers() .get_by_topic_id_and_partition(kafka.id, kafka_partition) .await @@ -1186,60 +1268,58 @@ pub(crate) mod test_helpers { assert_eq!(kafka.id, sequencer.kafka_topic_id); assert_eq!(kafka_partition, sequencer.kafka_partition); - let sequencer = catalog + let sequencer = txn .sequencers() .get_by_topic_id_and_partition(kafka.id, KafkaPartition::new(523)) .await .unwrap(); assert!(sequencer.is_none()); + + txn.commit().await.unwrap(); } async fn test_partition(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create("namespace_partition_test", "inf", kafka.id, pool.id) .await .unwrap(); - let table = catalog + let table = txn .tables() .create_or_get("test_table", namespace.id) .await .unwrap(); - let sequencer = catalog + let sequencer = txn .sequencers() .create_or_get(&kafka, KafkaPartition::new(1)) .await .unwrap(); - let other_sequencer = catalog + let other_sequencer = txn .sequencers() .create_or_get(&kafka, KafkaPartition::new(2)) .await .unwrap(); - let created = ["foo", "bar"] - .iter() - .map(|key| { - catalog - .partitions() - .create_or_get(key, sequencer.id, table.id) - }) - .collect::>() - .map(|v| { - let v = v.expect("failed to create partition"); - (v.id, v) - }) - .collect::>() - .await; - let other_partition = catalog + let mut created = BTreeMap::new(); + for key in ["foo", "bar"] { + let partition = txn + .partitions() + .create_or_get(key, sequencer.id, table.id) + .await + .expect("failed to create partition"); + created.insert(partition.id, partition); + } + let other_partition = txn .partitions() .create_or_get("asdf", other_sequencer.id, table.id) .await .unwrap(); // List them and assert they match - let listed = catalog + let listed = txn .partitions() .list_by_sequencer(sequencer.id) .await @@ -1251,7 +1331,7 @@ pub(crate) mod test_helpers { assert_eq!(created, listed); // test get_partition_info_by_id - let info = catalog + let info = txn .partitions() .partition_info_by_id(other_partition.id) .await @@ -1260,27 +1340,30 @@ pub(crate) mod test_helpers { assert_eq!(info.partition, other_partition); assert_eq!(info.table_name, "test_table"); assert_eq!(info.namespace_name, "namespace_partition_test"); + + txn.commit().await.unwrap(); } async fn test_tombstone(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace 
= catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create("namespace_tombstone_test", "inf", kafka.id, pool.id) .await .unwrap(); - let table = catalog + let table = txn .tables() .create_or_get("test_table", namespace.id) .await .unwrap(); - let other_table = catalog + let other_table = txn .tables() .create_or_get("other", namespace.id) .await .unwrap(); - let sequencer = catalog + let sequencer = txn .sequencers() .create_or_get(&kafka, KafkaPartition::new(1)) .await @@ -1288,7 +1371,7 @@ pub(crate) mod test_helpers { let min_time = Timestamp::new(1); let max_time = Timestamp::new(10); - let t1 = catalog + let t1 = txn .tombstones() .create_or_get( table.id, @@ -1306,7 +1389,7 @@ pub(crate) mod test_helpers { assert_eq!(t1.min_time, min_time); assert_eq!(t1.max_time, max_time); assert_eq!(t1.serialized_predicate, "whatevs"); - let t2 = catalog + let t2 = txn .tombstones() .create_or_get( other_table.id, @@ -1318,7 +1401,7 @@ pub(crate) mod test_helpers { ) .await .unwrap(); - let t3 = catalog + let t3 = txn .tombstones() .create_or_get( table.id, @@ -1331,43 +1414,46 @@ pub(crate) mod test_helpers { .await .unwrap(); - let listed = catalog + let listed = txn .tombstones() .list_tombstones_by_sequencer_greater_than(sequencer.id, SequenceNumber::new(1)) .await .unwrap(); assert_eq!(vec![t2, t3], listed); + + txn.commit().await.unwrap(); } async fn test_parquet_file(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create("namespace_parquet_file_test", "inf", kafka.id, pool.id) .await .unwrap(); - let table = catalog + let table = txn .tables() .create_or_get("test_table", namespace.id) .await .unwrap(); - let other_table = catalog + let other_table = txn .tables() .create_or_get("other", namespace.id) .await .unwrap(); - let sequencer = catalog + let sequencer = txn .sequencers() .create_or_get(&kafka, KafkaPartition::new(1)) .await .unwrap(); - let partition = catalog + let partition = txn .partitions() .create_or_get("one", sequencer.id, table.id) .await .unwrap(); - let other_partition = catalog + let other_partition = txn .partitions() .create_or_get("one", sequencer.id, other_table.id) .await @@ -1376,15 +1462,13 @@ pub(crate) mod test_helpers { let min_time = Timestamp::new(1); let max_time = Timestamp::new(10); - let parquet_repo = catalog.parquet_files(); - // Must have no rows - let row_count = parquet_repo.count().await.unwrap(); + let row_count = txn.parquet_files().count().await.unwrap(); assert_eq!(row_count, 0); - let parquet_file = parquet_repo + let parquet_file = txn + .parquet_files() .create( - None, sequencer.id, partition.table_id, partition.id, @@ -1396,11 +1480,13 @@ pub(crate) mod test_helpers { ) .await .unwrap(); + txn.commit().await.unwrap(); // verify that trying to create a file with the same UUID throws an error - let err = parquet_repo + let mut txn = catalog.start_transaction().await.unwrap(); + let err = txn + .parquet_files() .create( - None, sequencer.id, partition.table_id, partition.id, @@ 
-1413,10 +1499,12 @@ pub(crate) mod test_helpers { .await .unwrap_err(); assert!(matches!(err, Error::FileExists { object_store_id: _ })); + txn.abort().await.unwrap(); - let other_file = parquet_repo + let mut txn = catalog.start_transaction().await.unwrap(); + let other_file = txn + .parquet_files() .create( - None, sequencer.id, other_partition.table_id, other_partition.id, @@ -1430,22 +1518,24 @@ pub(crate) mod test_helpers { .unwrap(); // Must have 2 rows - let row_count = parquet_repo.count().await.unwrap(); + let row_count = txn.parquet_files().count().await.unwrap(); assert_eq!(row_count, 2); let exist_id = parquet_file.id; let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); // make sure exists_id != non_exist_id assert_ne!(exist_id, non_exist_id); - assert!(parquet_repo.exist(exist_id).await.unwrap()); - assert!(!parquet_repo.exist(non_exist_id).await.unwrap()); + assert!(txn.parquet_files().exist(exist_id).await.unwrap()); + assert!(!txn.parquet_files().exist(non_exist_id).await.unwrap()); - let files = parquet_repo + let files = txn + .parquet_files() .list_by_sequencer_greater_than(sequencer.id, SequenceNumber::new(1)) .await .unwrap(); assert_eq!(vec![parquet_file, other_file], files); - let files = parquet_repo + let files = txn + .parquet_files() .list_by_sequencer_greater_than(sequencer.id, SequenceNumber::new(150)) .await .unwrap(); @@ -1453,18 +1543,25 @@ pub(crate) mod test_helpers { // verify that to_delete is initially set to false and that it can be updated to true assert!(!parquet_file.to_delete); - parquet_repo.flag_for_delete(parquet_file.id).await.unwrap(); - let files = parquet_repo + txn.parquet_files() + .flag_for_delete(parquet_file.id) + .await + .unwrap(); + let files = txn + .parquet_files() .list_by_sequencer_greater_than(sequencer.id, SequenceNumber::new(1)) .await .unwrap(); assert!(files.first().unwrap().to_delete); + + txn.commit().await.unwrap(); } async fn test_add_parquet_file_with_tombstones(catalog: Arc) { - let kafka = catalog.kafka_topics().create_or_get("foo").await.unwrap(); - let pool = catalog.query_pools().create_or_get("foo").await.unwrap(); - let namespace = catalog + let mut txn = catalog.start_transaction().await.unwrap(); + let kafka = txn.kafka_topics().create_or_get("foo").await.unwrap(); + let pool = txn.query_pools().create_or_get("foo").await.unwrap(); + let namespace = txn .namespaces() .create( "namespace_parquet_file_with_tombstones_test", @@ -1474,17 +1571,17 @@ pub(crate) mod test_helpers { ) .await .unwrap(); - let table = catalog + let table = txn .tables() .create_or_get("test_table", namespace.id) .await .unwrap(); - let sequencer = catalog + let sequencer = txn .sequencers() .create_or_get(&kafka, KafkaPartition::new(1)) .await .unwrap(); - let partition = catalog + let partition = txn .partitions() .create_or_get("one", sequencer.id, table.id) .await @@ -1493,7 +1590,7 @@ pub(crate) mod test_helpers { // Add tombstones let min_time = Timestamp::new(1); let max_time = Timestamp::new(10); - let t1 = catalog + let t1 = txn .tombstones() .create_or_get( table.id, @@ -1505,7 +1602,7 @@ pub(crate) mod test_helpers { ) .await .unwrap(); - let t2 = catalog + let t2 = txn .tombstones() .create_or_get( table.id, @@ -1517,7 +1614,7 @@ pub(crate) mod test_helpers { ) .await .unwrap(); - let t3 = catalog + let t3 = txn .tombstones() .create_or_get( table.id, @@ -1568,90 +1665,162 @@ pub(crate) mod test_helpers { to_delete: false, }; - let parquet_file_count_before = catalog.parquet_files().count().await.unwrap(); - let 
pt_count_before = catalog.processed_tombstones().count().await.unwrap(); + let parquet_file_count_before = txn.parquet_files().count().await.unwrap(); + let pt_count_before = txn.processed_tombstones().count().await.unwrap(); + txn.commit().await.unwrap(); // Add parquet and processed tombstone in one transaction - let (parquet_file, p_tombstones) = catalog - .add_parquet_file_with_tombstones(&parquet, &[t1.clone(), t2.clone()]) - .await - .unwrap(); + let mut txn = catalog.start_transaction().await.unwrap(); + let (parquet_file, p_tombstones) = + add_parquet_file_with_tombstones(&parquet, &[t1.clone(), t2.clone()], txn.deref_mut()) + .await + .unwrap(); + txn.commit().await.unwrap(); assert_eq!(p_tombstones.len(), 2); assert_eq!(t1.id, p_tombstones[0].tombstone_id); assert_eq!(t2.id, p_tombstones[1].tombstone_id); // verify the catalog - let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); - let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); + let mut txn = catalog.start_transaction().await.unwrap(); + let parquet_file_count_after = txn.parquet_files().count().await.unwrap(); + let pt_count_after = txn.processed_tombstones().count().await.unwrap(); assert_eq!(pt_count_after - pt_count_before, 2); assert_eq!(parquet_file_count_after - parquet_file_count_before, 1); let pt_count_before = pt_count_after; let parquet_file_count_before = parquet_file_count_after; - assert!(catalog - .parquet_files() - .exist(parquet_file.id) - .await - .unwrap()); - assert!(catalog + assert!(txn.parquet_files().exist(parquet_file.id).await.unwrap()); + assert!(txn .processed_tombstones() .exist(parquet_file.id, t1.id) .await .unwrap()); - assert!(catalog + assert!(txn .processed_tombstones() .exist(parquet_file.id, t1.id) .await .unwrap()); + txn.commit().await.unwrap(); // Error due to duplicate parquet file - catalog - .add_parquet_file_with_tombstones(&parquet, &[t3.clone(), t1.clone()]) + let mut txn = catalog.start_transaction().await.unwrap(); + add_parquet_file_with_tombstones(&parquet, &[t3.clone(), t1.clone()], txn.deref_mut()) .await .unwrap_err(); + txn.abort().await.unwrap(); + // Since the transaction is rollback, t3 is not yet added - assert!(!catalog + let mut txn = catalog.start_transaction().await.unwrap(); + assert!(!txn .processed_tombstones() .exist(parquet_file.id, t3.id) .await .unwrap()); // Add new parquet and new tombstone. 
Should go trhough - let (parquet_file, p_tombstones) = catalog - .add_parquet_file_with_tombstones(&other_parquet, &[t3.clone()]) - .await - .unwrap(); + let (parquet_file, p_tombstones) = + add_parquet_file_with_tombstones(&other_parquet, &[t3.clone()], txn.deref_mut()) + .await + .unwrap(); assert_eq!(p_tombstones.len(), 1); assert_eq!(t3.id, p_tombstones[0].tombstone_id); - assert!(catalog + assert!(txn .processed_tombstones() .exist(parquet_file.id, t3.id) .await .unwrap()); - assert!(catalog - .parquet_files() - .exist(parquet_file.id) - .await - .unwrap()); + assert!(txn.parquet_files().exist(parquet_file.id).await.unwrap()); - let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); - let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); + let pt_count_after = txn.processed_tombstones().count().await.unwrap(); + let parquet_file_count_after = txn.parquet_files().count().await.unwrap(); assert_eq!(pt_count_after - pt_count_before, 1); assert_eq!(parquet_file_count_after - parquet_file_count_before, 1); let pt_count_before = pt_count_after; let parquet_file_count_before = parquet_file_count_after; + txn.commit().await.unwrap(); // Add non-exist tombstone t4 and should fail + let mut txn = catalog.start_transaction().await.unwrap(); let mut t4 = t3.clone(); t4.id = TombstoneId::new(t4.id.get() + 10); - catalog - .add_parquet_file_with_tombstones(&another_parquet, &[t4]) + add_parquet_file_with_tombstones(&another_parquet, &[t4], txn.deref_mut()) .await .unwrap_err(); + txn.abort().await.unwrap(); + // Still same count as before - let pt_count_after = catalog.processed_tombstones().count().await.unwrap(); - let parquet_file_count_after = catalog.parquet_files().count().await.unwrap(); + let mut txn = catalog.start_transaction().await.unwrap(); + let pt_count_after = txn.processed_tombstones().count().await.unwrap(); + let parquet_file_count_after = txn.parquet_files().count().await.unwrap(); assert_eq!(pt_count_after - pt_count_before, 0); assert_eq!(parquet_file_count_after - parquet_file_count_before, 0); + txn.commit().await.unwrap(); + } + + async fn test_txn_isolation(catalog: Arc) { + let barrier = Arc::new(tokio::sync::Barrier::new(2)); + + let barrier_captured = Arc::clone(&barrier); + let catalog_captured = Arc::clone(&catalog); + let insertion_task = tokio::spawn(async move { + barrier_captured.wait().await; + + let mut txn = catalog_captured.start_transaction().await.unwrap(); + txn.kafka_topics() + .create_or_get("test_txn_isolation") + .await + .unwrap(); + + tokio::time::sleep(Duration::from_millis(200)).await; + txn.abort().await.unwrap(); + }); + + let mut txn = catalog.start_transaction().await.unwrap(); + + barrier.wait().await; + tokio::time::sleep(Duration::from_millis(100)).await; + + let topic = txn + .kafka_topics() + .get_by_name("test_txn_isolation") + .await + .unwrap(); + assert!(topic.is_none()); + txn.abort().await.unwrap(); + + insertion_task.await.unwrap(); + + let mut txn = catalog.start_transaction().await.unwrap(); + let topic = txn + .kafka_topics() + .get_by_name("test_txn_isolation") + .await + .unwrap(); + assert!(topic.is_none()); + txn.abort().await.unwrap(); + } + + async fn test_txn_drop(catalog: Arc) { + let capture = TracingCapture::new(); + let mut txn = catalog.start_transaction().await.unwrap(); + txn.kafka_topics() + .create_or_get("test_txn_drop") + .await + .unwrap(); + drop(txn); + + // got a warning + assert_contains!(capture.to_string(), "Dropping "); + assert_contains!(capture.to_string(), 
" w/o finalizing (commit or abort)"); + + // data is NOT committed + let mut txn = catalog.start_transaction().await.unwrap(); + let topic = txn + .kafka_topics() + .get_by_name("test_txn_drop") + .await + .unwrap(); + assert!(topic.is_none()); + txn.abort().await.unwrap(); } } diff --git a/iox_catalog/src/lib.rs b/iox_catalog/src/lib.rs index 5543c4979e..a14108ebfc 100644 --- a/iox_catalog/src/lib.rs +++ b/iox_catalog/src/lib.rs @@ -12,11 +12,11 @@ )] use crate::interface::{ - Catalog, ColumnType, Error, KafkaPartition, KafkaTopic, NamespaceSchema, QueryPool, Result, - Sequencer, SequencerId, TableSchema, + ColumnType, Error, KafkaPartition, KafkaTopic, NamespaceSchema, QueryPool, Result, Sequencer, + SequencerId, TableSchema, Transaction, }; -use futures::{stream::FuturesOrdered, StreamExt}; +use interface::{ParquetFile, ProcessedTombstone, Tombstone}; use mutable_batch::MutableBatch; use std::{borrow::Cow, collections::BTreeMap}; @@ -43,7 +43,7 @@ pub mod postgres; pub async fn validate_or_insert_schema<'a, T, U>( tables: T, schema: &NamespaceSchema, - catalog: &dyn Catalog, + txn: &mut dyn Transaction, ) -> Result> where T: IntoIterator + Send + Sync, @@ -55,7 +55,7 @@ where let mut schema = Cow::Borrowed(schema); for (table_name, batch) in tables { - validate_mutable_batch(batch, table_name, &mut schema, catalog).await?; + validate_mutable_batch(batch, table_name, &mut schema, txn).await?; } match schema { @@ -68,7 +68,7 @@ async fn validate_mutable_batch( mb: &MutableBatch, table_name: &str, schema: &mut Cow<'_, NamespaceSchema>, - catalog: &dyn Catalog, + txn: &mut dyn Transaction, ) -> Result<()> { // Check if the table exists in the schema. // @@ -81,14 +81,14 @@ async fn validate_mutable_batch( // // Attempt to create the table in the catalog, or load an existing // table from the catalog to populate the cache. - let mut table = catalog + let mut table = txn .tables() .create_or_get(table_name, schema.id) .await .map(|t| TableSchema::new(t.id))?; // Always add a time column to all new tables. - let time_col = catalog + let time_col = txn .columns() .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) .await?; @@ -134,7 +134,7 @@ async fn validate_mutable_batch( None => { // The column does not exist in the cache, create/get it from // the catalog, and add it to the table. - let column = catalog + let column = txn .columns() .create_or_get(name.as_str(), table.id, ColumnType::from(col.influx_type())) .await?; @@ -161,34 +161,53 @@ async fn validate_mutable_batch( /// each of the partitions. 
pub async fn create_or_get_default_records( kafka_partition_count: i32, - catalog: &dyn Catalog, + txn: &mut dyn Transaction, ) -> Result<(KafkaTopic, QueryPool, BTreeMap)> { - let kafka_topic = catalog - .kafka_topics() - .create_or_get(SHARED_KAFKA_TOPIC) - .await?; - let query_pool = catalog - .query_pools() - .create_or_get(SHARED_QUERY_POOL) - .await?; + let kafka_topic = txn.kafka_topics().create_or_get(SHARED_KAFKA_TOPIC).await?; + let query_pool = txn.query_pools().create_or_get(SHARED_QUERY_POOL).await?; - let sequencers = (1..=kafka_partition_count) - .map(|partition| { - catalog - .sequencers() - .create_or_get(&kafka_topic, KafkaPartition::new(partition)) - }) - .collect::>() - .map(|v| { - let v = v.expect("failed to create sequencer"); - (v.id, v) - }) - .collect::>() - .await; + let mut sequencers = BTreeMap::new(); + for partition in 1..=kafka_partition_count { + let sequencer = txn + .sequencers() + .create_or_get(&kafka_topic, KafkaPartition::new(partition)) + .await?; + sequencers.insert(sequencer.id, sequencer); + } Ok((kafka_topic, query_pool, sequencers)) } +/// Insert the conpacted parquet file and its tombstones +pub async fn add_parquet_file_with_tombstones( + parquet_file: &ParquetFile, + tombstones: &[Tombstone], + txn: &mut dyn Transaction, +) -> Result<(ParquetFile, Vec), Error> { + // create a parquet file in the catalog first + let parquet = txn + .parquet_files() + .create( + parquet_file.sequencer_id, + parquet_file.table_id, + parquet_file.partition_id, + parquet_file.object_store_id, + parquet_file.min_sequence_number, + parquet_file.max_sequence_number, + parquet_file.min_time, + parquet_file.max_time, + ) + .await?; + + // Now the parquet available, create its processed tombstones + let processed_tombstones = txn + .processed_tombstones() + .create_many(parquet.id, tombstones) + .await?; + + Ok((parquet, processed_tombstones)) +} + #[cfg(test)] mod tests { use super::*; @@ -211,13 +230,16 @@ mod tests { #[allow(clippy::bool_assert_comparison)] #[tokio::test] async fn []() { + use crate::interface::Catalog; + use std::ops::DerefMut; use pretty_assertions::assert_eq; const NAMESPACE_NAME: &str = "bananas"; let repo = MemCatalog::new(); - let (kafka_topic, query_pool, _) = create_or_get_default_records(2, &repo).await.unwrap(); + let mut txn = repo.start_transaction().await.unwrap(); + let (kafka_topic, query_pool, _) = create_or_get_default_records(2, txn.deref_mut()).await.unwrap(); - let namespace = repo + let namespace = txn .namespaces() .create(NAMESPACE_NAME, "inf", kafka_topic.id, query_pool.id) .await @@ -240,7 +262,7 @@ mod tests { let (writes, _) = mutable_batch_lp::lines_to_batches_stats(lp.as_str(), 42) .expect("failed to build test writes from LP"); - let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, &repo) + let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) .await; match got { @@ -260,7 +282,7 @@ mod tests { // Invariant: in absence of concurrency, the schema within // the database must always match the incrementally built // cached schema. 
- let db_schema = get_schema_by_name(NAMESPACE_NAME, &repo) + let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut()) .await .expect("database failed to query for namespace schema"); assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs index 58852cdc19..c844976848 100644 --- a/iox_catalog/src/mem.rs +++ b/iox_catalog/src/mem.rs @@ -2,25 +2,26 @@ //! used for testing or for an IOx designed to run without catalog persistence. use crate::interface::{ - Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, - KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, - ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo, - ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, - SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, - Tombstone, TombstoneId, TombstoneRepo, + sealed::TransactionFinalize, Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error, + KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, + NamespaceRepo, ParquetFile, ParquetFileId, ParquetFileRepo, Partition, PartitionId, + PartitionInfo, PartitionRepo, ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, + QueryPoolId, QueryPoolRepo, Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, + Table, TableId, TableRepo, Timestamp, Tombstone, TombstoneId, TombstoneRepo, Transaction, }; use async_trait::async_trait; -use sqlx::{Postgres, Transaction}; +use observability_deps::tracing::warn; use std::convert::TryFrom; use std::fmt::Formatter; -use std::sync::Mutex; +use std::sync::Arc; +use tokio::sync::{Mutex, OwnedMutexGuard}; use uuid::Uuid; /// In-memory catalog that implements the `RepoCollection` and individual repo traits from /// the catalog interface. #[derive(Default)] pub struct MemCatalog { - collections: Mutex, + collections: Arc>, } impl MemCatalog { @@ -28,25 +29,15 @@ impl MemCatalog { pub fn new() -> Self { Self::default() } - - // Since this is test catalog that do not handle transaction - // this is a help function to fake `rollback` work - fn remove_parquet_file(&self, object_store_id: Uuid) { - let mut collections = self.collections.lock().expect("mutex poisoned"); - collections - .parquet_files - .retain(|f| f.object_store_id != object_store_id); - } } impl std::fmt::Debug for MemCatalog { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let c = self.collections.lock().expect("mutex poisoned"); - write!(f, "MemCatalog[ {:?} ]", c) + f.debug_struct("MemCatalog").finish_non_exhaustive() } } -#[derive(Default, Debug)] +#[derive(Default, Debug, Clone)] struct MemCollections { kafka_topics: Vec, query_pools: Vec, @@ -60,6 +51,22 @@ struct MemCollections { processed_tombstones: Vec, } +/// transaction bound to an in-memory catalog. 
+#[derive(Debug)] +pub struct MemTxn { + guard: OwnedMutexGuard, + stage: MemCollections, + finalized: bool, +} + +impl Drop for MemTxn { + fn drop(&mut self) { + if !self.finalized { + warn!("Dropping MemTxn w/o finalizing (commit or abort)"); + } + } +} + #[async_trait] impl Catalog for MemCatalog { async fn setup(&self) -> Result<(), Error> { @@ -67,110 +74,95 @@ impl Catalog for MemCatalog { Ok(()) } - fn kafka_topics(&self) -> &dyn KafkaTopicRepo { - self - } - - fn query_pools(&self) -> &dyn QueryPoolRepo { - self - } - - fn namespaces(&self) -> &dyn NamespaceRepo { - self - } - - fn tables(&self) -> &dyn TableRepo { - self - } - - fn columns(&self) -> &dyn ColumnRepo { - self - } - - fn sequencers(&self) -> &dyn SequencerRepo { - self - } - - fn partitions(&self) -> &dyn PartitionRepo { - self - } - - fn tombstones(&self) -> &dyn TombstoneRepo { - self - } - - fn parquet_files(&self) -> &dyn ParquetFileRepo { - self - } - - fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo { - self - } - - async fn add_parquet_file_with_tombstones( - &self, - parquet_file: &ParquetFile, - tombstones: &[Tombstone], - ) -> Result<(ParquetFile, Vec), Error> { - // The activities in this file must either be all succeed or all fail - - // Create a parquet file in the catalog first - let parquet = self - .parquet_files() - .create( - None, - parquet_file.sequencer_id, - parquet_file.table_id, - parquet_file.partition_id, - parquet_file.object_store_id, - parquet_file.min_sequence_number, - parquet_file.max_sequence_number, - parquet_file.min_time, - parquet_file.max_time, - ) - .await?; - - // Now the parquet available, let create its dependent processed tombstones - let processed_tombstones = self - .processed_tombstones() - .create_many(None, parquet.id, tombstones) - .await; - - if let Err(error) = processed_tombstones { - // failed to insert processed tombstone, remove the above - // inserted parquet file from the catalog - self.remove_parquet_file(parquet.object_store_id); - return Err(error); - } - let processed_tombstones = processed_tombstones.unwrap(); - - Ok((parquet, processed_tombstones)) + async fn start_transaction(&self) -> Result, Error> { + let guard = Arc::clone(&self.collections).lock_owned().await; + let stage = guard.clone(); + Ok(Box::new(MemTxn { + guard, + stage, + finalized: false, + })) } } #[async_trait] -impl KafkaTopicRepo for MemCatalog { - async fn create_or_get(&self, name: &str) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); +impl TransactionFinalize for MemTxn { + async fn commit_inplace(&mut self) -> Result<(), Error> { + *self.guard = std::mem::take(&mut self.stage); + self.finalized = true; + Ok(()) + } - let topic = match collections.kafka_topics.iter().find(|t| t.name == name) { + async fn abort_inplace(&mut self) -> Result<(), Error> { + self.finalized = true; + Ok(()) + } +} + +#[async_trait] +impl Transaction for MemTxn { + fn kafka_topics(&mut self) -> &mut dyn KafkaTopicRepo { + self + } + + fn query_pools(&mut self) -> &mut dyn QueryPoolRepo { + self + } + + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn sequencers(&mut self) -> &mut dyn SequencerRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn tombstones(&mut self) -> &mut dyn TombstoneRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + 
} + + fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo { + self + } +} + +#[async_trait] +impl KafkaTopicRepo for MemTxn { + async fn create_or_get(&mut self, name: &str) -> Result { + let topic = match self.stage.kafka_topics.iter().find(|t| t.name == name) { Some(t) => t, None => { let topic = KafkaTopic { - id: KafkaTopicId::new(collections.kafka_topics.len() as i32 + 1), + id: KafkaTopicId::new(self.stage.kafka_topics.len() as i32 + 1), name: name.to_string(), }; - collections.kafka_topics.push(topic); - collections.kafka_topics.last().unwrap() + self.stage.kafka_topics.push(topic); + self.stage.kafka_topics.last().unwrap() } }; Ok(topic.clone()) } - async fn get_by_name(&self, name: &str) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let kafka_topic = collections + async fn get_by_name(&mut self, name: &str) -> Result> { + let kafka_topic = self + .stage .kafka_topics .iter() .find(|t| t.name == name) @@ -180,19 +172,17 @@ impl KafkaTopicRepo for MemCatalog { } #[async_trait] -impl QueryPoolRepo for MemCatalog { - async fn create_or_get(&self, name: &str) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - - let pool = match collections.query_pools.iter().find(|t| t.name == name) { +impl QueryPoolRepo for MemTxn { + async fn create_or_get(&mut self, name: &str) -> Result { + let pool = match self.stage.query_pools.iter().find(|t| t.name == name) { Some(t) => t, None => { let pool = QueryPool { - id: QueryPoolId::new(collections.query_pools.len() as i16 + 1), + id: QueryPoolId::new(self.stage.query_pools.len() as i16 + 1), name: name.to_string(), }; - collections.query_pools.push(pool); - collections.query_pools.last().unwrap() + self.stage.query_pools.push(pool); + self.stage.query_pools.last().unwrap() } }; @@ -201,35 +191,34 @@ impl QueryPoolRepo for MemCatalog { } #[async_trait] -impl NamespaceRepo for MemCatalog { +impl NamespaceRepo for MemTxn { async fn create( - &self, + &mut self, name: &str, retention_duration: &str, kafka_topic_id: KafkaTopicId, query_pool_id: QueryPoolId, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - if collections.namespaces.iter().any(|n| n.name == name) { + if self.stage.namespaces.iter().any(|n| n.name == name) { return Err(Error::NameExists { name: name.to_string(), }); } let namespace = Namespace { - id: NamespaceId::new(collections.namespaces.len() as i32 + 1), + id: NamespaceId::new(self.stage.namespaces.len() as i32 + 1), name: name.to_string(), kafka_topic_id, query_pool_id, retention_duration: Some(retention_duration.to_string()), }; - collections.namespaces.push(namespace); - Ok(collections.namespaces.last().unwrap().clone()) + self.stage.namespaces.push(namespace); + Ok(self.stage.namespaces.last().unwrap().clone()) } - async fn get_by_name(&self, name: &str) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - Ok(collections + async fn get_by_name(&mut self, name: &str) -> Result> { + Ok(self + .stage .namespaces .iter() .find(|n| n.name == name) @@ -238,11 +227,10 @@ impl NamespaceRepo for MemCatalog { } #[async_trait] -impl TableRepo for MemCatalog { - async fn create_or_get(&self, name: &str, namespace_id: NamespaceId) -> Result
{ - let mut collections = self.collections.lock().expect("mutex poisoned"); - - let table = match collections +impl TableRepo for MemTxn { + async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result
{ + let table = match self + .stage .tables .iter() .find(|t| t.name == name && t.namespace_id == namespace_id) @@ -250,21 +238,21 @@ impl TableRepo for MemCatalog { Some(t) => t, None => { let table = Table { - id: TableId::new(collections.tables.len() as i32 + 1), + id: TableId::new(self.stage.tables.len() as i32 + 1), namespace_id, name: name.to_string(), }; - collections.tables.push(table); - collections.tables.last().unwrap() + self.stage.tables.push(table); + self.stage.tables.last().unwrap() } }; Ok(table.clone()) } - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let tables: Vec<_> = collections + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let tables: Vec<_> = self + .stage .tables .iter() .filter(|t| t.namespace_id == namespace_id) @@ -275,16 +263,15 @@ impl TableRepo for MemCatalog { } #[async_trait] -impl ColumnRepo for MemCatalog { +impl ColumnRepo for MemTxn { async fn create_or_get( - &self, + &mut self, name: &str, table_id: TableId, column_type: ColumnType, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - - let column = match collections + let column = match self + .stage .columns .iter() .find(|t| t.name == name && t.table_id == table_id) @@ -302,31 +289,31 @@ impl ColumnRepo for MemCatalog { } None => { let column = Column { - id: ColumnId::new(collections.columns.len() as i32 + 1), + id: ColumnId::new(self.stage.columns.len() as i32 + 1), table_id, name: name.to_string(), column_type: column_type as i16, }; - collections.columns.push(column); - collections.columns.last().unwrap() + self.stage.columns.push(column); + self.stage.columns.last().unwrap() } }; Ok(column.clone()) } - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - - let table_ids: Vec<_> = collections + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let table_ids: Vec<_> = self + .stage .tables .iter() .filter(|t| t.namespace_id == namespace_id) .map(|t| t.id) .collect(); - println!("tables: {:?}", collections.tables); + println!("tables: {:?}", self.stage.tables); println!("table_ids: {:?}", table_ids); - let columns: Vec<_> = collections + let columns: Vec<_> = self + .stage .columns .iter() .filter(|c| table_ids.contains(&c.table_id)) @@ -338,15 +325,14 @@ impl ColumnRepo for MemCatalog { } #[async_trait] -impl SequencerRepo for MemCatalog { +impl SequencerRepo for MemTxn { async fn create_or_get( - &self, + &mut self, topic: &KafkaTopic, partition: KafkaPartition, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - - let sequencer = match collections + let sequencer = match self + .stage .sequencers .iter() .find(|s| s.kafka_topic_id == topic.id && s.kafka_partition == partition) @@ -354,13 +340,13 @@ impl SequencerRepo for MemCatalog { Some(t) => t, None => { let sequencer = Sequencer { - id: SequencerId::new(collections.sequencers.len() as i16 + 1), + id: SequencerId::new(self.stage.sequencers.len() as i16 + 1), kafka_topic_id: topic.id, kafka_partition: partition, min_unpersisted_sequence_number: 0, }; - collections.sequencers.push(sequencer); - collections.sequencers.last().unwrap() + self.stage.sequencers.push(sequencer); + self.stage.sequencers.last().unwrap() } }; @@ -368,12 +354,12 @@ impl SequencerRepo for MemCatalog { } async fn 
get_by_topic_id_and_partition( - &self, + &mut self, topic_id: KafkaTopicId, partition: KafkaPartition, ) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let sequencer = collections + let sequencer = self + .stage .sequencers .iter() .find(|s| s.kafka_topic_id == topic_id && s.kafka_partition == partition) @@ -381,14 +367,13 @@ impl SequencerRepo for MemCatalog { Ok(sequencer) } - async fn list(&self) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - Ok(collections.sequencers.clone()) + async fn list(&mut self) -> Result> { + Ok(self.stage.sequencers.clone()) } - async fn list_by_kafka_topic(&self, topic: &KafkaTopic) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let sequencers: Vec<_> = collections + async fn list_by_kafka_topic(&mut self, topic: &KafkaTopic) -> Result> { + let sequencers: Vec<_> = self + .stage .sequencers .iter() .filter(|s| s.kafka_topic_id == topic.id) @@ -399,36 +384,35 @@ impl SequencerRepo for MemCatalog { } #[async_trait] -impl PartitionRepo for MemCatalog { +impl PartitionRepo for MemTxn { async fn create_or_get( - &self, + &mut self, key: &str, sequencer_id: SequencerId, table_id: TableId, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - let partition = match collections.partitions.iter().find(|p| { + let partition = match self.stage.partitions.iter().find(|p| { p.partition_key == key && p.sequencer_id == sequencer_id && p.table_id == table_id }) { Some(p) => p, None => { let p = Partition { - id: PartitionId::new(collections.partitions.len() as i64 + 1), + id: PartitionId::new(self.stage.partitions.len() as i64 + 1), sequencer_id, table_id, partition_key: key.to_string(), }; - collections.partitions.push(p); - collections.partitions.last().unwrap() + self.stage.partitions.push(p); + self.stage.partitions.last().unwrap() } }; Ok(partition.clone()) } - async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let partitions: Vec<_> = collections + async fn list_by_sequencer(&mut self, sequencer_id: SequencerId) -> Result> { + let partitions: Vec<_> = self + .stage .partitions .iter() .filter(|p| p.sequencer_id == sequencer_id) @@ -438,24 +422,26 @@ impl PartitionRepo for MemCatalog { } async fn partition_info_by_id( - &self, + &mut self, partition_id: PartitionId, ) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let partition = collections + let partition = self + .stage .partitions .iter() .find(|p| p.id == partition_id) .cloned(); if let Some(partition) = partition { - let table = collections + let table = self + .stage .tables .iter() .find(|t| t.id == partition.table_id) .cloned(); if let Some(table) = table { - let namespace = collections + let namespace = self + .stage .namespaces .iter() .find(|n| n.id == table.namespace_id) @@ -475,9 +461,9 @@ impl PartitionRepo for MemCatalog { } #[async_trait] -impl TombstoneRepo for MemCatalog { +impl TombstoneRepo for MemTxn { async fn create_or_get( - &self, + &mut self, table_id: TableId, sequencer_id: SequencerId, sequence_number: SequenceNumber, @@ -485,8 +471,7 @@ impl TombstoneRepo for MemCatalog { max_time: Timestamp, predicate: &str, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - let tombstone = match collections.tombstones.iter().find(|t| { + let tombstone = match 
self.stage.tombstones.iter().find(|t| { t.table_id == table_id && t.sequencer_id == sequencer_id && t.sequence_number == sequence_number @@ -494,7 +479,7 @@ impl TombstoneRepo for MemCatalog { Some(t) => t, None => { let t = Tombstone { - id: TombstoneId::new(collections.tombstones.len() as i64 + 1), + id: TombstoneId::new(self.stage.tombstones.len() as i64 + 1), table_id, sequencer_id, sequence_number, @@ -502,8 +487,8 @@ impl TombstoneRepo for MemCatalog { max_time, serialized_predicate: predicate.to_string(), }; - collections.tombstones.push(t); - collections.tombstones.last().unwrap() + self.stage.tombstones.push(t); + self.stage.tombstones.last().unwrap() } }; @@ -511,12 +496,12 @@ impl TombstoneRepo for MemCatalog { } async fn list_tombstones_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let tombstones: Vec<_> = collections + let tombstones: Vec<_> = self + .stage .tombstones .iter() .filter(|t| t.sequencer_id == sequencer_id && t.sequence_number > sequence_number) @@ -527,10 +512,9 @@ impl TombstoneRepo for MemCatalog { } #[async_trait] -impl ParquetFileRepo for MemCatalog { +impl ParquetFileRepo for MemTxn { async fn create( - &self, - _txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -540,8 +524,8 @@ impl ParquetFileRepo for MemCatalog { min_time: Timestamp, max_time: Timestamp, ) -> Result { - let mut collections = self.collections.lock().expect("mutex poisoned"); - if collections + if self + .stage .parquet_files .iter() .any(|f| f.object_store_id == object_store_id) @@ -550,7 +534,7 @@ impl ParquetFileRepo for MemCatalog { } let parquet_file = ParquetFile { - id: ParquetFileId::new(collections.parquet_files.len() as i64 + 1), + id: ParquetFileId::new(self.stage.parquet_files.len() as i64 + 1), sequencer_id, table_id, partition_id, @@ -561,14 +545,12 @@ impl ParquetFileRepo for MemCatalog { max_time, to_delete: false, }; - collections.parquet_files.push(parquet_file); - Ok(*collections.parquet_files.last().unwrap()) + self.stage.parquet_files.push(parquet_file); + Ok(*self.stage.parquet_files.last().unwrap()) } - async fn flag_for_delete(&self, id: ParquetFileId) -> Result<()> { - let mut collections = self.collections.lock().expect("mutex poisoned"); - - match collections.parquet_files.iter_mut().find(|p| p.id == id) { + async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()> { + match self.stage.parquet_files.iter_mut().find(|p| p.id == id) { Some(f) => f.to_delete = true, None => return Err(Error::ParquetRecordNotFound { id }), } @@ -577,12 +559,12 @@ impl ParquetFileRepo for MemCatalog { } async fn list_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result> { - let collections = self.collections.lock().expect("mutex poisoned"); - let files: Vec<_> = collections + let files: Vec<_> = self + .stage .parquet_files .iter() .filter(|f| f.sequencer_id == sequencer_id && f.max_sequence_number > sequence_number) @@ -591,14 +573,12 @@ impl ParquetFileRepo for MemCatalog { Ok(files) } - async fn exist(&self, id: ParquetFileId) -> Result { - let collections = self.collections.lock().expect("mutex poisoned"); - Ok(collections.parquet_files.iter().any(|f| f.id == id)) + async fn exist(&mut self, id: ParquetFileId) -> Result { + Ok(self.stage.parquet_files.iter().any(|f| f.id == 
id)) } - async fn count(&self) -> Result { - let collections = self.collections.lock().expect("mutex poisoned"); - let count = collections.parquet_files.len(); + async fn count(&mut self) -> Result { + let count = self.stage.parquet_files.len(); let count_i64 = i64::try_from(count); if count_i64.is_err() { return Err(Error::InvalidValue { value: count }); @@ -608,17 +588,15 @@ impl ParquetFileRepo for MemCatalog { } #[async_trait] -impl ProcessedTombstoneRepo for MemCatalog { +impl ProcessedTombstoneRepo for MemTxn { async fn create_many( - &self, - _txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, parquet_file_id: ParquetFileId, tombstones: &[Tombstone], ) -> Result> { - let mut collections = self.collections.lock().expect("mutex poisoned"); - // check if the parquet file available - if !collections + if !self + .stage .parquet_files .iter() .any(|f| f.id == parquet_file_id) @@ -631,13 +609,14 @@ impl ProcessedTombstoneRepo for MemCatalog { let mut processed_tombstones = vec![]; for tombstone in tombstones { // check if tomstone exists - if !collections.tombstones.iter().any(|f| f.id == tombstone.id) { + if !self.stage.tombstones.iter().any(|f| f.id == tombstone.id) { return Err(Error::TombstoneNotFound { id: tombstone.id.get(), }); } - if collections + if self + .stage .processed_tombstones .iter() .any(|pt| pt.tombstone_id == tombstone.id && pt.parquet_file_id == parquet_file_id) @@ -660,7 +639,7 @@ impl ProcessedTombstoneRepo for MemCatalog { let return_processed_tombstones = processed_tombstones.clone(); // Add to the catalog - collections + self.stage .processed_tombstones .append(&mut processed_tombstones); @@ -668,20 +647,19 @@ impl ProcessedTombstoneRepo for MemCatalog { } async fn exist( - &self, + &mut self, parquet_file_id: ParquetFileId, tombstone_id: TombstoneId, ) -> Result { - let collections = self.collections.lock().expect("mutex poisoned"); - Ok(collections + Ok(self + .stage .processed_tombstones .iter() .any(|f| f.parquet_file_id == parquet_file_id && f.tombstone_id == tombstone_id)) } - async fn count(&self) -> Result { - let collections = self.collections.lock().expect("mutex poisoned"); - let count = collections.processed_tombstones.len(); + async fn count(&mut self) -> Result { + let count = self.stage.processed_tombstones.len(); let count_i64 = i64::try_from(count); if count_i64.is_err() { return Err(Error::InvalidValue { value: count }); diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 078bdd8f8f..7fa63859cb 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1,18 +1,16 @@ //! 
A Postgres backed implementation of the Catalog use crate::interface::{ - Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId, - KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, ParquetFileId, - ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo, ProcessedTombstone, - ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, SequenceNumber, - Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, Tombstone, - TombstoneId, TombstoneRepo, + sealed::TransactionFinalize, Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition, + KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, + ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo, + ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result, + SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp, + Tombstone, TombstoneId, TombstoneRepo, Transaction, }; use async_trait::async_trait; use observability_deps::tracing::{info, warn}; -use sqlx::{ - migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Row, Transaction, -}; +use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Row}; use std::time::Duration; use uuid::Uuid; @@ -24,7 +22,7 @@ pub const SCHEMA_NAME: &str = "iox_catalog"; static MIGRATOR: Migrator = sqlx::migrate!(); -/// In-memory catalog that implements the `RepoCollection` and individual repo traits. +/// PostgreSQL catalog. #[derive(Debug)] pub struct PostgresCatalog { pool: Pool, @@ -72,6 +70,50 @@ impl PostgresCatalog { } } +/// transaction for [`PostgresCatalog`]. +#[derive(Debug)] +pub struct PostgresTxn { + transaction: Option>, +} + +impl PostgresTxn { + fn transaction(&mut self) -> &mut sqlx::Transaction<'static, Postgres> { + self.transaction.as_mut().expect("Not yet finalized") + } +} + +impl Drop for PostgresTxn { + fn drop(&mut self) { + if self.transaction.is_some() { + warn!("Dropping PostgresTxn w/o finalizing (commit or abort)"); + + // SQLx ensures that the inner transaction enqueues a rollback when it is dropped, so we don't need to spawn + // a task here to call `rollback` manually. 
+ } + } +} + +#[async_trait] +impl TransactionFinalize for PostgresTxn { + async fn commit_inplace(&mut self) -> Result<(), Error> { + self.transaction + .take() + .expect("Not yet finalized") + .commit() + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn abort_inplace(&mut self) -> Result<(), Error> { + self.transaction + .take() + .expect("Not yet finalized") + .rollback() + .await + .map_err(|e| Error::SqlxError { source: e }) + } +} + #[async_trait] impl Catalog for PostgresCatalog { async fn setup(&self) -> Result<(), Error> { @@ -83,111 +125,65 @@ impl Catalog for PostgresCatalog { Ok(()) } - fn kafka_topics(&self) -> &dyn KafkaTopicRepo { - self - } + async fn start_transaction(&self) -> Result, Error> { + let transaction = self + .pool + .begin() + .await + .map_err(|e| Error::SqlxError { source: e })?; - fn query_pools(&self) -> &dyn QueryPoolRepo { - self - } - - fn namespaces(&self) -> &dyn NamespaceRepo { - self - } - - fn tables(&self) -> &dyn TableRepo { - self - } - - fn columns(&self) -> &dyn ColumnRepo { - self - } - - fn sequencers(&self) -> &dyn SequencerRepo { - self - } - - fn partitions(&self) -> &dyn PartitionRepo { - self - } - - fn tombstones(&self) -> &dyn TombstoneRepo { - self - } - - fn parquet_files(&self) -> &dyn ParquetFileRepo { - self - } - - fn processed_tombstones(&self) -> &dyn ProcessedTombstoneRepo { - self - } - - async fn add_parquet_file_with_tombstones( - &self, - parquet_file: &ParquetFile, - tombstones: &[Tombstone], - ) -> Result<(ParquetFile, Vec), Error> { - // Start a transaction - let txt = self.pool.begin().await; - if let Err(error) = txt { - return Err(Error::StartTransaction { source: error }); - } - let mut txt = txt.unwrap(); - - // create a parquet file in the catalog first - let parquet = self - .parquet_files() - .create( - Some(&mut txt), - parquet_file.sequencer_id, - parquet_file.table_id, - parquet_file.partition_id, - parquet_file.object_store_id, - parquet_file.min_sequence_number, - parquet_file.max_sequence_number, - parquet_file.min_time, - parquet_file.max_time, - ) - .await; - - if let Err(error) = parquet { - // Error while adding parquet file into the catalog, stop the transaction - warn!(object_store_id=?parquet_file.object_store_id.to_string(), "{}", error.to_string()); - let _rollback = txt.rollback().await; - return Err(error); - } - let parquet = parquet.unwrap(); - - // Now the parquet available, create its processed tombstones - let processed_tombstones = self - .processed_tombstones() - .create_many(Some(&mut txt), parquet.id, tombstones) - .await; - - let processed_tombstones = match processed_tombstones { - Ok(processed_tombstones) => processed_tombstones, - Err(e) => { - // Error while adding processed tombstones - warn!( - "Error while adding processed tombstone: {}. 
Transaction stops.", - e.to_string() - ); - let _rollback = txt.rollback().await; - return Err(e); - } - }; - - // Commit the transaction - let _commit = txt.commit().await; - - Ok((parquet, processed_tombstones)) + Ok(Box::new(PostgresTxn { + transaction: Some(transaction), + })) } } #[async_trait] -impl KafkaTopicRepo for PostgresCatalog { - async fn create_or_get(&self, name: &str) -> Result { +impl Transaction for PostgresTxn { + fn kafka_topics(&mut self) -> &mut dyn KafkaTopicRepo { + self + } + + fn query_pools(&mut self) -> &mut dyn QueryPoolRepo { + self + } + + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn sequencers(&mut self) -> &mut dyn SequencerRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn tombstones(&mut self) -> &mut dyn TombstoneRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } + + fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo { + self + } +} + +#[async_trait] +impl KafkaTopicRepo for PostgresTxn { + async fn create_or_get(&mut self, name: &str) -> Result { let rec = sqlx::query_as::<_, KafkaTopic>( r#" INSERT INTO kafka_topic ( name ) @@ -197,21 +193,21 @@ DO UPDATE SET name = kafka_topic.name RETURNING *; "#, ) .bind(&name) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; Ok(rec) } - async fn get_by_name(&self, name: &str) -> Result> { + async fn get_by_name(&mut self, name: &str) -> Result> { let rec = sqlx::query_as::<_, KafkaTopic>( r#" SELECT * FROM kafka_topic WHERE name = $1; "#, ) .bind(&name) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await; if let Err(sqlx::Error::RowNotFound) = rec { @@ -225,8 +221,8 @@ SELECT * FROM kafka_topic WHERE name = $1; } #[async_trait] -impl QueryPoolRepo for PostgresCatalog { - async fn create_or_get(&self, name: &str) -> Result { +impl QueryPoolRepo for PostgresTxn { + async fn create_or_get(&mut self, name: &str) -> Result { let rec = sqlx::query_as::<_, QueryPool>( r#" INSERT INTO query_pool ( name ) @@ -236,7 +232,7 @@ DO UPDATE SET name = query_pool.name RETURNING *; "#, ) .bind(&name) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; @@ -245,9 +241,9 @@ DO UPDATE SET name = query_pool.name RETURNING *; } #[async_trait] -impl NamespaceRepo for PostgresCatalog { +impl NamespaceRepo for PostgresTxn { async fn create( - &self, + &mut self, name: &str, retention_duration: &str, kafka_topic_id: KafkaTopicId, @@ -264,7 +260,7 @@ RETURNING * .bind(&retention_duration) // $2 .bind(kafka_topic_id) // $3 .bind(query_pool_id) // $4 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_unique_violation(&e) { @@ -281,14 +277,14 @@ RETURNING * Ok(rec) } - async fn get_by_name(&self, name: &str) -> Result> { + async fn get_by_name(&mut self, name: &str) -> Result> { let rec = sqlx::query_as::<_, Namespace>( r#" SELECT * FROM namespace WHERE name = $1; "#, ) .bind(&name) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await; if let Err(sqlx::Error::RowNotFound) = rec { @@ -302,8 +298,8 @@ SELECT * FROM namespace WHERE name = $1; } #[async_trait] -impl TableRepo for PostgresCatalog { - async fn create_or_get(&self, name: &str, namespace_id: NamespaceId) -> Result
{ +impl TableRepo for PostgresTxn { + async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result
{ let rec = sqlx::query_as::<_, Table>( r#" INSERT INTO table_name ( name, namespace_id ) @@ -314,7 +310,7 @@ DO UPDATE SET name = table_name.name RETURNING *; ) .bind(&name) // $1 .bind(&namespace_id) // $2 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_fk_violation(&e) { @@ -327,7 +323,7 @@ DO UPDATE SET name = table_name.name RETURNING *; Ok(rec) } - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result> { + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { let rec = sqlx::query_as::<_, Table>( r#" SELECT * FROM table_name @@ -335,7 +331,7 @@ WHERE namespace_id = $1; "#, ) .bind(&namespace_id) - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; @@ -344,9 +340,9 @@ WHERE namespace_id = $1; } #[async_trait] -impl ColumnRepo for PostgresCatalog { +impl ColumnRepo for PostgresTxn { async fn create_or_get( - &self, + &mut self, name: &str, table_id: TableId, column_type: ColumnType, @@ -364,7 +360,7 @@ DO UPDATE SET name = column_name.name RETURNING *; .bind(&name) // $1 .bind(&table_id) // $2 .bind(&ct) // $3 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_fk_violation(&e) { @@ -385,7 +381,7 @@ DO UPDATE SET name = column_name.name RETURNING *; Ok(rec) } - async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result> { + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { let rec = sqlx::query_as::<_, Column>( r#" SELECT column_name.* FROM table_name @@ -394,7 +390,7 @@ WHERE table_name.namespace_id = $1; "#, ) .bind(&namespace_id) - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; @@ -403,9 +399,9 @@ WHERE table_name.namespace_id = $1; } #[async_trait] -impl SequencerRepo for PostgresCatalog { +impl SequencerRepo for PostgresTxn { async fn create_or_get( - &self, + &mut self, topic: &KafkaTopic, partition: KafkaPartition, ) -> Result { @@ -421,7 +417,7 @@ impl SequencerRepo for PostgresCatalog { ) .bind(&topic.id) // $1 .bind(&partition) // $2 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_fk_violation(&e) { @@ -433,7 +429,7 @@ impl SequencerRepo for PostgresCatalog { } async fn get_by_topic_id_and_partition( - &self, + &mut self, topic_id: KafkaTopicId, partition: KafkaPartition, ) -> Result> { @@ -444,7 +440,7 @@ SELECT * FROM sequencer WHERE kafka_topic_id = $1 AND kafka_partition = $2; ) .bind(topic_id) // $1 .bind(partition) // $2 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await; if let Err(sqlx::Error::RowNotFound) = rec { @@ -456,26 +452,26 @@ SELECT * FROM sequencer WHERE kafka_topic_id = $1 AND kafka_partition = $2; Ok(Some(sequencer)) } - async fn list(&self) -> Result> { + async fn list(&mut self) -> Result> { sqlx::query_as::<_, Sequencer>(r#"SELECT * FROM sequencer;"#) - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e }) } - async fn list_by_kafka_topic(&self, topic: &KafkaTopic) -> Result> { + async fn list_by_kafka_topic(&mut self, topic: &KafkaTopic) -> Result> { sqlx::query_as::<_, Sequencer>(r#"SELECT * FROM sequencer WHERE kafka_topic_id = $1;"#) .bind(&topic.id) // $1 - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e }) } } #[async_trait] -impl PartitionRepo for PostgresCatalog { +impl PartitionRepo for PostgresTxn { 
async fn create_or_get( - &self, + &mut self, key: &str, sequencer_id: SequencerId, table_id: TableId, @@ -493,7 +489,7 @@ impl PartitionRepo for PostgresCatalog { .bind(key) // $1 .bind(&sequencer_id) // $2 .bind(&table_id) // $3 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_fk_violation(&e) { @@ -504,16 +500,16 @@ impl PartitionRepo for PostgresCatalog { }) } - async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result> { + async fn list_by_sequencer(&mut self, sequencer_id: SequencerId) -> Result> { sqlx::query_as::<_, Partition>(r#"SELECT * FROM partition WHERE sequencer_id = $1;"#) .bind(&sequencer_id) // $1 - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e }) } async fn partition_info_by_id( - &self, + &mut self, partition_id: PartitionId, ) -> Result> { let info = sqlx::query( @@ -526,7 +522,7 @@ impl PartitionRepo for PostgresCatalog { WHERE partition.id = $1;"#, ) .bind(&partition_id) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; @@ -548,9 +544,9 @@ impl PartitionRepo for PostgresCatalog { } #[async_trait] -impl TombstoneRepo for PostgresCatalog { +impl TombstoneRepo for PostgresTxn { async fn create_or_get( - &self, + &mut self, table_id: TableId, sequencer_id: SequencerId, sequence_number: SequenceNumber, @@ -574,7 +570,7 @@ impl TombstoneRepo for PostgresCatalog { .bind(&min_time) // $4 .bind(&max_time) // $5 .bind(predicate) // $6 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_fk_violation(&e) { @@ -586,24 +582,23 @@ impl TombstoneRepo for PostgresCatalog { } async fn list_tombstones_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result> { sqlx::query_as::<_, Tombstone>(r#"SELECT * FROM tombstone WHERE sequencer_id = $1 AND sequence_number > $2 ORDER BY id;"#) .bind(&sequencer_id) // $1 .bind(&sequence_number) // $2 - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e }) } } #[async_trait] -impl ParquetFileRepo for PostgresCatalog { +impl ParquetFileRepo for PostgresTxn { async fn create( - &self, - txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId, @@ -627,30 +622,28 @@ RETURNING * .bind(min_sequence_number) // $5 .bind(max_sequence_number) // $6 .bind(min_time) // $7 - .bind(max_time); // $8 - - let rec = match txt { - Some(txt) => rec.fetch_one(txt).await, - None => rec.fetch_one(&self.pool).await, - }; - - let rec = rec.map_err(|e| { - if is_unique_violation(&e) { - Error::FileExists { object_store_id } - } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } - } else { - Error::SqlxError { source: e } - } - })?; + .bind(max_time) // $8 + .fetch_one(self.transaction()) + .await + .map_err(|e| { + if is_unique_violation(&e) { + Error::FileExists { + object_store_id, + } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; Ok(rec) } - async fn flag_for_delete(&self, id: ParquetFileId) -> Result<()> { + async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()> { let _ = sqlx::query(r#"UPDATE parquet_file SET to_delete = true WHERE id = $1;"#) .bind(&id) // $1 - .execute(&self.pool) + .execute(self.transaction()) .await .map_err(|e| Error::SqlxError { 
source: e })?; @@ -658,34 +651,34 @@ RETURNING * } async fn list_by_sequencer_greater_than( - &self, + &mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber, ) -> Result> { sqlx::query_as::<_, ParquetFile>(r#"SELECT * FROM parquet_file WHERE sequencer_id = $1 AND max_sequence_number > $2 ORDER BY id;"#) .bind(&sequencer_id) // $1 .bind(&sequence_number) // $2 - .fetch_all(&self.pool) + .fetch_all(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e }) } - async fn exist(&self, id: ParquetFileId) -> Result { + async fn exist(&mut self, id: ParquetFileId) -> Result { let read_result = sqlx::query_as::<_, Count>( r#"SELECT count(*) as count FROM parquet_file WHERE id = $1;"#, ) .bind(&id) // $1 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; Ok(read_result.count > 0) } - async fn count(&self) -> Result { + async fn count(&mut self) -> Result { let read_result = sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM parquet_file;"#) - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; @@ -694,18 +687,12 @@ RETURNING * } #[async_trait] -impl ProcessedTombstoneRepo for PostgresCatalog { +impl ProcessedTombstoneRepo for PostgresTxn { async fn create_many( - &self, - txt: Option<&mut Transaction<'_, Postgres>>, + &mut self, parquet_file_id: ParquetFileId, tombstones: &[Tombstone], ) -> Result> { - if txt.is_none() { - return Err(Error::NoTransaction); - } - let txt = txt.unwrap(); - // no transaction provided // todo: we should never needs this but since right now we implement 2 catalogs, // postgres (for production) and mem (for testing only) that does not need to provide txt @@ -721,7 +708,7 @@ impl ProcessedTombstoneRepo for PostgresCatalog { ) .bind(tombstone.id) // $1 .bind(parquet_file_id) // $2 - .fetch_one(&mut *txt) + .fetch_one(self.transaction()) .await .map_err(|e| { if is_unique_violation(&e) { @@ -743,7 +730,7 @@ impl ProcessedTombstoneRepo for PostgresCatalog { } async fn exist( - &self, + &mut self, parquet_file_id: ParquetFileId, tombstone_id: TombstoneId, ) -> Result { @@ -751,17 +738,17 @@ impl ProcessedTombstoneRepo for PostgresCatalog { r#"SELECT count(*) as count FROM processed_tombstone WHERE parquet_file_id = $1 AND tombstone_id = $2;"#) .bind(&parquet_file_id) // $1 .bind(&tombstone_id) // $2 - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; Ok(read_result.count > 0) } - async fn count(&self) -> Result { + async fn count(&mut self) -> Result { let read_result = sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM processed_tombstone;"#) - .fetch_one(&self.pool) + .fetch_one(self.transaction()) .await .map_err(|e| Error::SqlxError { source: e })?; diff --git a/router2/src/dml_handlers/ns_autocreation.rs b/router2/src/dml_handlers/ns_autocreation.rs index ef161375b4..c1a883cf33 100644 --- a/router2/src/dml_handlers/ns_autocreation.rs +++ b/router2/src/dml_handlers/ns_autocreation.rs @@ -91,8 +91,13 @@ where if self.cache.get_schema(&namespace).is_none() { trace!(%namespace, "namespace auto-create cache miss"); - match self + let mut txn = self .catalog + .start_transaction() + .await + .map_err(NamespaceCreationError::Create)?; + + match txn .namespaces() .create( namespace.as_str(), @@ -103,6 +108,8 @@ where .await { Ok(_) => { + txn.commit().await.map_err(NamespaceCreationError::Create)?; + debug!(%namespace, "created namespace"); } 
Err(iox_catalog::interface::Error::NameExists { .. }) => { @@ -110,9 +117,11 @@ where // namespace, or another thread raced populating the catalog // and beat this thread to it. debug!(%namespace, "spurious namespace create failed"); + txn.abort().await.map_err(NamespaceCreationError::Create)?; } Err(e) => { error!(error=%e, %namespace, "failed to auto-create namespace"); + txn.abort().await.map_err(NamespaceCreationError::Create)?; return Err(NamespaceCreationError::Create(e)); } } @@ -190,15 +199,19 @@ mod tests { // The cache hit should mean the catalog SHOULD NOT see a create request // for the namespace. + let mut txn = catalog + .start_transaction() + .await + .expect("failed to start UoW"); assert!( - catalog - .namespaces() + txn.namespaces() .get_by_name(ns.as_str()) .await .expect("lookup should not error") .is_none(), "expected no request to the catalog" ); + txn.abort().await.expect("failed to abort UoW"); // And the DML handler must be called. assert_matches!(mock_handler.calls().as_slice(), [MockDmlHandlerCall::Write { namespace, .. }] => { @@ -230,12 +243,17 @@ mod tests { // The cache miss should mean the catalog MUST see a create request for // the namespace. - let got = catalog + let mut txn = catalog + .start_transaction() + .await + .expect("failed to start UoW"); + let got = txn .namespaces() .get_by_name(ns.as_str()) .await .expect("lookup should not error") .expect("creation request should be sent to catalog"); + txn.abort().await.expect("failed to abort UoW"); assert_eq!( got, diff --git a/router2/src/dml_handlers/schema_validation.rs b/router2/src/dml_handlers/schema_validation.rs index ce2fd14634..225c5abf0a 100644 --- a/router2/src/dml_handlers/schema_validation.rs +++ b/router2/src/dml_handlers/schema_validation.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{ops::DerefMut, sync::Arc}; use async_trait::async_trait; use data_types::{delete_predicate::DeletePredicate, DatabaseName}; @@ -135,6 +135,12 @@ where batches: HashMap, span_ctx: Option, ) -> Result<(), Self::WriteError> { + let mut txn = self + .catalog + .start_transaction() + .await + .map_err(SchemaError::NamespaceLookup)?; + // Load the namespace schema from the cache, falling back to pulling it // from the global catalog (if it exists). let schema = self.cache.get_schema(&namespace); @@ -143,7 +149,7 @@ where None => { // Pull the schema from the global catalog or error if it does // not exist. - let schema = get_schema_by_name(&namespace, &*self.catalog) + let schema = get_schema_by_name(&namespace, txn.deref_mut()) .await .map_err(|e| { warn!(error=%e, %namespace, "failed to retrieve namespace schema"); @@ -162,7 +168,7 @@ where let maybe_new_schema = validate_or_insert_schema( batches.iter().map(|(k, v)| (k.as_str(), v)), &schema, - &*self.catalog, + txn.deref_mut(), ) .await .map_err(|e| { @@ -171,6 +177,8 @@ where })? .map(Arc::new); + txn.commit().await.map_err(SchemaError::NamespaceLookup)?; + trace!(%namespace, "schema validation complete"); // If the schema has been updated, immediately add it to the cache @@ -246,8 +254,12 @@ mod tests { /// named [`NAMESPACE`]. 
async fn create_catalog() -> Arc { let catalog: Arc = Arc::new(MemCatalog::new()); - catalog - .namespaces() + + let mut txn = catalog + .start_transaction() + .await + .expect("failed to start UoW"); + txn.namespaces() .create( NAMESPACE, "inf", @@ -256,6 +268,8 @@ mod tests { ) .await .expect("failed to create test namespace"); + txn.commit().await.expect("failed to commit UoW"); + catalog } From d986c04421e0d9ba0943c1d93fbb458cbc443c07 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Feb 2022 13:48:44 +0000 Subject: [PATCH 22/30] feat: lazy system tables (#3661) * feat: lazy system tables * chore: review feedback * chore: fmt Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/system_tables.rs | 256 ++++++++++++---------------- db/src/system_tables/chunks.rs | 12 +- db/src/system_tables/columns.rs | 21 ++- db/src/system_tables/operations.rs | 14 +- db/src/system_tables/persistence.rs | 12 +- db/src/system_tables/queries.rs | 53 ++++-- 6 files changed, 195 insertions(+), 173 deletions(-) diff --git a/db/src/system_tables.rs b/db/src/system_tables.rs index 82a61b0edb..1738a408ac 100644 --- a/db/src/system_tables.rs +++ b/db/src/system_tables.rs @@ -8,19 +8,19 @@ //! For example `SELECT * FROM system.chunks` use super::{catalog::Catalog, query_log::QueryLog}; -use arrow::{ - datatypes::{Field, Schema, SchemaRef}, - error::Result, - record_batch::RecordBatch, -}; +use arrow::{datatypes::SchemaRef, error::Result, record_batch::RecordBatch}; use async_trait::async_trait; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::physical_plan::{ + Partitioning, RecordBatchStream, SendableRecordBatchStream, Statistics, +}; use datafusion::{ - catalog::schema::SchemaProvider, - datasource::TableProvider, - error::{DataFusionError, Result as DataFusionResult}, - physical_plan::{memory::MemoryExec, ExecutionPlan}, + catalog::schema::SchemaProvider, datasource::TableProvider, error::Result as DataFusionResult, + physical_plan::ExecutionPlan, }; use job_registry::JobRegistry; +use std::pin::Pin; +use std::task::{Context, Poll}; use std::{any::Any, sync::Arc}; mod chunks; @@ -65,22 +65,22 @@ impl SystemSchemaProvider { ) -> Self { let db_name = db_name.into(); let chunks = Arc::new(SystemTableProvider { - inner: chunks::ChunksTable::new(Arc::clone(&catalog)), + table: Arc::new(chunks::ChunksTable::new(Arc::clone(&catalog))), }); let columns = Arc::new(SystemTableProvider { - inner: columns::ColumnsTable::new(Arc::clone(&catalog)), + table: Arc::new(columns::ColumnsTable::new(Arc::clone(&catalog))), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)), + table: Arc::new(columns::ChunkColumnsTable::new(Arc::clone(&catalog))), }); let operations = Arc::new(SystemTableProvider { - inner: operations::OperationsTable::new(db_name, jobs), + table: Arc::new(operations::OperationsTable::new(db_name, jobs)), }); let persistence_windows = Arc::new(SystemTableProvider { - inner: persistence::PersistenceWindowsTable::new(catalog), + table: Arc::new(persistence::PersistenceWindowsTable::new(catalog)), }); let queries = Arc::new(SystemTableProvider { - inner: queries::QueriesTable::new(query_log), + table: Arc::new(queries::QueriesTable::new(query_log)), }); Self { chunks, @@ -133,21 +133,20 @@ impl SchemaProvider for SystemSchemaProvider { } } +type BatchIterator = Box> + Send + Sync>; + /// The minimal thing that a system table needs to 
implement trait IoxSystemTable: Send + Sync { /// Produce the schema from this system table fn schema(&self) -> SchemaRef; - /// Get the contents of the system table as a single RecordBatch - fn batch(&self) -> Result; + /// Get the contents of the system table + fn scan(&self, batch_size: usize) -> Result; } /// Adapter that makes any `IoxSystemTable` a DataFusion `TableProvider` -struct SystemTableProvider -where - T: IoxSystemTable, -{ - inner: T, +struct SystemTableProvider { + table: Arc, } #[async_trait] @@ -160,7 +159,7 @@ where } fn schema(&self) -> SchemaRef { - self.inner.schema() + self.table.schema() } async fn scan( @@ -170,134 +169,97 @@ where _filters: &[datafusion::logical_plan::Expr], _limit: Option, ) -> DataFusionResult> { - scan_batch(self.inner.batch()?, self.schema(), projection.as_ref()) + let schema = self.table.schema(); + let projected_schema = match projection.as_ref() { + Some(projection) => Arc::new(schema.project(projection)?), + None => schema, + }; + + Ok(Arc::new(SystemTableExecutionPlan { + table: Arc::clone(&self.table), + projection: projection.clone(), + projected_schema, + })) } } -/// Creates a DataFusion ExecutionPlan node that scans a single batch -/// of records. -fn scan_batch( - batch: RecordBatch, - schema: SchemaRef, - projection: Option<&Vec>, -) -> DataFusionResult> { - // apply projection, if any - let (schema, batch) = match projection { - None => (schema, batch), - Some(projection) => { - let projected_columns: DataFusionResult> = projection - .iter() - .map(|i| { - if *i < schema.fields().len() { - Ok(schema.field(*i).clone()) - } else { - Err(DataFusionError::Internal(format!( - "Projection index out of range in ChunksProvider: {}", - i - ))) - } - }) - .collect(); - - let projected_schema = Arc::new(Schema::new(projected_columns?)); - - let columns = projection - .iter() - .map(|i| Arc::clone(batch.column(*i))) - .collect::>(); - - let projected_batch = RecordBatch::try_new(Arc::clone(&projected_schema), columns)?; - (projected_schema, projected_batch) - } - }; - - Ok(Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)?)) +struct SystemTableExecutionPlan { + table: Arc, + projected_schema: SchemaRef, + projection: Option>, } -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{ArrayRef, UInt64Array}; - use arrow_util::assert_batches_eq; - use datafusion_util::test_collect; - - fn seq_array(start: u64, end: u64) -> ArrayRef { - Arc::new(UInt64Array::from_iter_values(start..end)) - } - - #[tokio::test] - async fn test_scan_batch_no_projection() { - let batch = RecordBatch::try_from_iter(vec![ - ("col1", seq_array(0, 3)), - ("col2", seq_array(1, 4)), - ("col3", seq_array(2, 5)), - ("col4", seq_array(3, 6)), - ]) - .unwrap(); - - let projection = None; - let scan = scan_batch(batch.clone(), batch.schema(), projection).unwrap(); - let collected = test_collect(scan).await; - - let expected = vec![ - "+------+------+------+------+", - "| col1 | col2 | col3 | col4 |", - "+------+------+------+------+", - "| 0 | 1 | 2 | 3 |", - "| 1 | 2 | 3 | 4 |", - "| 2 | 3 | 4 | 5 |", - "+------+------+------+------+", - ]; - - assert_batches_eq!(&expected, &collected); - } - - #[tokio::test] - async fn test_scan_batch_good_projection() { - let batch = RecordBatch::try_from_iter(vec![ - ("col1", seq_array(0, 3)), - ("col2", seq_array(1, 4)), - ("col3", seq_array(2, 5)), - ("col4", seq_array(3, 6)), - ]) - .unwrap(); - - let projection = Some(vec![3, 1]); - let scan = scan_batch(batch.clone(), batch.schema(), projection.as_ref()).unwrap(); - 
let collected = test_collect(scan).await; - - let expected = vec![ - "+------+------+", - "| col4 | col2 |", - "+------+------+", - "| 3 | 1 |", - "| 4 | 2 |", - "| 5 | 3 |", - "+------+------+", - ]; - - assert_batches_eq!(&expected, &collected); - } - - #[tokio::test] - async fn test_scan_batch_bad_projection() { - let batch = RecordBatch::try_from_iter(vec![ - ("col1", seq_array(0, 3)), - ("col2", seq_array(1, 4)), - ("col3", seq_array(2, 5)), - ("col4", seq_array(3, 6)), - ]) - .unwrap(); - - // no column idex 5 - let projection = Some(vec![3, 1, 5]); - let result = scan_batch(batch.clone(), batch.schema(), projection.as_ref()); - let err_string = result.unwrap_err().to_string(); - assert!( - err_string - .contains("Internal error: Projection index out of range in ChunksProvider: 5"), - "Actual error: {}", - err_string - ); +impl std::fmt::Debug for SystemTableExecutionPlan { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SystemTableExecutionPlan") + .field("projection", &self.projection) + .finish() + } +} + +#[async_trait] +impl ExecutionPlan for SystemTableExecutionPlan { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.projected_schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + &self, + _children: Vec>, + ) -> DataFusionResult> { + unimplemented!() + } + + async fn execute( + &self, + _partition: usize, + runtime: Arc, + ) -> DataFusionResult { + Ok(Box::pin(SystemTableStream { + projected_schema: Arc::clone(&self.projected_schema), + batches: self.table.scan(runtime.batch_size)?, + projection: self.projection.clone(), + })) + } + + fn statistics(&self) -> Statistics { + Statistics::default() + } +} + +struct SystemTableStream { + projected_schema: SchemaRef, + projection: Option>, + batches: BatchIterator, +} + +impl RecordBatchStream for SystemTableStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.projected_schema) + } +} + +impl futures::Stream for SystemTableStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(self.batches.next().map(|maybe_batch| { + maybe_batch.and_then(|batch| match &self.projection { + Some(projection) => batch.project(projection), + None => Ok(batch), + }) + })) } } diff --git a/db/src/system_tables/chunks.rs b/db/src/system_tables/chunks.rs index 4626f86618..d5fc0fc174 100644 --- a/db/src/system_tables/chunks.rs +++ b/db/src/system_tables/chunks.rs @@ -1,3 +1,4 @@ +use crate::system_tables::BatchIterator; use crate::{catalog::Catalog, system_tables::IoxSystemTable}; use arrow::{ array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array}, @@ -30,9 +31,14 @@ impl IoxSystemTable for ChunksTable { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) - .log_if_error("system.chunks table") + fn scan(&self, _batch_size: usize) -> Result { + let schema = Arc::clone(&self.schema); + let catalog = Arc::clone(&self.catalog); + + Ok(Box::new(std::iter::once_with(move || { + from_chunk_summaries(schema, catalog.chunk_summaries()) + .log_if_error("system.chunks table") + }))) } } diff --git a/db/src/system_tables/columns.rs b/db/src/system_tables/columns.rs index 2384e1e8dd..f6a92c75b4 100644 --- a/db/src/system_tables/columns.rs +++ b/db/src/system_tables/columns.rs @@ -1,3 +1,4 @@ 
+use crate::system_tables::BatchIterator; use crate::{catalog::Catalog, system_tables::IoxSystemTable}; use arrow::array::UInt32Array; use arrow::{ @@ -33,9 +34,13 @@ impl IoxSystemTable for ColumnsTable { fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - from_partition_summaries(self.schema(), self.catalog.partition_summaries()) - .log_if_error("system.columns table") + fn scan(&self, _batch_size: usize) -> Result { + let schema = Arc::clone(&self.schema); + let catalog = Arc::clone(&self.catalog); + Ok(Box::new(std::iter::once_with(move || { + from_partition_summaries(schema, catalog.partition_summaries()) + .log_if_error("system.columns table") + }))) } } @@ -113,9 +118,13 @@ impl IoxSystemTable for ChunkColumnsTable { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) - .log_if_error("system.column_chunks table") + fn scan(&self, _batch_size: usize) -> Result { + let schema = Arc::clone(&self.schema); + let catalog = Arc::clone(&self.catalog); + Ok(Box::new(std::iter::once_with(move || { + assemble_chunk_columns(schema, catalog.detailed_chunk_summaries()) + .log_if_error("system.column_chunks table") + }))) } } diff --git a/db/src/system_tables/operations.rs b/db/src/system_tables/operations.rs index 2b8b6e7ec9..349f514a5e 100644 --- a/db/src/system_tables/operations.rs +++ b/db/src/system_tables/operations.rs @@ -1,4 +1,4 @@ -use crate::system_tables::IoxSystemTable; +use crate::system_tables::{BatchIterator, IoxSystemTable}; use arrow::{ array::{ArrayRef, StringArray, Time64NanosecondArray, TimestampNanosecondArray}, datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, @@ -34,9 +34,15 @@ impl IoxSystemTable for OperationsTable { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) - .log_if_error("system.operations table") + fn scan(&self, _batch_size: usize) -> Result { + let schema = Arc::clone(&self.schema); + let jobs = Arc::clone(&self.jobs); + let db_name = self.db_name.clone(); + + Ok(Box::new(std::iter::once_with(move || { + from_task_trackers(schema, &db_name, jobs.tracked()) + .log_if_error("system.operations table") + }))) } } diff --git a/db/src/system_tables/persistence.rs b/db/src/system_tables/persistence.rs index 0fdd9658be..5a1655898f 100644 --- a/db/src/system_tables/persistence.rs +++ b/db/src/system_tables/persistence.rs @@ -1,3 +1,4 @@ +use crate::system_tables::BatchIterator; use crate::{catalog::Catalog, system_tables::IoxSystemTable}; use arrow::{ array::{StringArray, TimestampNanosecondArray, UInt64Array}, @@ -31,9 +32,14 @@ impl IoxSystemTable for PersistenceWindowsTable { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - from_write_summaries(self.schema(), self.catalog.persistence_summaries()) - .log_if_error("system.persistence_windows table") + fn scan(&self, _batch_size: usize) -> Result { + let schema = Arc::clone(&self.schema); + let catalog = Arc::clone(&self.catalog); + + Ok(Box::new(std::iter::once_with(move || { + from_write_summaries(schema, catalog.persistence_summaries()) + .log_if_error("system.persistence_windows table") + }))) } } diff --git a/db/src/system_tables/queries.rs b/db/src/system_tables/queries.rs index 14428bf9f5..138299eecf 100644 --- a/db/src/system_tables/queries.rs +++ b/db/src/system_tables/queries.rs @@ -1,3 +1,4 @@ +use crate::system_tables::BatchIterator; use crate::{ query_log::{QueryLog, QueryLogEntry}, 
system_tables::IoxSystemTable, @@ -8,7 +9,7 @@ use arrow::{ error::Result, record_batch::RecordBatch, }; -use data_types::error::ErrorLogger; +use observability_deps::tracing::error; use std::{collections::VecDeque, sync::Arc}; /// Implementation of system.queries table @@ -32,9 +33,27 @@ impl IoxSystemTable for QueriesTable { Arc::clone(&self.schema) } - fn batch(&self) -> Result { - from_query_log_entries(self.schema(), self.query_log.entries()) - .log_if_error("system.chunks table") + fn scan(&self, batch_size: usize) -> Result { + let schema = self.schema(); + let entries = self.query_log.entries(); + let mut offset = 0; + Ok(Box::new(std::iter::from_fn(move || { + if offset >= entries.len() { + return None; + } + + let len = batch_size.min(entries.len() - offset); + match from_query_log_entries(schema.clone(), &entries, offset, len) { + Ok(batch) => { + offset += len; + Some(Ok(batch)) + } + Err(e) => { + error!("Error system.chunks table: {:?}", e); + Some(Err(e)) + } + } + }))) } } @@ -57,26 +76,36 @@ fn queries_schema() -> SchemaRef { fn from_query_log_entries( schema: SchemaRef, - entries: VecDeque>, + entries: &VecDeque>, + offset: usize, + len: usize, ) -> Result { let issue_time = entries .iter() + .skip(offset) + .take(len) .map(|e| e.issue_time) .map(|ts| Some(ts.timestamp_nanos())) .collect::(); let query_type = entries .iter() + .skip(offset) + .take(len) .map(|e| Some(&e.query_type)) .collect::(); let query_text = entries .iter() + .skip(offset) + .take(len) .map(|e| Some(e.query_text.to_string())) .collect::(); let query_runtime = entries .iter() + .skip(offset) + .take(len) .map(|e| e.query_completed_duration().map(|d| d.as_nanos() as i64)) .collect::(); @@ -101,12 +130,15 @@ mod tests { fn test_from_query_log() { let now = Time::from_rfc3339("1996-12-19T16:39:57+00:00").unwrap(); let time_provider = Arc::new(time::MockProvider::new(now)); + let query_log = QueryLog::new(10, Arc::clone(&time_provider) as Arc); query_log.push("sql", Box::new("select * from foo")); time_provider.inc(std::time::Duration::from_secs(24 * 60 * 60)); query_log.push("sql", Box::new("select * from bar")); let read_filter_entry = query_log.push("read_filter", Box::new("json goop")); + let table = QueriesTable::new(Arc::new(query_log)); + let expected = vec![ "+----------------------+-------------+-------------------+--------------------+", "| issue_time | query_type | query_text | completed_duration |", @@ -117,9 +149,9 @@ mod tests { "+----------------------+-------------+-------------------+--------------------+", ]; - let schema = queries_schema(); - let batch = from_query_log_entries(schema.clone(), query_log.entries()).unwrap(); - assert_batches_eq!(&expected, &[batch]); + let entries = table.scan(3).unwrap().collect::>>().unwrap(); + assert_eq!(entries.len(), 1); + assert_batches_eq!(&expected, &entries); // mark one of the queries completed after 4s let now = Time::from_rfc3339("1996-12-20T16:40:01+00:00").unwrap(); @@ -135,7 +167,8 @@ mod tests { "+----------------------+-------------+-------------------+--------------------+", ]; - let batch = from_query_log_entries(schema, query_log.entries()).unwrap(); - assert_batches_eq!(&expected, &[batch]); + let entries = table.scan(2).unwrap().collect::>>().unwrap(); + assert_eq!(entries.len(), 2); + assert_batches_eq!(&expected, &entries); } } From 9a874186d59ed653a0df8e5b7ee94e25cdc6f895 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 13:59:02 +0000 Subject: [PATCH 23/30] 
chore(deps): bump libc from 0.2.116 to 0.2.117 (#3666) Bumps [libc](https://github.com/rust-lang/libc) from 0.2.116 to 0.2.117. - [Release notes](https://github.com/rust-lang/libc/releases) - [Commits](https://github.com/rust-lang/libc/compare/0.2.116...0.2.117) --- updated-dependencies: - dependency-name: libc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 723bb10d5c..b3366d6553 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2449,9 +2449,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.116" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "565dbd88872dbe4cc8a46e527f26483c1d1f7afa6b884a3bd6cd893d4f98da74" +checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" [[package]] name = "libloading" From 3c9a27b31b22fdb9d38fe5877b52ddf6a708e016 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 14:35:34 +0000 Subject: [PATCH 24/30] chore(deps): bump futures-test from 0.3.19 to 0.3.21 (#3667) Bumps [futures-test](https://github.com/rust-lang/futures-rs) from 0.3.19 to 0.3.21. - [Release notes](https://github.com/rust-lang/futures-rs/releases) - [Changelog](https://github.com/rust-lang/futures-rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/futures-rs/compare/0.3.19...0.3.21) --- updated-dependencies: - dependency-name: futures-test dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3366d6553..57e610e32d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1325,9 +1325,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3dda0b6588335f360afc675d0564c17a77a2bda81ca178a4b6081bd86c7f0b" +checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" dependencies = [ "futures-core", "futures-sink", @@ -1335,15 +1335,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c8ff0461b82559810cdccfde3215c3f373807f5e5232b71479bff7bb2583d7" +checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" [[package]] name = "futures-executor" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d6d2ff5bb10fb95c85b8ce46538a2e5f5e7fdc755623a7d4529ab8a4ed9d2a" +checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" dependencies = [ "futures-core", "futures-task", @@ -1363,15 +1363,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f9d34af5a1aac6fb380f735fe510746c38067c5bf16c7fd250280503c971b2" +checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" [[package]] name = "futures-macro" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbd947adfffb0efc70599b3ddcf7b5597bb5fa9e245eb99f62b3a5f7bb8bd3c" +checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" dependencies = [ "proc-macro2", "quote", @@ -1380,21 +1380,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3055baccb68d74ff6480350f8d6eb8fcfa3aa11bdc1a1ae3afdd0514617d508" +checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" [[package]] name = "futures-task" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee7c6485c30167ce4dfb83ac568a849fe53274c831081476ee13e0dce1aad72" +checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" [[package]] name = "futures-test" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e741bc851e1e90ad08901b329389ae77e02d5e9a0ec61955b80834630fbdc2f" +checksum = "8c3e9379dbbfb35dd6df79e895d73c0f75558827fe68eb853b858ff417a8ee98" dependencies = [ "futures-core", "futures-executor", @@ -1409,9 +1409,9 @@ dependencies = [ [[package]] name = "futures-util" -version = "0.3.19" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b5cf40b47a271f77a8b1bec03ca09044d99d2372c0de244e66430761127164" +checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" dependencies = [ "futures-channel", "futures-core", From ba87ae2918f795546615265763cf1f16ef411072 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 14:48:31 +0000 Subject: [PATCH 25/30] chore(deps): bump crc32fast from 1.3.1 to 1.3.2 (#3668) Bumps [crc32fast](https://github.com/srijs/rust-crc32fast) from 1.3.1 to 1.3.2. - [Release notes](https://github.com/srijs/rust-crc32fast/releases) - [Commits](https://github.com/srijs/rust-crc32fast/compare/v1.3.1...v1.3.2) --- updated-dependencies: - dependency-name: crc32fast dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- server/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57e610e32d..2183e6c1aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -678,9 +678,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.3.1" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2209c310e29876f7f0b2721e7e26b84aff178aa3da5d091f9bfbf47669e60e3" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ "cfg-if", ] diff --git a/server/Cargo.toml b/server/Cargo.toml index 9ea66d5e76..ef86852910 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -10,7 +10,7 @@ async-trait = "0.1" bytes = "1.0" chrono = { version = "0.4", default-features = false } cache_loader_async = { version = "0.1.2", features = ["ttl-cache"] } -crc32fast = "1.3.0" +crc32fast = "1.3.2" data_types = { path = "../data_types" } db = { path = "../db" } futures = "0.3" From 9369152096b16352766daac5d293804fe151887b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:12:39 +0000 Subject: [PATCH 26/30] chore(deps): bump tracing-core from 0.1.21 to 0.1.22 (#3675) Bumps [tracing-core](https://github.com/tokio-rs/tracing) from 0.1.21 to 0.1.22. - [Release notes](https://github.com/tokio-rs/tracing/releases) - [Commits](https://github.com/tokio-rs/tracing/compare/tracing-core-0.1.21...tracing-core-0.1.22) --- updated-dependencies: - dependency-name: tracing-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 11 +++++++++-- workspace-hack/Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2183e6c1aa..c2061fe609 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5414,11 +5414,12 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4" +checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" dependencies = [ "lazy_static", + "valuable", ] [[package]] @@ -5604,6 +5605,12 @@ dependencies = [ "getrandom", ] +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index b6c98e5fb8..5a46177403 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -55,7 +55,7 @@ tokio-stream = { version = "0.1", features = ["fs", "net", "time"] } tokio-util = { version = "0.6", features = ["codec", "io", "slab", "time"] } tower = { version = "0.4", features = ["balance", "buffer", "discover", "futures-util", "indexmap", "limit", "load", "log", "make", "rand", "ready-cache", "slab", "timeout", "tokio", "tokio-stream", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "max_level_trace", "release_max_level_debug", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std"] } +tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } tracing-log = { version = "0.1", features = ["log-tracer", "std", "trace-logger"] } tracing-subscriber = { version = "0.3", features = ["alloc", "ansi", "ansi_term", "env-filter", "fmt", "json", "lazy_static", "matchers", "regex", "registry", "serde", "serde_json", "sharded-slab", "smallvec", "std", "thread_local", "tracing", "tracing-log", "tracing-serde"] } uuid = { version = "0.8", features = ["getrandom", "std", "v4"] } From 59b2141c0b542723171fee25f738c710557e9843 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Tue, 8 Feb 2022 10:23:40 -0500 Subject: [PATCH 27/30] feat: Add lifecycle manager to ingester (#3645) This adds the lifecycle manager to the ingester. It will trigger based on a threshold for max partition size or age or based on keeping total memory under a certain threshold. It defines a new interface for a persister, which is stubbed out for IngesterData. I'm not sure yet how persistence errors should be handled. The assumption here is that the persister continues to retry persistence forever until it succeeds. There is one scenario I can think of that may cause this lifecycle manager problems. If a single partition is very high throughput, it could cause things to back up as persistence is not parallelized within a single partition. Any given partition can currently only run one persistence operation at a time. We can address this later. 
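A condensed sketch of that policy, using a hypothetical should_persist helper and plain
std types rather than the crate's Time/TimeProvider and PartitionLifecycleStats (the real
logic lives in maybe_persist in lifecycle.rs below):

    use std::time::{Duration, Instant};

    // A partition is persisted once it is too old or too large; the manager then keeps
    // persisting the largest remaining partitions until total buffered bytes fall below
    // the persist-memory threshold.
    struct PartitionStats {
        first_write: Instant,
        bytes_written: usize,
    }

    fn should_persist(
        stats: &PartitionStats,
        now: Instant,
        age_threshold: Duration,
        size_threshold: usize,
    ) -> bool {
        let aged_out = now.duration_since(stats.first_write) > age_threshold;
        let sized_out = stats.bytes_written > size_threshold;
        aged_out || sized_out
    }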
Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 1 + influxdb_iox/src/commands/run/ingester.rs | 48 +++ ingester/Cargo.toml | 1 + ingester/src/data.rs | 20 + ingester/src/handler.rs | 26 +- ingester/src/lib.rs | 1 + ingester/src/lifecycle.rs | 475 ++++++++++++++++++++++ 7 files changed, 569 insertions(+), 3 deletions(-) create mode 100644 ingester/src/lifecycle.rs diff --git a/Cargo.lock b/Cargo.lock index c2061fe609..a4e9ae8946 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2021,6 +2021,7 @@ version = "0.1.0" dependencies = [ "arrow", "arrow_util", + "async-trait", "base64 0.13.0", "bytes", "chrono", diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index ce5595b46d..0a008a8803 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -14,6 +14,7 @@ use crate::{ }; use ingester::{ handler::IngestHandlerImpl, + lifecycle::LifecycleConfig, server::{grpc::GrpcDelegate, http::HttpDelegate, IngesterServer}, }; use iox_catalog::interface::KafkaPartition; @@ -22,6 +23,7 @@ use observability_deps::tracing::*; use std::collections::BTreeMap; use std::convert::TryFrom; use std::sync::Arc; +use std::time::Duration; use thiserror::Error; #[derive(Debug, Error)] @@ -93,6 +95,45 @@ pub struct Config { env = "INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END" )] pub write_buffer_partition_range_end: i32, + + /// The ingester will continue to pull data and buffer it from Kafka + /// as long as it is below this size. If it hits this size it will pause + /// ingest from Kafka until persistence goes below this threshold. + #[clap( + long = "--pause-ingest-size-bytes", + env = "INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES" + )] + pub pause_ingest_size_bytes: usize, + + /// Once the ingester crosses this threshold of data buffered across + /// all sequencers, it will pick the largest partitions and persist + /// them until it falls below this threshold. An ingester running in + /// a steady state is expected to take up this much memory. + #[clap( + long = "--persist-memory-threshold-bytes", + env = "INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES" + )] + pub persist_memory_threshold_bytes: usize, + + /// If an individual partition crosses this size threshold, it will be persisted. + /// The default value is 300MB (in bytes). + #[clap( + long = "--persist-partition-size-threshold-bytes", + env = "INFLUXDB_IOX_PERSIST_PARTITION_SIZE_THRESHOLD_BYTES", + default_value = "314572800" + )] + pub persist_partition_size_threshold_bytes: usize, + + /// If a partition has had data buffered for longer than this period of time + /// it will be persisted. This puts an upper bound on how far back the + /// ingester may need to read in Kafka on restart or recovery. The default value + /// is 30 minutes (in seconds). 
+ #[clap( + long = "--persist-partition-age-threshold-seconds", + env = "INFLUXDB_IOX_PERSIST_PARTITION_AGE_THRESHOLD_SECONDS", + default_value = "1800" + )] + pub persist_partition_age_threshold_seconds: u64, } pub async fn command(config: Config) -> Result<()> { @@ -140,8 +181,15 @@ pub async fn command(config: Config) -> Result<()> { .reading(Arc::clone(&metric_registry), trace_collector.clone()) .await?; + let lifecycle_config = LifecycleConfig::new( + config.pause_ingest_size_bytes, + config.persist_memory_threshold_bytes, + config.persist_partition_size_threshold_bytes, + Duration::from_secs(config.persist_partition_age_threshold_seconds), + ); let ingest_handler = Arc::new( IngestHandlerImpl::new( + lifecycle_config, kafka_topic, sequencers, catalog, diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index c40009c2a7..bf4927ed25 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [dependencies] arrow = { version = "8.0", features = ["prettyprint"] } arrow_util = { path = "../arrow_util" } +async-trait = "0.1.42" base64 = "0.13" bytes = "1.0" datafusion = { path = "../datafusion" } diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 95af014bd7..75e5f2ff0f 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -3,6 +3,7 @@ use arrow::record_batch::RecordBatch; use data_types::delete_predicate::DeletePredicate; +use async_trait::async_trait; use chrono::{format::StrftimeItems, TimeZone, Utc}; use dml::DmlOperation; use iox_catalog::interface::{ @@ -100,6 +101,25 @@ impl IngesterData { } } +/// The Persister has a single function that will persist a given partition Id. It is expected +/// that the persist function will retry forever until it succeeds. +#[async_trait] +pub(crate) trait Persister: Send + Sync + 'static { + async fn persist(&self, partition_id: PartitionId); +} + +#[async_trait] +impl Persister for IngesterData { + async fn persist(&self, _partition_id: PartitionId) { + // lookup the TableData + // let persisting_batch = table_data.create_persisting_batch(partition.partition_key); + // do the persist with this persisting batch + // update the catalog + // table_data.clear_persisting_batch() (behind the scenes this will remove the persisting batch + // and if the partition is empty, remove it from the map in table_data) + } +} + /// Data of a Shard #[derive(Default)] pub struct SequencerData { diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index 18a100f132..62e6fb7244 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -3,7 +3,10 @@ use iox_catalog::interface::{Catalog, KafkaPartition, KafkaTopic, Sequencer, SequencerId}; use object_store::ObjectStore; -use crate::data::{IngesterData, SequencerData}; +use crate::{ + data::{IngesterData, SequencerData}, + lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager}, +}; use db::write_buffer::metrics::{SequencerMetrics, WriteBufferIngestMetrics}; use futures::StreamExt; use observability_deps::tracing::{debug, warn}; @@ -14,6 +17,7 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; +use time::SystemProvider; use tokio::task::JoinHandle; use trace::span::SpanRecorder; use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; @@ -49,11 +53,11 @@ pub struct IngestHandlerImpl { #[allow(dead_code)] kafka_topic: KafkaTopic, /// Future that resolves when the background worker exits - #[allow(dead_code)] join_handles: Vec>, /// The cache and buffered data for the ingester - #[allow(dead_code)] data: Arc, + /// 
The lifecycle manager, keeping state of partitions across all sequencers + lifecycle_manager: Arc, } impl std::fmt::Debug for IngestHandlerImpl { @@ -65,6 +69,7 @@ impl std::fmt::Debug for IngestHandlerImpl { impl IngestHandlerImpl { /// Initialize the Ingester pub async fn new( + lifecycle_config: LifecycleConfig, topic: KafkaTopic, sequencer_states: BTreeMap, catalog: Arc, @@ -109,10 +114,23 @@ impl IngestHandlerImpl { ))); } + // start the lifecycle manager + let persister = Arc::clone(&data); + let lifecycle_manager = Arc::new(LifecycleManager::new( + lifecycle_config, + Arc::new(SystemProvider::new()), + )); + let manager = Arc::clone(&lifecycle_manager); + let handle = tokio::task::spawn(async move { + run_lifecycle_manager(manager, persister).await; + }); + join_handles.push(handle); + Ok(Self { data, kafka_topic: topic, join_handles, + lifecycle_manager, }) } } @@ -291,7 +309,9 @@ mod tests { let object_store = Arc::new(ObjectStore::new_in_memory()); let metrics: Arc = Default::default(); + let lifecycle_config = LifecycleConfig::new(1000000, 1000, 1000, Duration::from_secs(10)); let ingester = IngestHandlerImpl::new( + lifecycle_config, kafka_topic, sequencer_states, Arc::new(catalog), diff --git a/ingester/src/lib.rs b/ingester/src/lib.rs index 1690beac8a..766a662026 100644 --- a/ingester/src/lib.rs +++ b/ingester/src/lib.rs @@ -16,6 +16,7 @@ pub mod compact; pub mod data; pub mod handler; +pub mod lifecycle; pub mod persist; pub mod query; pub mod server; diff --git a/ingester/src/lifecycle.rs b/ingester/src/lifecycle.rs new file mode 100644 index 0000000000..87d388aab2 --- /dev/null +++ b/ingester/src/lifecycle.rs @@ -0,0 +1,475 @@ +//! Manages the persistence and eviction lifecycle of data in the buffer across all sequencers. +//! Note that the byte counts logged by the lifecycle manager and when exactly persistence gets +//! triggered aren't required to be absolutely accurate. The byte count is just an estimate +//! anyway, this just needs to keep things moving along to keep memory use roughly under +//! some absolute number and individual Parquet files that get persisted below some number. It +//! is expected that they may be above or below the absolute thresholds. + +use crate::data::Persister; +use iox_catalog::interface::PartitionId; +use parking_lot::Mutex; +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Duration; +use time::{Time, TimeProvider}; + +/// The lifecycle manager keeps track of the size and age of partitions across all sequencers. +/// It triggers persistence based on keeping total memory usage around a set amount while +/// ensuring that partitions don't get too old or large before being persisted. +pub(crate) struct LifecycleManager { + config: LifecycleConfig, + time_provider: Arc, + state: Mutex, + persist_running: tokio::sync::Mutex<()>, +} + +/// The configuration options for the lifecycle on the ingester. +#[derive(Debug, Clone, Copy)] +pub struct LifecycleConfig { + /// The ingester will pause pulling data from Kafka if it hits this amount of memory used, waiting + /// until persistence evicts partitions from memory. + pause_ingest_size: usize, + /// When the ingester hits this threshold, the lifecycle manager will persist the largest + /// partitions currently buffered until it falls below this threshold. An ingester running + /// in a steady state should operate around this amount of memory usage. + persist_memory_threshold: usize, + /// If an individual partition crosses this threshold, it will be persisted. 
The purpose of this + /// setting to to ensure the ingester doesn't create Parquet files that are too large. + partition_size_threshold: usize, + /// If an individual partitiion has had data buffered for longer than this period of time, the + /// manager will persist it. This setting is to ensure we have an upper bound on how far back + /// we will need to read in Kafka on restart or recovery. + partition_age_threshold: Duration, +} + +impl LifecycleConfig { + /// Initialize a new LifecycleConfig. panics if the passed `pause_ingest_size` is less than the + /// `persist_memory_threshold`. + pub fn new( + pause_ingest_size: usize, + persist_memory_threshold: usize, + partition_size_threshold: usize, + partition_age_threshold: Duration, + ) -> Self { + // this must be true to ensure that persistence will get triggered, freeing up memory + assert!(pause_ingest_size > persist_memory_threshold); + + Self { + pause_ingest_size, + persist_memory_threshold, + partition_size_threshold, + partition_age_threshold, + } + } +} + +#[derive(Default, Debug)] +struct LifecycleState { + total_bytes: usize, + partition_stats: BTreeMap, +} + +impl LifecycleState { + fn remove(&mut self, partition_id: &PartitionId) -> Option { + self.partition_stats.remove(partition_id).map(|stats| { + self.total_bytes -= stats.bytes_written; + stats + }) + } +} + +/// A snapshot of the stats for the lifecycle manager +#[derive(Debug)] +pub struct LifecycleStats { + /// total number of bytes the lifecycle manager is aware of across all sequencers and + /// partitions. Based on the mutable batch sizes received into all partitions. + pub total_bytes: usize, + /// the stats for every partition the lifecycle manager is tracking. + pub partition_stats: Vec, +} + +/// The stats for a partition +#[derive(Debug, Clone, Copy)] +pub struct PartitionLifecycleStats { + /// The partition identifier + partition_id: PartitionId, + /// Time that the partition received its first write. This is reset anytime + /// the partition is persisted. + first_write: Time, + /// The number of bytes in the partition as estimated by the mutable batch sizes. + bytes_written: usize, +} + +impl LifecycleManager { + /// Initialize a new lifecycle manager that will persist when `maybe_persist` is called + /// if anything is over the size or age threshold. + pub(crate) fn new(config: LifecycleConfig, time_provider: Arc) -> Self { + Self { + config, + time_provider, + state: Default::default(), + persist_running: Default::default(), + } + } + + /// Logs bytes written into a partition so that it can be tracked for the manager to + /// trigger persistence. Returns true if the ingester should pause consuming from the + /// write buffer so that persistence can catch up and free up memory. + pub fn log_write(&self, partition_id: PartitionId, bytes_written: usize) -> bool { + let mut s = self.state.lock(); + s.partition_stats + .entry(partition_id) + .or_insert_with(|| PartitionLifecycleStats { + partition_id, + first_write: self.time_provider.now(), + bytes_written: 0, + }) + .bytes_written += bytes_written; + s.total_bytes += bytes_written; + + s.total_bytes > self.config.pause_ingest_size + } + + /// Returns true if the `total_bytes` tracked by the manager is less than the pause amount. + /// As persistence runs, the `total_bytes` go down. 
+ pub fn can_resume_ingest(&self) -> bool { + let s = self.state.lock(); + s.total_bytes < self.config.pause_ingest_size + } + + /// This will persist any partitions that are over their size or age thresholds and + /// persist as many partitions as necessary (largest first) to get below the memory threshold. + /// The persist operations are spawned in new tasks and run at the same time, but the + /// function waits for all to return before completing. + pub async fn maybe_persist(&self, persister: &Arc
<P>
) { + // ensure that this is only running one at a time + self.persist_running.lock().await; + + let LifecycleStats { + mut total_bytes, + partition_stats, + } = self.stats(); + + // get anything over the threshold size or age to persist + let now = self.time_provider.now(); + + let (to_persist, mut rest): (Vec, Vec) = + partition_stats.into_iter().partition(|s| { + let aged_out = now + .checked_duration_since(s.first_write) + .map(|age| age > self.config.partition_age_threshold) + .unwrap_or(false); + let sized_out = s.bytes_written > self.config.partition_size_threshold; + + aged_out || sized_out + }); + + let mut persist_tasks: Vec<_> = to_persist + .into_iter() + .map(|s| { + let bytes_removed = self + .remove(s.partition_id) + .map(|s| s.bytes_written) + .unwrap_or(0); + total_bytes -= bytes_removed; + let persister = Arc::clone(persister); + + tokio::task::spawn(async move { + persister.persist(s.partition_id).await; + }) + }) + .collect(); + + // if we're still over the memory threshold, persist as many of the largest partitions + // until we're under. It's ok if this is stale, it'll just get handled on the next pass + // through. + if total_bytes > self.config.persist_memory_threshold { + let mut to_persist = vec![]; + + rest.sort_by(|a, b| b.bytes_written.cmp(&a.bytes_written)); + + for s in rest { + total_bytes -= s.bytes_written; + to_persist.push(s); + if total_bytes < self.config.persist_memory_threshold { + break; + } + } + + let mut to_persist: Vec<_> = to_persist + .into_iter() + .map(|s| { + self.remove(s.partition_id); + let persister = Arc::clone(persister); + tokio::task::spawn(async move { + persister.persist(s.partition_id).await; + }) + }) + .collect(); + + persist_tasks.append(&mut to_persist); + } + + let persists = futures::future::join_all(persist_tasks.into_iter()); + persists.await; + } + + /// Returns a point in time snapshot of the lifecycle state. + pub fn stats(&self) -> LifecycleStats { + let s = self.state.lock(); + let partition_stats: Vec<_> = s.partition_stats.values().cloned().collect(); + + LifecycleStats { + total_bytes: s.total_bytes, + partition_stats, + } + } + + /// Removes the partition from the state + pub fn remove(&self, partition_id: PartitionId) -> Option { + let mut s = self.state.lock(); + s.remove(&partition_id) + } +} + +const CHECK_INTERVAL: Duration = Duration::from_secs(1); + +/// Runs the lifecycle manager to trigger persistence every second. +pub(crate) async fn run_lifecycle_manager( + manager: Arc, + persister: Arc
<P>
, +) { + loop { + manager.maybe_persist(&persister).await; + tokio::time::sleep(CHECK_INTERVAL).await; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use std::collections::BTreeSet; + use time::{MockProvider, SystemProvider}; + + #[derive(Default)] + struct TestPersister { + persist_called: Mutex>, + } + + #[async_trait] + impl Persister for TestPersister { + async fn persist(&self, partition_id: PartitionId) { + let mut p = self.persist_called.lock(); + p.insert(partition_id); + } + } + + impl TestPersister { + fn persist_called_for(&self, partition_id: PartitionId) -> bool { + let p = self.persist_called.lock(); + p.contains(&partition_id) + } + } + + #[test] + fn logs_write() { + let config = LifecycleConfig { + pause_ingest_size: 20, + persist_memory_threshold: 10, + partition_size_threshold: 5, + partition_age_threshold: Duration::from_nanos(0), + }; + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let tp = Arc::clone(&time_provider); + let m = LifecycleManager::new(config, tp); + + // log first two writes at different times + assert!(!m.log_write(PartitionId::new(1), 1)); + time_provider.inc(Duration::from_nanos(10)); + assert!(!m.log_write(PartitionId::new(1), 1)); + + // log another write for different partition + assert!(!m.log_write(PartitionId::new(2), 3)); + + let stats = m.stats(); + assert_eq!(stats.total_bytes, 5); + + let p1 = stats.partition_stats.get(0).unwrap(); + assert_eq!(p1.bytes_written, 2); + assert_eq!(p1.partition_id, PartitionId::new(1)); + assert_eq!(p1.first_write, Time::from_timestamp_nanos(0)); + + let p2 = stats.partition_stats.get(1).unwrap(); + assert_eq!(p2.bytes_written, 3); + assert_eq!(p2.partition_id, PartitionId::new(2)); + assert_eq!(p2.first_write, Time::from_timestamp_nanos(10)); + } + + #[test] + fn pausing_and_resuming_ingest() { + let config = LifecycleConfig { + pause_ingest_size: 20, + persist_memory_threshold: 10, + partition_size_threshold: 5, + partition_age_threshold: Duration::from_nanos(0), + }; + let time_provider = Arc::new(SystemProvider::new()); + let m = LifecycleManager::new(config, time_provider); + + assert!(!m.log_write(PartitionId::new(1), 15)); + + // now it should indicate a pause + assert!(m.log_write(PartitionId::new(1), 10)); + assert!(!m.can_resume_ingest()); + + m.remove(PartitionId::new(1)); + assert!(m.can_resume_ingest()); + assert!(!m.log_write(PartitionId::new(1), 3)); + } + + #[tokio::test] + async fn persists_based_on_age() { + let config = LifecycleConfig { + pause_ingest_size: 30, + persist_memory_threshold: 20, + partition_size_threshold: 10, + partition_age_threshold: Duration::from_nanos(5), + }; + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let tp = Arc::clone(&time_provider); + let m = LifecycleManager::new(config, tp); + let partition_id = PartitionId::new(1); + let persister = Arc::new(TestPersister::default()); + m.log_write(partition_id, 10); + + m.maybe_persist(&persister).await; + let stats = m.stats(); + assert_eq!(stats.total_bytes, 10); + assert_eq!(stats.partition_stats[0].partition_id, partition_id); + + // age out the partition + time_provider.inc(Duration::from_nanos(6)); + + // validate that from before, persist wasn't called for the partition + assert!(!persister.persist_called_for(partition_id)); + + // write in data for a new partition so we can be sure it isn't persisted, but the older one is + m.log_write(PartitionId::new(2), 6); + + m.maybe_persist(&persister).await; + + 
assert!(persister.persist_called_for(partition_id)); + assert!(!persister.persist_called_for(PartitionId::new(2))); + + let stats = m.stats(); + assert_eq!(stats.total_bytes, 6); + assert_eq!(stats.partition_stats.len(), 1); + assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(2)); + } + + #[tokio::test] + async fn persists_based_on_partition_size() { + let config = LifecycleConfig { + pause_ingest_size: 30, + persist_memory_threshold: 20, + partition_size_threshold: 5, + partition_age_threshold: Duration::from_millis(100), + }; + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + + let m = LifecycleManager::new(config, time_provider); + let partition_id = PartitionId::new(1); + let persister = Arc::new(TestPersister::default()); + m.log_write(partition_id, 4); + + m.maybe_persist(&persister).await; + + let stats = m.stats(); + assert_eq!(stats.total_bytes, 4); + assert_eq!(stats.partition_stats[0].partition_id, partition_id); + assert!(!persister.persist_called_for(partition_id)); + + // introduce a new partition under the limit to verify it doesn't get taken with the other + m.log_write(PartitionId::new(2), 3); + m.log_write(partition_id, 5); + + m.maybe_persist(&persister).await; + + assert!(persister.persist_called_for(partition_id)); + assert!(!persister.persist_called_for(PartitionId::new(2))); + + let stats = m.stats(); + assert_eq!(stats.total_bytes, 3); + assert_eq!(stats.partition_stats.len(), 1); + assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(2)); + } + + #[tokio::test] + async fn persists_based_on_memory_size() { + let config = LifecycleConfig { + pause_ingest_size: 60, + persist_memory_threshold: 20, + partition_size_threshold: 20, + partition_age_threshold: Duration::from_millis(1000), + }; + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let m = LifecycleManager::new(config, time_provider); + let partition_id = PartitionId::new(1); + let persister = Arc::new(TestPersister::default()); + m.log_write(partition_id, 8); + m.log_write(PartitionId::new(2), 13); + + m.maybe_persist(&persister).await; + + // the bigger of the two partitions should have been persisted, leaving the smaller behind + let stats = m.stats(); + assert_eq!(stats.total_bytes, 8); + assert_eq!(stats.partition_stats[0].partition_id, partition_id); + assert!(!persister.persist_called_for(partition_id)); + assert!(persister.persist_called_for(PartitionId::new(2))); + + // add that partition back in over size + m.log_write(partition_id, 20); + m.log_write(PartitionId::new(2), 21); + + // both partitions should now need to be persisted to bring us below the mem threshold of 20. 
+ m.maybe_persist(&persister).await; + + assert!(persister.persist_called_for(partition_id)); + assert!(persister.persist_called_for(PartitionId::new(2))); + + let stats = m.stats(); + assert_eq!(stats.total_bytes, 0); + assert_eq!(stats.partition_stats.len(), 0); + } + + #[tokio::test] + async fn persist_based_on_partition_and_memory_size() { + let config = LifecycleConfig { + pause_ingest_size: 60, + persist_memory_threshold: 6, + partition_size_threshold: 5, + partition_age_threshold: Duration::from_millis(1000), + }; + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let tp = Arc::clone(&time_provider); + let m = LifecycleManager::new(config, tp); + let persister = Arc::new(TestPersister::default()); + m.log_write(PartitionId::new(1), 4); + time_provider.inc(Duration::from_nanos(1)); + m.log_write(PartitionId::new(2), 6); + time_provider.inc(Duration::from_nanos(1)); + m.log_write(PartitionId::new(3), 3); + + m.maybe_persist(&persister).await; + + // the bigger of the two partitions should have been persisted, leaving the smaller behind + let stats = m.stats(); + assert_eq!(stats.total_bytes, 3); + assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(3)); + assert!(!persister.persist_called_for(PartitionId::new(3))); + assert!(persister.persist_called_for(PartitionId::new(2))); + assert!(persister.persist_called_for(PartitionId::new(1))); + } +} From ca331503a533b3813673480e027c2c9e47ef17b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:34:05 +0000 Subject: [PATCH 28/30] feat: add WriteBufferErrorKind (#3664) * feat: add WriteBufferErrorKind * fix: test_offset_after_broken_message Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- db/src/write_buffer.rs | 21 ++++-- router/src/write_sink.rs | 4 +- write_buffer/src/codec.rs | 49 ++++++++++---- write_buffer/src/config.rs | 8 +-- write_buffer/src/core.rs | 107 ++++++++++++++++++++++++++++++- write_buffer/src/file.rs | 7 +- write_buffer/src/kafka/config.rs | 20 +++--- write_buffer/src/kafka/mod.rs | 18 ++++-- write_buffer/src/mock.rs | 9 +-- 9 files changed, 199 insertions(+), 44 deletions(-) diff --git a/db/src/write_buffer.rs b/db/src/write_buffer.rs index 953cbab0b5..4af0b88573 100644 --- a/db/src/write_buffer.rs +++ b/db/src/write_buffer.rs @@ -15,7 +15,7 @@ use std::{ use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use trace::span::SpanRecorder; -use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler}; +use write_buffer::core::{WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler}; use self::metrics::{SequencerMetrics, WriteBufferIngestMetrics}; pub mod metrics; @@ -142,14 +142,27 @@ async fn stream_in_sequenced_entries<'a>( // get entry from sequencer let dml_operation = match db_write_result { Ok(db_write) => db_write, - // skip over invalid data in the write buffer so recovery can succeed Err(e) => { - warn!( + error!( %e, %db_name, sequencer_id, - "Error converting write buffer data to SequencedEntry", + "Error reading record from write buffer", ); + + match e.kind() { + // If invalid data, simply skip over it + WriteBufferErrorKind::InvalidData => {} + + // Otherwise backoff for a period + WriteBufferErrorKind::Unknown + | WriteBufferErrorKind::IO + // TODO: Should probably bail on invalid input error + | WriteBufferErrorKind::InvalidInput => { + // TODO: Exponential backoff + 
tokio::time::sleep(std::time::Duration::from_secs(10)).await; + } + } continue; } }; diff --git a/router/src/write_sink.rs b/router/src/write_sink.rs index e4054664c4..2ed9ea7902 100644 --- a/router/src/write_sink.rs +++ b/router/src/write_sink.rs @@ -104,7 +104,9 @@ impl VariantWriteBuffer { write_buffer .store_operation(0, operation) .await - .context(WriteFailureSnafu)?; + .map_err(|e| Error::WriteFailure { + source: Box::new(e), + })?; Ok(()) } diff --git a/write_buffer/src/codec.rs b/write_buffer/src/codec.rs index 8b69e4f957..6a4e3da083 100644 --- a/write_buffer/src/codec.rs +++ b/write_buffer/src/codec.rs @@ -76,9 +76,17 @@ impl IoxHeaders { if name.eq_ignore_ascii_case(HEADER_CONTENT_TYPE) { content_type = match std::str::from_utf8(value.as_ref()) { Ok(CONTENT_TYPE_PROTOBUF) => Some(ContentType::Protobuf), - Ok(c) => return Err(format!("Unknown message format: {}", c).into()), + Ok(c) => { + return Err(WriteBufferError::invalid_data(format!( + "Unknown message format: {}", + c + ))) + } Err(e) => { - return Err(format!("Error decoding content type header: {}", e).into()) + return Err(WriteBufferError::invalid_data(format!( + "Error decoding content type header: {}", + e + ))) } }; } @@ -95,7 +103,10 @@ impl IoxHeaders { span_context = match parser.parse(trace_collector, &headers) { Ok(ctx) => ctx, Err(e) => { - return Err(format!("Error decoding trace context: {}", e).into()) + return Err(WriteBufferError::invalid_data(format!( + "Error decoding trace context: {}", + e + ))) } }; } @@ -103,15 +114,20 @@ impl IoxHeaders { } if name.eq_ignore_ascii_case(HEADER_NAMESPACE) { - namespace = Some( - String::from_utf8(value.as_ref().to_vec()) - .map_err(|e| format!("Error decoding namespace header: {}", e))?, - ); + namespace = Some(String::from_utf8(value.as_ref().to_vec()).map_err(|e| { + WriteBufferError::invalid_data(format!( + "Error decoding namespace header: {}", + e + )) + })?); } } + let content_type = + content_type.ok_or_else(|| WriteBufferError::invalid_data("No content type header"))?; + Ok(Self { - content_type: content_type.ok_or_else(|| "No content type header".to_string())?, + content_type, span_context, namespace: namespace.unwrap_or_default(), }) @@ -173,8 +189,12 @@ pub fn decode( match payload { Payload::Write(write) => { - let tables = decode_database_batch(&write) - .map_err(|e| format!("failed to decode database batch: {}", e))?; + let tables = decode_database_batch(&write).map_err(|e| { + WriteBufferError::invalid_data(format!( + "failed to decode database batch: {}", + e + )) + })?; Ok(DmlOperation::Write(DmlWrite::new( headers.namespace, @@ -183,7 +203,11 @@ pub fn decode( ))) } Payload::Delete(delete) => { - let predicate = delete.predicate.required("predicate")?; + let predicate = delete + .predicate + .required("predicate") + .map_err(WriteBufferError::invalid_data)?; + Ok(DmlOperation::Delete(DmlDelete::new( headers.namespace, predicate, @@ -220,7 +244,8 @@ pub fn encode_operation( let payload = WriteBufferPayload { payload: Some(payload), }; - Ok(payload.encode(buf).map_err(Box::new)?) 
+ + payload.encode(buf).map_err(WriteBufferError::invalid_input) } #[cfg(test)] diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index ae88d01929..1be5a8fbdd 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -267,7 +267,7 @@ mod tests { .new_config_write(db_name.as_str(), None, &cfg) .await .unwrap_err(); - assert!(err.to_string().starts_with("Unknown mock ID:")); + assert!(err.to_string().contains("Unknown mock ID:")); } #[tokio::test] @@ -302,7 +302,7 @@ mod tests { .new_config_read(db_name.as_str(), None, &cfg) .await .unwrap_err(); - assert!(err.to_string().starts_with("Unknown mock ID:")); + assert!(err.to_string().contains("Unknown mock ID:")); } #[tokio::test] @@ -335,7 +335,7 @@ mod tests { .new_config_write(db_name.as_str(), None, &cfg) .await .unwrap_err(); - assert!(err.to_string().starts_with("Unknown mock ID:")); + assert!(err.to_string().contains("Unknown mock ID:")); } #[tokio::test] @@ -368,7 +368,7 @@ mod tests { .new_config_read(db_name.as_str(), None, &cfg) .await .unwrap_err(); - assert!(err.to_string().starts_with("Unknown mock ID:")); + assert!(err.to_string().contains("Unknown mock ID:")); } #[test] diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index d250bc5d89..d9d96ef93f 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -1,3 +1,5 @@ +use std::fmt::{Display, Formatter}; +use std::io::Error; use std::{ collections::{BTreeMap, BTreeSet}, fmt::Debug, @@ -10,7 +12,106 @@ use futures::stream::BoxStream; /// Generic boxed error type that is used in this crate. /// /// The dynamic boxing makes it easier to deal with error from different implementations. -pub type WriteBufferError = Box; +#[derive(Debug)] +pub struct WriteBufferError { + inner: Box, + kind: WriteBufferErrorKind, +} + +impl WriteBufferError { + pub fn new( + kind: WriteBufferErrorKind, + e: impl Into>, + ) -> Self { + Self { + inner: e.into(), + kind, + } + } + + pub fn invalid_data(e: impl Into>) -> Self { + Self::new(WriteBufferErrorKind::InvalidData, e) + } + + pub fn invalid_input(e: impl Into>) -> Self { + Self::new(WriteBufferErrorKind::InvalidInput, e) + } + + /// Returns the kind of error this was + pub fn kind(&self) -> WriteBufferErrorKind { + self.kind + } + + /// Returns the inner error + pub fn inner(&self) -> &dyn std::error::Error { + self.inner.as_ref() + } +} + +impl Display for WriteBufferError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "WriteBufferError({:?}): {}", self.kind, self.inner) + } +} + +impl std::error::Error for WriteBufferError {} + +impl From for WriteBufferError { + fn from(e: Error) -> Self { + Self { + inner: Box::new(e), + kind: WriteBufferErrorKind::IO, + } + } +} + +impl From for WriteBufferError { + fn from(e: rskafka::client::error::Error) -> Self { + Self { + inner: Box::new(e), + kind: WriteBufferErrorKind::IO, + } + } +} + +impl From for WriteBufferError { + fn from(e: rskafka::client::producer::Error) -> Self { + Self { + inner: Box::new(e), + kind: WriteBufferErrorKind::IO, + } + } +} + +impl From for WriteBufferError { + fn from(e: String) -> Self { + Self { + inner: e.into(), + kind: WriteBufferErrorKind::Unknown, + } + } +} + +impl From<&'static str> for WriteBufferError { + fn from(e: &'static str) -> Self { + Self { + inner: e.into(), + kind: WriteBufferErrorKind::Unknown, + } + } +} + +#[derive(Debug, Copy, Clone)] +pub enum WriteBufferErrorKind { + /// This operation failed for an unknown reason + Unknown, + /// This operation was provided 
with invalid input data + InvalidInput, + /// This operation encountered invalid data + InvalidData, + /// A fatal IO error occurred - non-fatal errors should be retried internally + IO, +} /// Writing to a Write Buffer takes a [`DmlWrite`] and returns the [`DmlMeta`] for the /// payload that was written @@ -44,7 +145,9 @@ pub trait WriteBufferWriting: Sync + Send + Debug + 'static { lp: &str, default_time: i64, ) -> Result { - let tables = mutable_batch_lp::lines_to_batches(lp, default_time).map_err(Box::new)?; + let tables = mutable_batch_lp::lines_to_batches(lp, default_time) + .map_err(WriteBufferError::invalid_input)?; + self.store_operation( sequencer_id, &DmlOperation::Write(DmlWrite::new("test_db", tables, Default::default())), diff --git a/write_buffer/src/file.rs b/write_buffer/src/file.rs index b1d7a5fec9..af575a1af9 100644 --- a/write_buffer/src/file.rs +++ b/write_buffer/src/file.rs @@ -462,7 +462,7 @@ impl ConsumerStream { } _ => { // cannot read file => communicate to user - Err(Box::new(error) as WriteBufferError) + Err(error.into()) } } } @@ -478,7 +478,10 @@ impl ConsumerStream { trace_collector: Option>, ) -> Result { let mut headers = [httparse::EMPTY_HEADER; 16]; - match httparse::parse_headers(&data, &mut headers)? { + let status = + httparse::parse_headers(&data, &mut headers).map_err(WriteBufferError::invalid_data)?; + + match status { httparse::Status::Complete((offset, headers)) => { let iox_headers = IoxHeaders::from_headers( headers.iter().map(|header| (header.name, header.value)), diff --git a/write_buffer/src/kafka/config.rs b/write_buffer/src/kafka/config.rs index f828e4b207..317ceceb74 100644 --- a/write_buffer/src/kafka/config.rs +++ b/write_buffer/src/kafka/config.rs @@ -45,7 +45,8 @@ impl TryFrom<&WriteBufferCreationConfig> for TopicCreationConfig { fn try_from(cfg: &WriteBufferCreationConfig) -> Result { Ok(Self { - num_partitions: i32::try_from(cfg.n_sequencers.get())?, + num_partitions: i32::try_from(cfg.n_sequencers.get()) + .map_err(WriteBufferError::invalid_input)?, replication_factor: parse_key(&cfg.options, "replication_factor")?.unwrap_or(1), timeout_ms: parse_key(&cfg.options, "timeout_ms")?.unwrap_or(5_000), }) @@ -127,6 +128,7 @@ where #[cfg(test)] mod tests { use std::{collections::BTreeMap, num::NonZeroU32}; + use test_helpers::assert_contains; use super::*; @@ -159,7 +161,7 @@ mod tests { String::from("xyz"), )])) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `max_message_size` from 'xyz': invalid digit found in string" ); @@ -206,7 +208,7 @@ mod tests { options: BTreeMap::from([(String::from("replication_factor"), String::from("xyz"))]), }) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `replication_factor` from 'xyz': invalid digit found in string" ); @@ -216,7 +218,7 @@ mod tests { options: BTreeMap::from([(String::from("timeout_ms"), String::from("xyz"))]), }) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `timeout_ms` from 'xyz': invalid digit found in string" ); @@ -257,7 +259,7 @@ mod tests { String::from("xyz"), )])) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `consumer_max_wait_ms` from 'xyz': invalid digit found in string" ); @@ -267,7 +269,7 @@ mod tests { String::from("xyz"), )])) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `consumer_min_batch_size` from 'xyz': invalid digit found in string" ); @@ -277,7 +279,7 @@ mod tests { String::from("xyz"), )])) 
.unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `consumer_max_batch_size` from 'xyz': invalid digit found in string" ); @@ -318,7 +320,7 @@ mod tests { String::from("xyz"), )])) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `producer_linger_ms` from 'xyz': invalid digit found in string" ); @@ -328,7 +330,7 @@ mod tests { String::from("xyz"), )])) .unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Cannot parse `producer_max_batch_size` from 'xyz': invalid digit found in string" ); diff --git a/write_buffer/src/kafka/mod.rs b/write_buffer/src/kafka/mod.rs index 1a47531f5e..034b4ca827 100644 --- a/write_buffer/src/kafka/mod.rs +++ b/write_buffer/src/kafka/mod.rs @@ -158,11 +158,16 @@ impl WriteBufferStreamHandler for RSKafkaStreamHandler { let sequence = Sequence { id: self.sequencer_id, - number: record.offset.try_into()?, + number: record + .offset + .try_into() + .map_err(WriteBufferError::invalid_data)?, }; let timestamp_millis = - i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000)?; + i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000) + .map_err(WriteBufferError::invalid_data)?; + let timestamp = Time::from_timestamp_millis_opt(timestamp_millis) .ok_or_else::(|| { format!( @@ -182,7 +187,7 @@ impl WriteBufferStreamHandler for RSKafkaStreamHandler { } async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> { - let offset = i64::try_from(sequence_number)?; + let offset = i64::try_from(sequence_number).map_err(WriteBufferError::invalid_input)?; self.next_offset.store(offset, Ordering::SeqCst); Ok(()) } @@ -259,7 +264,7 @@ impl WriteBufferReading for RSKafkaConsumer { })?; let watermark = partition_client.get_high_watermark().await?; - u64::try_from(watermark).map_err(|e| Box::new(e) as WriteBufferError) + u64::try_from(watermark).map_err(WriteBufferError::invalid_data) } fn type_name(&self) -> &'static str { @@ -288,7 +293,7 @@ async fn setup_topic( let mut partition_clients = BTreeMap::new(); for partition in topic.partitions { let c = client.partition_client(&database_name, partition).await?; - let partition = u32::try_from(partition)?; + let partition = u32::try_from(partition).map_err(WriteBufferError::invalid_data)?; partition_clients.insert(partition, c); } return Ok(partition_clients); @@ -330,6 +335,7 @@ mod tests { use dml::{test_util::assert_write_op_eq, DmlDelete, DmlWrite}; use futures::{stream::FuturesUnordered, TryStreamExt}; use rskafka::{client::partition::Compression, record::Record}; + use test_helpers::assert_contains; use trace::{ctx::SpanContext, RingBufferTraceCollector}; use crate::{ @@ -502,7 +508,7 @@ mod tests { // read broken message from stream let mut stream = handler.stream(); let err = stream.next().await.unwrap().unwrap_err(); - assert_eq!(err.to_string(), "No content type header"); + assert_contains!(err.to_string(), "No content type header"); // re-creating the stream should advance past the broken message drop(stream); diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index c815fe1b8a..ed41f5cd1d 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -537,6 +537,7 @@ mod tests { use futures::StreamExt; use mutable_batch_lp::lines_to_batches; + use test_helpers::assert_contains; use time::TimeProvider; use trace::RingBufferTraceCollector; @@ -725,7 +726,7 @@ mod tests { async fn test_always_error_read() { let reader = MockBufferForReadingThatAlwaysErrors {}; - assert_eq!( + 
assert_contains!( reader .fetch_high_watermark(0) .await @@ -736,12 +737,12 @@ mod tests { let mut stream_handler = reader.stream_handler(0).await.unwrap(); - assert_eq!( + assert_contains!( stream_handler.seek(0).await.unwrap_err().to_string(), "Something bad happened while seeking the stream" ); - assert_eq!( + assert_contains!( stream_handler .stream() .next() @@ -760,7 +761,7 @@ mod tests { let tables = lines_to_batches("upc user=1 100", 0).unwrap(); let operation = DmlOperation::Write(DmlWrite::new("test_db", tables, Default::default())); - assert_eq!( + assert_contains!( writer .store_operation(0, &operation) .await From a9fe3362bd1b1acf0175ef09a174874ace6fb426 Mon Sep 17 00:00:00 2001 From: Jacob Marble Date: Tue, 8 Feb 2022 07:44:09 -0800 Subject: [PATCH 29/30] chore: add semantic commit check as GH action (#3638) * chore: add semantic commit check as GH action * chore: remove spaces * chore: indentation * chore: yamllint config compatible with GH actions 'on' key Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- .circleci/config.yml | 2 +- .circleci/yamllint.yml | 3 ++ .github/workflows/semantic_check.sh | 67 ++++++++++++++++++++++++++++ .github/workflows/semantic_check.yml | 20 +++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 .circleci/yamllint.yml create mode 100755 .github/workflows/semantic_check.sh create mode 100644 .github/workflows/semantic_check.yml diff --git a/.circleci/config.yml b/.circleci/config.yml index 7ddf2bc472..6839f5bb09 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,7 @@ jobs: command: find scripts -type f ! \( -iname '*.py' -or -iname '*.supp' \) -exec shellcheck {} + - run: name: Yamllint - command: yamllint --strict . + command: yamllint --config-file .circleci/yamllint.yml --strict . - cache_save cargo_audit: docker: diff --git a/.circleci/yamllint.yml b/.circleci/yamllint.yml new file mode 100644 index 0000000000..b32765b6d7 --- /dev/null +++ b/.circleci/yamllint.yml @@ -0,0 +1,3 @@ +rules: + truthy: + check-keys: false diff --git a/.github/workflows/semantic_check.sh b/.github/workflows/semantic_check.sh new file mode 100755 index 0000000000..83e5a8f401 --- /dev/null +++ b/.github/workflows/semantic_check.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +shopt -s nocasematch +semantic_pattern='(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\([^)]+\))?: +[^ ]' + +if [[ $1 == "test" ]]; then + exit_code=0 + + echo checking strings that should be OK + expect_ok="chore: foo +chore(hello): foo +CHORE: foo" + while read -r s; do + if [[ ! $s =~ $semantic_pattern ]]; then + echo got FAIL, expected OK: "$s" + exit_code=1 + fi + done <<< "$expect_ok" + + echo checking strings that should FAIL + expect_fail="more: foo +chore(: foo +chore : foo +chore: +chore: +chore:foo +" + while read -r s; do + if [[ $s =~ $semantic_pattern ]]; then + echo got OK, expected FAIL: "$s" + exit_code=1 + fi + done <<< "$expect_fail" + + exit $exit_code +fi + +# nb: quotes are often not required around env var names between [[ and ]] +if [[ -z $PR_TITLE || -z $COMMITS_URL ]]; then + echo ::error::required env vars: PR_TITLE, COMMITS_URL + exit 1 +fi + +exit_code=0 + +if [[ ! 
$PR_TITLE =~ $semantic_pattern ]]; then + echo ::error::PR title not semantic: "$PR_TITLE" + exit_code=1 +else + echo PR title OK: "$PR_TITLE" +fi + +json=$(curl --silent "$COMMITS_URL") +commits=$(echo "$json" | jq --raw-output '.[] | [.sha, .commit.message] | join(" ") | split("\n") | first') + +while read -r commit; do + commit_title=$(echo "$commit" | cut -c 42-999) + + if [[ ! $commit_title =~ $semantic_pattern ]]; then + echo ::error::Commit title not semantic: "$commit" + exit_code=1 + else + echo Commit title OK: "$commit" + fi +done <<< "$commits" + +exit $exit_code diff --git a/.github/workflows/semantic_check.yml b/.github/workflows/semantic_check.yml new file mode 100644 index 0000000000..5606f83fb1 --- /dev/null +++ b/.github/workflows/semantic_check.yml @@ -0,0 +1,20 @@ +--- +name: "Semantic PR and Commit Messages" + +on: + pull_request: + types: [opened, reopened, synchronize, edited] + +env: + PR_TITLE: ${{ github.event.pull_request.title }} + COMMITS_URL: ${{ github.event.pull_request.commits_url }} + +jobs: + main: + name: Semantic PR and commit messages + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: bash .github/workflows/semantic_check.sh From c18ad4ac978998ba29a3b54a1d40dd439dad73f0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Feb 2022 16:09:36 +0000 Subject: [PATCH 30/30] feat: special case max timestamp range for table_names and field_columns (#3642) --- query/src/frontend/influxrpc.rs | 6 +++ query_tests/src/influxrpc/field_columns.rs | 62 ++++++++++++++++++++++ query_tests/src/influxrpc/table_names.rs | 43 +++++++++++++++ query_tests/src/influxrpc/tag_keys.rs | 7 +-- 4 files changed, 115 insertions(+), 3 deletions(-) diff --git a/query/src/frontend/influxrpc.rs b/query/src/frontend/influxrpc.rs index 8727850615..c04e29d067 100644 --- a/query/src/frontend/influxrpc.rs +++ b/query/src/frontend/influxrpc.rs @@ -219,6 +219,9 @@ impl InfluxRpcPlanner { { debug!(?rpc_predicate, "planning table_names"); + // Special case predicates that span the entire valid timestamp range + let rpc_predicate = rpc_predicate.clear_timestamp_if_max_range(); + let mut builder = StringSetPlanBuilder::new(); // Mapping between table and chunks that need full plan @@ -617,6 +620,9 @@ impl InfluxRpcPlanner { { debug!(?rpc_predicate, "planning field_columns"); + // Special case predicates that span the entire valid timestamp range + let rpc_predicate = rpc_predicate.clear_timestamp_if_max_range(); + // Algorithm is to run a "select field_cols from table where // type plan for each table in the chunks" // diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index ad91ca3a59..28ee683e7a 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -1,4 +1,5 @@ use arrow::datatypes::DataType; +use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME}; use datafusion::logical_plan::{col, lit}; use predicate::rpc_predicate::InfluxRpcPredicate; use predicate::PredicateBuilder; @@ -216,3 +217,64 @@ async fn test_field_name_plan_with_delete() { ) .await; } + +#[tokio::test] +async fn list_field_columns_max_time() { + let predicate = PredicateBuilder::default() + .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME) + .build(); + let predicate = InfluxRpcPredicate::new(None, predicate); + + let expected_fields = FieldList { + fields: vec![Field { + name: "value".into(), + 
data_type: DataType::Float64, + last_timestamp: MAX_NANO_TIME, + }], + }; + + run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await; +} + +#[tokio::test] +async fn list_field_columns_max_i64() { + let predicate = PredicateBuilder::default() + .timestamp_range(i64::MIN, i64::MAX) + .build(); + let predicate = InfluxRpcPredicate::new(None, predicate); + + let expected_fields = FieldList { + fields: vec![Field { + name: "value".into(), + data_type: DataType::Float64, + last_timestamp: MAX_NANO_TIME, + }], + }; + + run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await; +} + +#[tokio::test] +async fn list_field_columns_max_time_less_one() { + let predicate = PredicateBuilder::default() + // one less than max timestamp + .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME - 1) + .build(); + let predicate = InfluxRpcPredicate::new(None, predicate); + + let expected_fields = FieldList { fields: vec![] }; + + run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await; +} + +#[tokio::test] +async fn list_field_columns_max_time_greater_one() { + let predicate = PredicateBuilder::default() + .timestamp_range(MIN_NANO_TIME + 1, MAX_NANO_TIME) + .build(); + let predicate = InfluxRpcPredicate::new(None, predicate); + + let expected_fields = FieldList { fields: vec![] }; + + run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await; +} diff --git a/query_tests/src/influxrpc/table_names.rs b/query_tests/src/influxrpc/table_names.rs index 2fe5421d0a..d4b4839679 100644 --- a/query_tests/src/influxrpc/table_names.rs +++ b/query_tests/src/influxrpc/table_names.rs @@ -1,4 +1,5 @@ //! Tests for the Influx gRPC queries +use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME}; use datafusion::logical_plan::{col, lit}; use predicate::rpc_predicate::InfluxRpcPredicate; use predicate::PredicateBuilder; @@ -207,6 +208,48 @@ async fn list_table_names_data_pred_250_300_with_delete_all() { run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(250, 300), vec![]).await; } +#[tokio::test] +async fn list_table_names_max_time() { + run_table_names_test_case( + MeasurementWithMaxTime {}, + tsp(MIN_NANO_TIME, MAX_NANO_TIME), + vec!["cpu"], + ) + .await; +} + +#[tokio::test] +async fn list_table_names_max_i64() { + run_table_names_test_case( + MeasurementWithMaxTime {}, + // outside valid timestamp range + tsp(i64::MIN, i64::MAX), + vec!["cpu"], + ) + .await; +} + +#[tokio::test] +async fn list_table_names_time_less_one() { + run_table_names_test_case( + MeasurementWithMaxTime {}, + tsp(MIN_NANO_TIME, MAX_NANO_TIME - 1), + vec![], + ) + .await; +} + +#[tokio::test] +async fn list_table_names_max_time_greater_one() { + run_table_names_test_case( + MeasurementWithMaxTime {}, + // one more than max timestamp + tsp(MIN_NANO_TIME + 1, MAX_NANO_TIME), + vec![], + ) + .await; +} + // Note when table names supports general purpose predicates, add a // test here with a `_measurement` predicate // https://github.com/influxdata/influxdb_iox/issues/762 diff --git a/query_tests/src/influxrpc/tag_keys.rs b/query_tests/src/influxrpc/tag_keys.rs index 5dcf1f0007..0d3e34a870 100644 --- a/query_tests/src/influxrpc/tag_keys.rs +++ b/query_tests/src/influxrpc/tag_keys.rs @@ -1,3 +1,4 @@ +use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME}; use datafusion::logical_plan::{col, lit}; use predicate::rpc_predicate::InfluxRpcPredicate; use predicate::PredicateBuilder; @@ -186,7 +187,7 @@ async fn 
list_tag_name_end_to_end_with_delete() { async fn list_tag_name_max_time() { test_helpers::maybe_start_logging(); let predicate = PredicateBuilder::default() - .timestamp_range(-9223372036854775806, 9223372036854775806) + .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME) .build(); let predicate = InfluxRpcPredicate::new(None, predicate); let expected_tag_keys = vec!["host"]; @@ -209,7 +210,7 @@ async fn list_tag_name_max_i64() { async fn list_tag_name_max_time_less_one() { test_helpers::maybe_start_logging(); let predicate = PredicateBuilder::default() - .timestamp_range(-9223372036854775806, 9223372036854775805) // one less than max timestamp + .timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME - 1) // one less than max timestamp .build(); let predicate = InfluxRpcPredicate::new(None, predicate); let expected_tag_keys = vec![]; @@ -220,7 +221,7 @@ async fn list_tag_name_max_time_less_one() { async fn list_tag_name_max_time_greater_one() { test_helpers::maybe_start_logging(); let predicate = PredicateBuilder::default() - .timestamp_range(-9223372036854775805, 9223372036854775806) // one more than min timestamp + .timestamp_range(MIN_NANO_TIME + 1, MAX_NANO_TIME) // one more than min timestamp .build(); let predicate = InfluxRpcPredicate::new(None, predicate); let expected_tag_keys = vec![];