From 82d5c7f336af5d0bfe13f8164c1fd13c5f1579ea Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 30 Sep 2022 17:02:38 -0400 Subject: [PATCH 01/40] feat: support parallel, chunked upload via `influxdb_iox write` of line protocol, gzip'd line protocol, and parquet (#5757) * feat: Upload in small chunks and in parallel * fix: doclink * fix: Apply suggestions from code review Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> * fix: Update influxdb_iox_client/src/client/write.rs * fix: fixup error handling and fmt * fix: Make default chunk sizes the same and add docs * fix: clippy Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> --- Cargo.lock | 5 +- influxdb_iox/Cargo.toml | 2 + influxdb_iox/src/commands/write.rs | 194 +++++++++- influxdb_iox/tests/end_to_end_cases/cli.rs | 114 ++++-- influxdb_iox_client/Cargo.toml | 7 +- influxdb_iox_client/src/client/write.rs | 390 +++++++++++++++++++-- influxdb_line_protocol/src/lib.rs | 2 +- test_fixtures/cpu.parquet | Bin 0 -> 25424 bytes test_helpers_end_to_end/src/client.rs | 2 +- 9 files changed, 622 insertions(+), 94 deletions(-) create mode 100644 test_fixtures/cpu.parquet diff --git a/Cargo.lock b/Cargo.lock index 899d38c0f1..5f33043592 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2042,7 +2042,9 @@ dependencies = [ "data_types", "datafusion 0.1.0", "dotenvy", + "flate2", "futures", + "futures-util", "generated_types", "hashbrown", "http", @@ -2107,12 +2109,13 @@ dependencies = [ "client_util", "futures-util", "generated_types", - "mockito", + "influxdb_line_protocol", "prost 0.11.0", "rand", "reqwest", "thiserror", "tokio", + "tokio-stream", "tonic", ] diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index f2b400b07c..ec1392882d 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -47,6 +47,8 @@ clap = { version = "3", features = ["derive", "env"] } console-subscriber = { version = "0.1.8", optional = true, 
features = ["parking_lot"] } dotenvy = "0.15.5" futures = "0.3" +futures-util = { version = "0.3" } +flate2 = "1.0" hashbrown = "0.12" http = "0.2.8" humantime = "2.1.0" diff --git a/influxdb_iox/src/commands/write.rs b/influxdb_iox/src/commands/write.rs index e5aff6bd88..857a81b320 100644 --- a/influxdb_iox/src/commands/write.rs +++ b/influxdb_iox/src/commands/write.rs @@ -1,6 +1,14 @@ +use futures::StreamExt; use influxdb_iox_client::{connection::Connection, write}; -use snafu::{ResultExt, Snafu}; -use std::{fs::File, io::Read, path::PathBuf}; +use observability_deps::tracing::info; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::{ + fs::File, + io::{BufReader, Read}, + num::NonZeroUsize, + path::PathBuf, + time::Instant, +}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -11,10 +19,30 @@ pub enum Error { source: std::io::Error, }, + #[snafu(display("Error reading files: {:#?}", sources))] + ReadingFiles { sources: Vec }, + #[snafu(display("Client error: {source}"))] ClientError { source: influxdb_iox_client::error::Error, }, + + #[snafu(display("Error converting parquet: {}", source))] + Conversion { + source: parquet_to_line_protocol::Error, + }, + + #[snafu(display("Line protocol was not valid utf8: {}", source))] + InvalidUtf8 { source: std::string::FromUtf8Error }, + + #[snafu(display("Error decoding gzip {:?}: {}", file_name, source))] + Gz { + file_name: PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Max concurrent uploads must be greater than zero"))] + MaxConcurrentUploadsVerfication, } pub type Result = std::result::Result; @@ -22,36 +50,176 @@ pub type Result = std::result::Result; /// Write data into the specified database #[derive(Debug, clap::Parser)] pub struct Config { + /// If specified, restricts the maxium amount of line protocol + /// sent per request to this many bytes. 
Defaults to 1MB + #[clap(action, long, short = 'b', default_value = "1048576")] + max_request_payload_size_bytes: usize, + + /// Uploads up to this many http requests at a time. Defaults to 10 + #[clap(action, long, short = 'c', default_value = "10")] + max_concurrent_uploads: usize, + /// The namespace into which to write #[clap(action)] namespace: String, - /// File with data to load. Currently supported formats are .lp + /// File(s) with data to load. Currently supported formats are .lp (line protocol), + /// .parquet (IOx created parquet files), and .gz (gzipped line protocol) #[clap(action)] - file_name: PathBuf, + file_names: Vec, } pub async fn command(connection: Connection, config: Config) -> Result<()> { + let start = Instant::now(); + let Config { namespace, - file_name, + file_names, + max_request_payload_size_bytes, + max_concurrent_uploads, } = config; - let file_name = &file_name; - let mut file = File::open(file_name).context(ReadingFileSnafu { file_name })?; + let max_concurrent_uploads = + NonZeroUsize::new(max_concurrent_uploads).context(MaxConcurrentUploadsVerficationSnafu)?; - let mut lp_data = String::new(); - file.read_to_string(&mut lp_data) - .context(ReadingFileSnafu { file_name })?; + info!( + num_files = file_names.len(), + max_request_payload_size_bytes, max_concurrent_uploads, "Beginning upload" + ); - let mut client = write::Client::new(connection); + // first pass is to check that all the files exist and can be + // opened and if not fail fast. + let file_open_errors: Vec<_> = file_names + .iter() + .filter_map(|file_name| { + File::open(file_name) + .context(ReadingFileSnafu { file_name }) + .err() + }) + .collect(); + + ensure!( + file_open_errors.is_empty(), + ReadingFilesSnafu { + sources: file_open_errors + } + ); + + // if everything looked good, go through and read the files out + // them potentially in parallel. 
+ let lp_stream = futures_util::stream::iter(file_names) + .map(|file_name| tokio::task::spawn(slurp_file(file_name))) + // Since the contents of each file are buffered into a string, + // limit the number that are open at once to the maximum + // possible uploads + .buffered(max_concurrent_uploads.into()) + // warn and skip any errors + .filter_map(|res| async move { + match res { + Ok(Ok(lp_data)) => Some(lp_data), + Ok(Err(e)) => { + eprintln!("WARNING: ignoring error : {}", e); + None + } + Err(e) => { + eprintln!("WARNING: ignoring task fail: {}", e); + None + } + } + }); + + let mut client = write::Client::new(connection) + .with_max_concurrent_uploads(max_concurrent_uploads) + .with_max_request_payload_size_bytes(Some(max_request_payload_size_bytes)); let total_bytes = client - .write_lp(namespace, lp_data) + .write_lp_stream(namespace, lp_stream) .await .context(ClientSnafu)?; - println!("{} Bytes OK", total_bytes); + let elapsed = Instant::now() - start; + let mb = (total_bytes as f64) / (1024.0 * 1024.0); + let mb_per_sec = (mb / (elapsed.as_millis() as f64)) * (1000.0); + println!("{total_bytes} Bytes OK in {elapsed:?}. 
{mb_per_sec:.2} MB/sec"); Ok(()) } + +/// Reads the contents of `file_name into a string +/// +/// .parquet files --> iox parquet files (convert to parquet) +/// .gz --> treated as gzipped line protocol +/// .lp (or anything else) --> treated as raw line protocol +/// +async fn slurp_file(file_name: PathBuf) -> Result { + let file_name = &file_name; + + let extension = file_name + .extension() + .map(|extension| extension.to_ascii_lowercase()); + + match extension { + // Transform parquet to line protocol prior to upload + // Not the most efficient process, but it is expedient + Some(extension) if extension.to_string_lossy() == "parquet" => { + let mut lp_data = vec![]; + parquet_to_line_protocol::convert_file(file_name, &mut lp_data) + .await + .context(ConversionSnafu)?; + + let lp_data = String::from_utf8(lp_data).context(InvalidUtf8Snafu)?; + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from parquet file" + ); + Ok(lp_data) + } + // decompress as gz + Some(extension) if extension.to_string_lossy() == "gz" => { + let mut lp_data = String::new(); + let reader = + BufReader::new(File::open(&file_name).context(ReadingFileSnafu { file_name })?); + + flate2::read::GzDecoder::new(reader) + .read_to_string(&mut lp_data) + .context(GzSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from gzipped line protocol file" + ); + Ok(lp_data) + } + // anything else, treat as line protocol + Some(_) | None => { + let lp_data = + std::fs::read_to_string(file_name).context(ReadingFileSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol file" + ); + Ok(lp_data) + } + } +} + +#[cfg(test)] +mod test { + use clap::Parser; + use influxdb_iox_client::write::DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES; + + use super::*; + + #[test] + fn command_default_is_same_as_client_default() { + let config = Config::try_parse_from(vec!["my_db", 
"file1"]).unwrap(); + assert_eq!( + Some(config.max_request_payload_size_bytes), + DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES + ); + } +} diff --git a/influxdb_iox/tests/end_to_end_cases/cli.rs b/influxdb_iox/tests/end_to_end_cases/cli.rs index 89f868cae8..941a7437ee 100644 --- a/influxdb_iox/tests/end_to_end_cases/cli.rs +++ b/influxdb_iox/tests/end_to_end_cases/cli.rs @@ -6,7 +6,6 @@ use predicates::prelude::*; use serde_json::Value; use std::time::{Duration, Instant}; use tempfile::tempdir; -use test_helpers::make_temp_file; use test_helpers_end_to_end::{ maybe_skip_integration, AddAddrEnv, BindAddresses, MiniCluster, ServerType, Step, StepTest, StepTestState, @@ -526,9 +525,6 @@ async fn write_and_query() { vec![ Step::Custom(Box::new(|state: &mut StepTestState| { async { - // write line protocol to a temp file - let lp_file = make_temp_file("m,tag=1 v=2 12345"); - let lp_file_path = lp_file.path().to_string_lossy().to_string(); let router_addr = state.cluster().router().router_http_base().to_string(); let namespace = state.cluster().namespace(); @@ -537,53 +533,48 @@ async fn write_and_query() { // Validate the output of the schema CLI command Command::cargo_bin("influxdb_iox") .unwrap() + .arg("-v") .arg("-h") .arg(&router_addr) .arg("write") .arg(&namespace) - .arg(&lp_file_path) + // raw line protocol ('h2o_temperature' measurement) + .arg("../test_fixtures/lineproto/air_and_water.lp") + // gzipped line protocol ('m0') + .arg("../test_fixtures/lineproto/read_filter.lp.gz") + // iox formatted parquet ('cpu' measurement) + .arg("../test_fixtures/cpu.parquet") .assert() .success() - .stdout(predicate::str::contains("17 Bytes OK")); + // this number is the total size of + // uncompressed line protocol stored in all + // three files + .stdout(predicate::str::contains("1137058 Bytes OK")); } .boxed() })), Step::Custom(Box::new(|state: &mut StepTestState| { async { - let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); - let namespace = 
state.cluster().namespace(); + // data from 'air_and_water.lp' + wait_for_query_result( + state, + "SELECT * from h2o_temperature order by time desc limit 10", + "| 51.3 | coyote_creek | CA | 55.1 | 1970-01-01T00:00:01.568756160Z |" + ).await; - let max_wait_time = Duration::from_secs(10); - let expected = "| 1 | 1970-01-01T00:00:00.000012345Z | 2 |"; - println!("Waiting for {expected}"); + // data from 'read_filter.lp.gz' + wait_for_query_result( + state, + "SELECT * from m0 order by time desc limit 10;", + "| value1 | value9 | value9 | value49 | value0 | 2021-04-26T13:47:39.727574Z | 1 |" + ).await; - // Validate the output of running the query CLI command appears after at most max_wait_time - let end = Instant::now() + max_wait_time; - while Instant::now() < end { - let maybe_result = Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("-h") - .arg(&querier_addr) - .arg("query") - .arg(&namespace) - .arg("SELECT * from m") - .assert() - .success() - .try_stdout(predicate::str::contains(expected)); - - match maybe_result { - Err(e) => { - println!("Got err: {}, retrying", e); - } - Ok(r) => { - println!("Success: {:?}", r); - return; - } - } - // sleep and try again - tokio::time::sleep(Duration::from_millis(500)).await - } - panic!("Did not find expected output in allotted time"); + // data from 'cpu.parquet' + wait_for_query_result( + state, + "SELECT * from cpu where cpu = 'cpu2' order by time desc limit 10", + "cpu2 | MacBook-Pro-8.hsd1.ma.comcast.net | 2022-09-30T12:55:00Z" + ).await; } .boxed() })), @@ -593,6 +584,53 @@ async fn write_and_query() { .await } +/// Runs the specified query in a loop for up to 10 seconds, waiting +/// for the specified output to appear +async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, expected: &str) { + let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); + let namespace = state.cluster().namespace(); + + let max_wait_time = Duration::from_secs(10); + println!("Waiting 
for {expected}"); + + // Validate the output of running the query CLI command appears after at most max_wait_time + let end = Instant::now() + max_wait_time; + while Instant::now() < end { + let assert = Command::cargo_bin("influxdb_iox") + .unwrap() + .arg("-h") + .arg(&querier_addr) + .arg("query") + .arg(&namespace) + .arg(query_sql) + .assert(); + + let assert = match assert.try_success() { + Err(e) => { + println!("Got err running command: {}, retrying", e); + continue; + } + Ok(a) => a, + }; + + match assert.try_stdout(predicate::str::contains(expected)) { + Err(e) => { + println!("No match: {}, retrying", e); + } + Ok(r) => { + println!("Success: {:?}", r); + return; + } + } + // sleep and try again + tokio::time::sleep(Duration::from_secs(1)).await + } + panic!( + "Did not find expected output {} within {:?}", + expected, max_wait_time + ); +} + /// Test the schema cli command #[tokio::test] async fn namespaces_cli() { diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 9b674c4a33..3cb742bf38 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -13,6 +13,7 @@ format = ["arrow", "arrow_util"] # Workspace dependencies, in alphabetical order arrow_util = { path = "../arrow_util", optional = true } client_util = { path = "../client_util" } +influxdb_line_protocol = { path = "../influxdb_line_protocol"} generated_types = { path = "../generated_types", default-features = false, features = ["data_types_conversions"] } # Crates.io dependencies, in alphabetical order @@ -23,9 +24,7 @@ futures-util = { version = "0.3", optional = true } prost = "0.11" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } +tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } +tokio-stream = "0.1.10" thiserror = "1.0.37" tonic = { version = "0.8" } - -[dev-dependencies] # In alphabetical order -tokio = { version = "1.21", features = 
["macros", "parking_lot", "rt-multi-thread"] } -mockito = "0.31" \ No newline at end of file diff --git a/influxdb_iox_client/src/client/write.rs b/influxdb_iox_client/src/client/write.rs index 1ee584d8a0..4771970f11 100644 --- a/influxdb_iox_client/src/client/write.rs +++ b/influxdb_iox_client/src/client/write.rs @@ -1,15 +1,16 @@ -/// Re-export generated_types -pub mod generated_types { - pub use generated_types::influxdata::pbdata::v1::*; -} +use std::{fmt::Debug, num::NonZeroUsize, sync::Arc}; use client_util::{connection::HttpConnection, namespace_translation::split_namespace}; +use futures_util::{future::BoxFuture, FutureExt, Stream, StreamExt, TryStreamExt}; use crate::{ connection::Connection, error::{translate_response, Error}, }; -use reqwest::Method; +use reqwest::{Body, Method}; + +/// The default value for the maximum size of each request, in bytes +pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option = Some(1024 * 1024); /// An IOx Write API client. /// @@ -37,18 +38,67 @@ use reqwest::Method; /// ``` #[derive(Debug, Clone)] pub struct Client { - inner: HttpConnection, + /// The inner client used to actually make requests. + /// + /// Uses a trait for test mocking. + /// + /// Does not expose the trait in the `Client` type to avoid + /// exposing an internal implementation detail (the trait) in the + /// public interface. + inner: Arc, + + /// If `Some`, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict + /// the amount sent per request. Defaults to `Some(1MB)` + /// + /// Splitting the upload size consumes a non trivial amount of CPU + /// to find line protocol boundaries. This can be disabled by + /// setting `max_request_payload_size_bytes` to `None`. + max_request_payload_size_bytes: Option, + + /// Makes this many concurrent requests at a time. 
Defaults to 1 + max_concurrent_uploads: NonZeroUsize, } impl Client { /// Creates a new client with the provided connection pub fn new(connection: Connection) -> Self { + Self::new_with_maker(Arc::new(connection.into_http_connection())) + } + + /// Creates a new client with the provided request maker + fn new_with_maker(inner: Arc) -> Self { Self { - inner: connection.into_http_connection(), + inner, + max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES, + max_concurrent_uploads: NonZeroUsize::new(1).unwrap(), } } - /// Write the [LineProtocol] formatted data in `lp_data` to + /// Override the default of sending 1MB of line protocol per request. + /// If `Some` is specified, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict the amount of + /// line protocol sent per request. + pub fn with_max_request_payload_size_bytes( + self, + max_request_payload_size_bytes: Option, + ) -> Self { + Self { + max_request_payload_size_bytes, + ..self + } + } + + /// The client makes this many concurrent uploads at a + /// time. Defaults to 1. + pub fn with_max_concurrent_uploads(self, max_concurrent_uploads: NonZeroUsize) -> Self { + Self { + max_concurrent_uploads, + ..self + } + } + + /// Write the [LineProtocol] formatted string in `lp_data` to /// namespace `namespace`. /// /// Returns the number of bytes which were written to the database @@ -59,11 +109,24 @@ impl Client { namespace: impl AsRef + Send, lp_data: impl Into + Send, ) -> Result { - let lp_data = lp_data.into(); - let data_len = lp_data.len(); + let sources = futures_util::stream::iter([lp_data.into()]); - let write_url = format!("{}api/v2/write", self.inner.uri()); + self.write_lp_stream(namespace, sources).await + } + /// Write the stream of [LineProtocol] formatted strings in + /// `sources` to namespace `namespace`. 
It is assumed that + /// individual lines (points) do not cross these strings + /// + /// Returns the number of bytes, in total, which were written to + /// the database + /// + /// [LineProtocol]: https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#data-types-and-format + pub async fn write_lp_stream( + &mut self, + namespace: impl AsRef + Send, + sources: impl Stream + Send, + ) -> Result { let (org_id, bucket_id) = split_namespace(namespace.as_ref()).map_err(|e| { Error::invalid_argument( "namespace", @@ -71,47 +134,302 @@ impl Client { ) })?; - let response = self - .inner - .client() - .request(Method::POST, &write_url) - .query(&[("bucket", bucket_id), ("org", org_id)]) - .body(lp_data) - .send() + let max_concurrent_uploads: usize = self.max_concurrent_uploads.into(); + let max_request_payload_size_bytes = self.max_request_payload_size_bytes; + + // make a stream and process in parallel + let results = sources + // split each input source in parallel, if possible + .flat_map(|source| { + split_lp( + source, + max_request_payload_size_bytes, + max_concurrent_uploads, + ) + }) + // do the actual write + .map(|source| { + let org_id = org_id.to_string(); + let bucket_id = bucket_id.to_string(); + let inner = Arc::clone(&self.inner); + + tokio::task::spawn( + async move { inner.write_source(org_id, bucket_id, source).await }, + ) + }) + // Do the uploads in parallel + .buffered(max_concurrent_uploads) + .try_collect::>() + // handle panics in tasks .await - .map_err(Error::client)?; + .map_err(Error::client)? + // find / return any errors + .into_iter() + .collect::, Error>>()?; - translate_response(response).await?; + Ok(results.into_iter().sum()) + } +} - Ok(data_len) +/// Something that knows how to send http data. 
Exists so it can be +/// mocked out for testing +trait RequestMaker: Debug + Send + Sync { + /// Write the body data to the specified org, bucket, and + /// returning the number of bytes written + /// + /// (this is implemented manually to avoid `async_trait`) + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result>; +} + +impl RequestMaker for HttpConnection { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let write_url = format!("{}api/v2/write", self.uri()); + + async move { + let body: Body = body.into(); + + let data_len = body.as_bytes().map(|b| b.len()).unwrap_or(0); + + let response = self + .client() + .request(Method::POST, &write_url) + .query(&[("bucket", bucket_id), ("org", org_id)]) + .body(body) + .send() + .await + .map_err(Error::client)?; + + translate_response(response).await?; + + Ok(data_len) + } + .boxed() + } +} + +/// splits input line protocol into one or more sizes of at most +/// `max_chunk` on line breaks in a separte tokio task +fn split_lp( + input: String, + max_chunk_size: Option, + max_concurrent_uploads: usize, +) -> impl Stream { + let (tx, rx) = tokio::sync::mpsc::channel(max_concurrent_uploads); + + tokio::task::spawn(async move { + match max_chunk_size { + None => { + // ignore errors (means the receiver hung up but nothing to communicate + tx.send(input).await.ok(); + } + Some(max_chunk_size) => { + // use the actual line protocol parser to split on valid boundaries + let mut acc = LineAccumulator::new(max_chunk_size); + for l in influxdb_line_protocol::split_lines(&input) { + if let Some(chunk) = acc.push(l) { + // abort if receiver has hungup + if tx.send(chunk).await.is_err() { + return; + } + } + } + if let Some(chunk) = acc.flush() { + tx.send(chunk).await.ok(); + } + } + } + }); + + tokio_stream::wrappers::ReceiverStream::new(rx) +} +#[derive(Debug)] +struct LineAccumulator { + current_chunk: 
String, + max_chunk_size: usize, +} + +impl LineAccumulator { + fn new(max_chunk_size: usize) -> Self { + Self { + current_chunk: String::with_capacity(max_chunk_size), + max_chunk_size, + } + } + + // Add data `l` to the current chunk being created, returning the + // current chunk if complete. + fn push(&mut self, l: &str) -> Option { + let chunk = if self.current_chunk.len() + l.len() + 1 > self.max_chunk_size { + self.flush() + } else { + None + }; + + if !self.current_chunk.is_empty() { + self.current_chunk += "\n"; + } + + self.current_chunk += l; + chunk + } + + /// allocate a new chunk with the right size, returning the currently built chunk if it has non zero length + /// `self.current_chunk.len()` is zero + fn flush(&mut self) -> Option { + if !self.current_chunk.is_empty() { + let mut new_chunk = String::with_capacity(self.max_chunk_size); + std::mem::swap(&mut new_chunk, &mut self.current_chunk); + Some(new_chunk) + } else { + None + } } } #[cfg(test)] mod tests { + use std::sync::Mutex; + use super::*; - use crate::connection::Builder; #[tokio::test] - /// Ensure the basic plumbing is hooked up correctly - async fn basic() { - let url = mockito::server_url(); - - let connection = Builder::new().build(&url).await.unwrap(); + async fn test() { + let mock = Arc::new(MockRequestMaker::new()); let namespace = "orgname_bucketname"; let data = "m,t=foo f=4"; - let m = mockito::mock("POST", "/api/v2/write?bucket=bucketname&org=orgname") - .with_status(201) - .match_body(data) - .create(); + let expected = vec![MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: data.into(), + }]; - let res = Client::new(connection).write_lp(namespace, data).await; - - m.assert(); - - let num_bytes = res.expect("Error making write request"); + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); assert_eq!(num_bytes, 11); } + + #[tokio::test] + 
async fn test_max_request_payload_size() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = "m,t=foo f=4\n\ + m,t=bar f=3\n\ + m,t=fooddddddd f=4"; + + // expect the data to be broken up into two chunks: + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4\nm,t=bar f=3".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=fooddddddd f=4".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + // enough to get first two lines, but not last + .with_max_request_payload_size_bytes(Some(30)) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 41); + } + + #[tokio::test] + async fn test_write_lp_stream() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = futures_util::stream::iter( + vec!["m,t=foo f=4", "m,t=bar f=3"] + .into_iter() + .map(|s| s.to_string()), + ); + + // expect the data to come in two chunks + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=bar f=3".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp_stream(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 22); + } + + #[derive(Debug, Clone, PartialEq)] + struct MockRequest { + org_id: String, + bucket_id: String, + body: String, + } + + #[derive(Debug)] + struct MockRequestMaker { + requests: Mutex>, + } + + impl MockRequestMaker { + fn new() -> Self { + Self { + requests: Mutex::new(vec![]), + } + } + + /// get a copy of the requests that were made using this mock + fn requests(&self) -> Vec { + 
self.requests.lock().unwrap().clone() + } + } + + impl RequestMaker for MockRequestMaker { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let sz = body.len(); + + self.requests.lock().unwrap().push(MockRequest { + org_id, + bucket_id, + body, + }); + + async move { Ok(sz) }.boxed() + } + } } diff --git a/influxdb_line_protocol/src/lib.rs b/influxdb_line_protocol/src/lib.rs index 07d9ca14ea..91c1c2077d 100644 --- a/influxdb_line_protocol/src/lib.rs +++ b/influxdb_line_protocol/src/lib.rs @@ -529,7 +529,7 @@ pub fn parse_lines(input: &str) -> impl Iterator>> /// logic duplication for scanning fields, duplicating it also means /// we can be more sure of the compatibility of the rust parser and /// the canonical Go parser. -fn split_lines(input: &str) -> impl Iterator { +pub fn split_lines(input: &str) -> impl Iterator { // NB: This is ported as closely as possibly from the original Go code: let mut quoted = false; let mut fields = false; diff --git a/test_fixtures/cpu.parquet b/test_fixtures/cpu.parquet new file mode 100644 index 0000000000000000000000000000000000000000..86cae861b61b39c3730b4038e82d7b8a9d77edf6 GIT binary patch literal 25424 zcmc$^Wmp_-vo1QgI|CDfI|P>?!6CT21{vJlA-KB*cXtiJ-CY6%cX!uh&-=aaTKik) ztbOhC=j`1-`s(hPuBYm$s=KTeg_T)>u)tzyU^$d5kQ@lW1^{TAnBM@BCeVO+Xe0mt z0Je8wbhdLgw1)lrhJ_W5g_Vt+gA?|z(?>4uf1dN814vPZ5!hg~ggTSQyUqO&Nf8iX z#59ND-s!8S0s%l(peY0J5*>&Mga^VCYQjPY^MH{4lRSj_-zzIY0bn3Em*`M%_5gb* zduV$Ydsurodw6>UdqjIAdt`eQdsKTgdvtpYdrTneJ5B&R<~tA!gb)xF?-Ctv7Y6Sc zntB$B8u7nd1R||8P*)ZR{cp0;K>^4hE6N#yMeOXX7!@7u7`d6uos3zTYz&#eb~a!` zCub&G6K9AJxIpTE7lQ16`wr-T|Cj$MIp9Aehe-dQlEIqWIXV9=S~WD@EHw2hEcGr7 zq2~Xd@c*Z^Asga{0L+D#1=|1Hmhhtj;8zWVyLHXbNC(Q)MyDFY-}jWqO;Vi@c#sww z9;LGEbT4a-oSj+GYj+5rYf`(8?8QO`P zSPEvXZvdJ!+rmW}C1&LX?{$m8omW(Llx7aXLSg?iNpTXNp$|q57Tv%Q2;*ci#yvt)Fc3D2v{K6jm z|UKM$Swq|g4Bqh{AQllHhW>^rhf=Dq^lK)Qyoa==w(njz`2dE)Jgxge14m^5;|AB2*$Q^IKhEyhEB6ty@}z 
zXgHyo>imt#l3J$$S&FWf1tK21fma~RN<7V1=&aHYm>{3*kU_NfY6Me{W(JAz8yLlr zGY)ncjo9B$}IQJ(43g^VTHvN<4N+M)yoL|X_wt^y6is3@E>*Su(8>HzxoDKQz{G;Gm zFpYfRl_ds$Hr|V>Tj*zBMs>w!_W1kp5^MA=HJC`|rCB9hwJdK3pJYbz&a+L36Jg2H zd-ow0zP$7M9^j?X1ANCysMI~S70rVB2a&^Z9^GC!L2PF0o7DO|CH=L%s+58XP0bSz znS0V`X{9fUP*7NC(=}NVZF~?UPzy*GBWY+;;9dmG^-HZ}W`a%q3NN8QiE*6{9@&;8 zO;Coadhn=^q`(m~BXgGmU;0o{6ua{AhQ+1dajYZuUFtS}BTy99OTo1!E| zi-rzs@g^lHML`?FO5*`>$fV~}y8o)$9b`5^O8L1zC<&9YF8;r zIJ>v8?vFBJ#rP}~qsb~y%^p+w!4?soY)NCxotAF$EU9+~1ul9?X;aFqUB=*q+Q*>s4ZWqm&vu$^UGnp(?YdO5)zW9Sa&i6636Ox1@`w?u&e zhu0&ablZ9HkU=`R#k!;Y#-Ydr=Th4IlQ%57u+TsSnPvKvY(?REC=ELQE!d9Df{SM=cZ=ci< zG%U1j-SN&Mbmvz5zPY-^oj!8w@aK}7nywyPQPLsnQ+3idoTyw(O9sTod6Y{HLg93F zh&SmUWZU87!A|k9yE9=rVPC?|xHnD?j43o;pK_M}spQ^}N--Hc&zZf155&`qx9gaz zyV;$&ZgPzy_{EBJup~xHFRo5aq{B4?#EalP4nyVRKn8557%^!|_U3PDjdQc4Stt?8 zz{4NFqW`Q*b%;^0Ce&-RkDb_nb9&8AzTVXHjEKFShJ?(RrlO#F#EWUKs zppG!#DECrfpk5Btsy2puVm;Axwzc^sMlF^+kMD}!>2rxN}3BMpGAB$Pjzt3;$(mzEC**X`z zq8KyWUv3kw?QeG$_M}OEFUQZ9dU=i_B+W<8`z2!Q(4sGG7UPZkw45C1i6~=VR&461 zuyG?CfL4nVNSvPTnq~ia1G@C2<9xjI%HI z^CS5k#~XdUz1$a#4yF!TrX!}qS&}zr7!EwLPsD1vH*%p+GO0<{z%GWGj8QW>WCN@u zYhho}g+1f)C5a7V<$F?*MvBCfspbZIy zUff^&F(?srkn)~xL`yAQBedEPB6d=1UNY|E2JdY-9HzNrVx7OjfC9CZvpBw*V^~BD zr^eW^?hxiIaSLE=6=K40jsD2sMA5BQA&q(NWJZsqJvb)Sv*7?fr3BQ;VQV)|slii; zA&+)Sob;B}_)0b*4?CU!=HJH={*{>%%v zlD{3c&>AOdF2Fc$tTAr+2p-_xKykRn*%Aynt~I3?sbI8Q4D? 
zVbOF5rDVC08?I+?`?^G>7cD^n%0^lM<~`;eXBWoiQjsHGK@zHrA99FvOu6B6 z8j$)DM1;g?*QzNH!Kszvq3OM(%M}Cj*w@NZxp0;N$(vk%d#|df#02Z*VL#m7zIViv z6+mSAnFMnfF1U>}k>Iu~biNO_Xh5C?(+cbPLHE|4IO}Wvs{O#4~*wnNa(s~RK5`s8z0lY_4Iv~bg^sYTKamg6#6q-bn zUv)2DvI*%$EKmYw&UFs6h?|jzPTOPjDo?a{4Y!YMNP7ykTaN?{j(>tCcS~~8*f*#} z(5^X!2KgDa!meH$9l3;S2cyDRd=$c%RV(BCJ1`-xzcz*Qv?uCMnRRiVJ$<(FVO|!- zX+k61n`2T4d@uetbon0o=+lmt*5r~^pd(a4P37ywS4@iso89?cj%MOrO`ZR49+aK( z-k`UAI51`;QwC~%J@natYSml0I|68b9F;lD-;yl8JhWY4uooufY2D+S+ITb1iu@*T zHrU`P5tK=>Mfe=I9aH8amY!`uB?PpAM4^5#aXYKRqU-uK;L7kWj9lKMW!^ zt!8(we;@)Txn!zPUO+Gkh|PSdJ1;O4gH$98B%B`ZrgK;>)LQK9qh)eC?oXG2574vuAfR}( z0}dx4{|Y#o&ZCIS6-ETkB481PphIq9;$y@j@K`Oy`s2~bg=0xSCphmqbaoB9nmnD=aLTm8`#8k4I$&PVfA;Kj+TO1I1Hkqn952ole`)4wSG;wR$0 zJeo$_!x9TcCx5<2_kDYPgE^$ljUg2afJVS(Fux@24MHwaa%#LB&G>-ouV;05N!D+N zUZxkmbduE{j{8BoJ?u(XJo-J4>+NHm!G}1q&$cg;{h`DXIuugJr1_#+QbfG6=(BmE zLsA$-v?|AnWe_=%8FX5`k>(^=O(q#6oxaFra5*@hB;Sm_Cnt5WT^?yDQD77-bAa)r zDxFpq4aej3#I;Z^l1cmM@|;94@1#^@YxUW9Wd4I{#qqS^9lbhHJvv5c45@bW@xOK{ zQMb+W%`-R*oyfq&(1aK3(*nV-def{H^K5TJTu*Ss)@7QDYNur8C+~ z#&(V0?ORLl3q%hczn*`iwJY0i^;O*>Z-@P8w;6~gk@&*&#IQG(#^v_=md80{<{`;b z+Bp373kYx$&3( zh8F)cYX5Ek1?W=u>-?`9K>vUD%#i-!%hBQQM&92{)qkPB)%(B4)ITcxfA-2&4XCpX z{_k;x_V-pf*_k?9{MYRY15=M0{oiBkpT4J)vx%YgziYA8g!5GZ2%_0K*njDix~$Q*>VOtxILPd}tobZMTxJ+Hgoo_inF@QuI5RgF%6@I8 zija$WE1oSa7C<<__I#8U$Tx`>PolB?K9aQ| zTrA^xBmXl!z1J&nBZ|s7RfBkfM}LKQ)$*dFZ}XGctzvzyRBsk&hoz$wyG>=1N8u{Z zaq?YcO%Us#qW8Zjgt7egZk#0x#Qs^g{@|0iDPd8?iTa8tuJB#Ejo(eV+Cf#6&7F}Z z6fw!~#-Ro&NJ3r3FxC=EGF1jZeC)F!*f}hyuUgT6N9#lz%=OK4_0atrB_6lDE|IQ8 zd2nqzdT=%;TVcq1G&3%%&I=v5KT%^a15NGhf;zCOp92j0=irUqA4I z0~ZkXmI}P=#G^JK`djM@ZRM}+-nDpU_{6=79QJZuvY<_|?KF@};mj7ENi<>M6wlF2SE{n)G1Drir#GWinko4etZio5Nly(_pJXZQpWdDI>Z2# zle5{Aw33|~abp4zD)~^)erH*lYK6+xb;|4F3}NFLfWqh@W=bNOupMAwwJvSq9219g zYdOhd)r~R6V-_XI?}W84UgQUk&XsC`$vEJTascXk_DfXt&OcT9kZU*Ofhp2bov9J8 z8cCMBuw4LIW9mjRh$s`Wdee_3pW!IJ5H+}kMqYTb6{iBB)5E_}73E!3*RwW}xZd%Y!-VM@E&zaDXj`#5xNrzZ>j`vMXKB)=N1 
z>`OUOxX{p>SQE0n@JVA5;W}Bnx6H{#Q{r%qn@I_^BzuPP1&{<&dZzwHZz7xxGEv>q z)=gxt?jjD}5p79r+i*0zQ=yjVE^^tIzkKs^s?BV0QH)aPLv&yR04ltojvDTEL=SX7 zPu`BJl0Pl2DUqabvDH|@VA&XuKh`4QecBZUKQG3J^j;xOHmDO~8}%2AHT8IU>0lhCyda|{!7p^*d%cdX5{QiB z9!u~v;G^=wiDZxokbD8P`&v%vAzrgW9ad+a2;u3$C#4WyBWcT?AlT1re$?g|k|MHk z2+ZEA{Q+gu#=p|^u`V>x3{_)TmgLkY_%HP;5*=<`I-9H)yf|PaeJyENT;CrUiZd99Hn^rimy_e*X7s# zn~Gg6Z0Mk+%bFu{;%^afX5ZAECOsl^(3~b!UaeNwy>=2!AZy`1s!{P+VKUNIo7_b9 z1*3+U`Cm?GCJgl4fV@w#D1$?S!d~(76D!FhhZM?m)@++k5H_qXI#xEp8xKA&mrkpY zEfTCuK%Qi&ShGZsn8cM{sG}~8lIN@_FsuU`SWRutAYuaS#qp88bzZ!v_u%){vkHaf zg11NNa}BnScXlgF;puDvK zq6hNphw^qodFX@Iu4T&;G76b)qSx-Led$gyXg0w z7bKazqKpW2uzaQXXB`h@o!GODav21457RJJO~rkyG6=fHXqLZvfl0J)D#7W)RlB{S zVw+m{&9fn2vbEn~>5i49+m7G7B!Dobop3jP-xi^>grjBS3*QmtS>;AAWxb}qe_gok zurNjnBPl4+Ad8 zBDxfCJ46W={;A_4BWP^vFI!fg2cJpDs?zG}t!2~?+Fz6}SzObwhQQjiU@%c>9vVfF zrE*0xKwZu0dM4S+Bt}|fcmc_$xrdNLL_02C(0klG|4J*qLfo^rH|Df%w4Hk=-+W8Y z&XE#ri_(4s5U_=m5c1`F5v^atN4mvXpDPE^#K(po9d00R_P$IfAc;*VHthFKTIQ8L z5crj;ju1u84at}yNU^@qUGB|rvYz%E*8NCy zj4oKlrJ0i6WJLawP@zKP(j%D|;QRqC&|Si1P6e*l5-S+xRK*_J963PdhGsWIf*tAV zP??Z2O!-vz*cd$=mrZ~2uJa-T_K!>@BKDRVj3%)whW$3SMmd?aI>iy zC<5xib>GBUg|r6e@O626)Q^?6O5?3GF6fEOo`r$d@#Co|{umGV4ObAtB}Uzm6GU_5 z+g~APH=*4sFG%vi{8PIkZu9;FI zYp$++@2+x{%UWI^_%6E=6r(4f5+IdB2UD)|z%mHZuaKOKq)%fXYR$cxA_~MTS@yD% z^ri_aOBWZt^H!ha`L^|S4Nbg^uElF%-RoPApf)+%Xh6Wo7$U}B_D_s23%fe@na`H2 z`9uu4bZb(}zo*KWI({OHm8`pHx6(kxRN-8`&*n|*Db#IMzFEFiLFVRJW{u6_6}ei32_8onp# z#o<*$N*g!|>#4%K`hk9$%Bx`**F$3Z+Ww(;H;bZnUrBtraMgbzM^Y_myA>DR#mm4j zd>w0%^}u_rZ^{w^ZbO2Bovg~{ejO=_*vQic^D_XXI_pH^FQ%zZrrTC9X&!V52W~YT zToi+`AKxj3JeTfkvT_g%^Q^7PPBIQr9dr&rW|!$B`U4>B#$+}Oh#jp__*6ak{8<+h zSRNWgyB%042n4Q@t~9tXkaM>h+_(}VqXy?Ht!!0v_V2Cb$HRKxP{Lkj1H5-dx`1%HHGU2nEEHxT z@vPa=siL<>{FVlvwiT~;1ssZGL{tjCllb!!OMx@}9u5j`qMiI+`euJN18&dPvkop& z-$4*~AJf|kr{WklZl%tL=-OE7mPdQegGvmkzd(;{+u{4?p#kV2_5QLmo&|>}=#S)- z%vRI58WaOb-yrcA{sVmsZdqW&6YwTy|g{2B_Z@4fke~$pe|%W9jHaK 
ziJtNNu*c*kvwy)B*+bkyt1ae2GI7~7*WlRuJK-&=^J$4~^SJ_1g;FS+v9jF>WLUV(@1hmAl|jU9$ePfK<_FNzxo+!rh;~ob* z)OZiWIqv1{EWgvEU8LME$pVJd0S*T;cn-$&9rIt_?Opl)X+#zykXsN=mPm1Ziu=uYMZ%m=5 zRMRpcnF(OtZ0QdGutHtOOgM0Ab^&Y$5_JHQzaq>A7FMBMK=zEdU$&v8-MW;0xJR5Q zdP`L7F*qhXUY0Z_dD^zXH4u;Vdo+Xua2bS+bnzhQxy4!S6P(8xYKqY)x?*m1NEV1Z z3w~3Zp^Cs(IL`xXYI=<;6F$yv9{<(VzA*>!b|av=a#%>|y!}R7vEo5%V8a ztg1e5>X#t?I0-1M>`Bg^aDFY3(5~{VCo%`aA*dQ78;fv%zYq|+YjZ+1mA$z9 zSQGiQoPWD~u2W^d(`C0VA6_I+8Mb7(8#?uN6_(hR9;D7)#&9Es-@(3srj8Np8h6f5 z5A=p0zI4yRIPs-%VCR!=kA7sBzMzW~;MPHU6JqieTaJR$8ylc6%h;L>j zPoVws(-DwbE9f?qf5gfBjUq*B5+Prgia+v1#2w!=ah{ zDUwiyim}?nGWd8n=|zh~QBe-H>?_6vcdgxYpmFg+?}6!2o9VQ$jbl-zMk=&}(YWBE zP92*_UB}T(>+A5r$@z}knx&d5=9;}CkPNLk()7dg!{C?$_Af#Ln%SIFU0WdMFUJxx zX#oI0vOk`ltbxylkwae=-TS-%OfG+pfSQit#~GkOB! z!ZVQ~Z4X{RH4#o(Q-XoW@}U_|9cn#C;Un%aRsI(5erAoyth^~C`+E@eba#nUQKa_I9%%K` zaB&-`b({h?`3>lg1n`E#R=&YYGDWYjee3N{R)D1K7&)VArLZu;-Pc8Cvt&S0821J3 z6aLo|_zu*0UYRKyhelL}xfoY|)jjeLH#V@6L6>=!J`o%?bK+DdbAJ{5pGk?Vv`S>{ zl%~ekcO{MDKY0PcQDv7+kPQ3JWz+1q$7U;x{@zmeZhD$PXt&FjRewe+1m9oqdJrP< z7(EJ&aHeFpBc|`hm4=!!6AZ~}L~<|0cGm-Y4a6;;L++bqvZ3f^%D#q-c+q7y!MUThFWe@aUyBc5hYI8ee&uPxLj z3{9mvho$VVZ&LD3UeKP~a{FvPyCs4o@Ib(7S^gU&MYt4NxM9(p~ZhX$02 zH!Mo{Rss}kANFtfoS*5O8xI$;D-)1@O+>6KHBr4O_+lAS|13fwq(JKZe(dGy7#k?b zFg2wutw!7C6`TI%^Nx3EkN>Kb`VHD;&W0%>;PEi-GEkgLu}Fgvg88AyWED$2(MbbD z5QTAGF#0X3r zQSLT&5a;J98l!a`l(^aW$}h8mD<4-6#U+_9Ra?6lu=vbNP}kFWzN&t$^}*E31&G`X5^)Y7!ubtPz!a#X;co$#DB-;LBJIL9_PZLOs_-yr7r zSE~HvVw^RuEiVH65NThBfd-#wAyFqGKg@`xBF!?IlyRD+ZFh$`JXhMpr9p2X|GDhB z{GhGc;yNK4VME7U$x{kFWUjf14ZOzwFLTW+3{4Em7m>f_nto}JV?+@0>FeL0&iYgR z$M@sNUjeYB-2hlzgKj89%G_>fRF2JV7)&wJ9#~vegC01Lac&Phq4Q=B0%;IwFCt}< zK`#<*VQw!nL*r&I3QIp}A1cSZK_43TZf+ks-|c1}h7c@SKc*P2VLz56WnMqFEXP(q zj-nXZ0IsU4;Q&z6IBx(?*LiCI-yn!=5M-QWIQY)IFmI5+x^Zjpy?sB~5TWzD;SiDg zZr%{F_wCjYi61QaFli93(J)yUW&SXE)ay(bMXV5ZIAx-uayZpDgM@JE40|?7nrv_A zQJTCkHe|Y@R7Yg`veJ6w4^=;|ks0cS*-#i&=8eV}f9&RuF}2@rk1=<{QjD|o&q$q;S*smyx&JU|3hAk(i 
z^chbtNJ*T)re8^d*aucwk~~^WS&BLN>U^a2mRf6P(C;F27vxzjz!q;p_W6-QgSfBYhk<@W4_B zVTBNdXc{%seg}c0dG0~R@lu1+#z|_Xzf4k1inW+C92T@>vVGvS&GVzF3789$O_wc7 zi;9;mE1N=oT2yzy>sZ%Mi0jy(h8tz5HgCDUx7^q*Ua@Vz+h4Kkjt6Gi_hD&dIt-Bw zp$m-A4NW?ZU(IAXIZO%TXF1Qgi0Qd3=q+No{<3H$a$ECwAadUd=lUYP%czv?aUkda z#q+po=!@5R2ZFx$^@N1J&;44czVGwdkp9=-ZwLl{(0H7r{)f2coBoK@VMzff>3s%) z=mLma0a()9d0gj;nuaQxTJCuoAk)%3%J+^-d7&f$p!_h31Vnr-s#fv5aGLy7a#8x~ z;caZj)<^Ov)=_SX=yyNOccQuXOLt=U?!&TU1)<;V;;JF?P{w~I@SsePqHm#0l;eG( zOj45Op-NWMMw&?0G%edp(RDrA`(_YAx1Va95)S-injLPME?L%6%x=9tZ>o~>o$sVT$E(S!r>3aCt@ja(P8fFTqLq!Z7cL zswM*Sld2!m$raUY*UJ?(U7j%3wH?Tx7@Ydo)vYPcL0$~?UnwKQ>&MtuDjTLm-d8ow zDeKtqEYQ2#G%cC7Wq!{It7oKJbv-$2-VFJ0{$n@A>b&K!*h)p>xVi%DdP=-d-FDFm zZP)%)jqIZRb^+3fdfczL=zP6Dx#$8QfNHy8aCB>XWC^3}{g6Jiv-F|$9bNWeixAfJ zJ8^P5^rI{5)(zkG3M;o3o`e;N{zEE zeq^8c&(ZEbLsTaZ$f(Qa?@3p&C*G1Jb$cfib-5X2)b-y1A7r{aZ3cYRmIXfl`>3m` z3E&yqqbZ1q2`(>$X&A)tx^dG1z9M3TX5rV7YE-IJhoRrpq3Ia-}7ZdPs}SAnzoJG+XhQ zNxsd`aLCpAj($g|kXp(p!9ayma$~r3QdIldA+QYA(@=P0XF3=!jrBB)T6->~K}N2{ z%G(^3k8*oQSmv8jbs}B<71r#zqC@p4djsPFutThSY94;RFF4}I!q3| z{ax1PVp15Y`N9^?Cn>nRT=$uF5Hxi+C#1n*+~*$eRfQXDpvT!RzP5t+1p)aRnS1I(jtrHEnEf%Dl?a z3-ZCos;_ElY6G`+Sr+MrZ*3N1VrWRgT@K_XY(GmTF4|Vgzv}R@;KGh(tZ<^)5=)dm zqBS~#;|K}5w0L5GA4f}~idd9bcW#m}ceuVcmRziKRrn{spS#Uu1rpf@`mXI}BHb}V z!dLZI&b>^hKTOc*TDv$7ZDz+O4mimx5`V2+lXhNxydouee_F`u{PNRrEI-l#09sM!4nov~@SW_d*+u}fe(wqFTaGs}EaaW3;)Nhzf={iwt@>gw5JNdyG z6oae;5oh;4CB~&I(AQJ1KlMhvriyr@7A_%;?VD|}17-Ob@(atH+>3-gBj$IB% zG{=VVz`N9LE~&LpWae@K1AMyB1p*Tm+R=n%?wBfhz6?oXGie&U+NKr%T2R39P}%~^ z?98!q@m7XO*-eCbh-@h>%<+>WGP0`P;MTNJ+|Hqvm!}_L( zIOd)Y5?Ks#*z(?j8e#Vnl5b04P;f;Q?RQ7vu;04?p&U#|fKf`>Ed|;0}En`Ejn}1%gZen70$MIbJ=_Ra^jukc_Kj0jTLS0Z|B&E z@6~aoHV+Y**r7MXelzs<6|$fu(DX=Z#rFf(i%S7wz)^x&HexAb_@Sf2Ey`XVjdS01 zBw(t<v4ZP_&9lk4Jzqs$F8|RKmc@5(V`rtZta4 zna{)=hhzy)Eg?hFO$DdcLAYwKOVC^9exRKe-)K&^+}pCtYYvPEUDI}t>kqIf5-^4l zT@J|KtflEMUl0@VFp)4`pDjeLgPZT(yGc)H(V@NOYvS~*!L{q)z27A?R5`qVg=szr z3EJR7vlDZWYVj!;Mg)mZGsX$giA}=WL2-#;MUlH(Q7ViSf0E5!_s@bd4C$VWy3)}p 
zD~Pk6ZkNw;Uw#At8${++q+}FqNlA&C*bPDnx~mJsxuuL$7Z1(fvn%NQuH{<`=t0CU z=m5@x5i(#*nLm{*ozwOs&Ib%Jvgto zcJI<0;M+PFm#Gvg87A9~zF2JeL4mJcWG4mA{W;Ll4u$o*X1Y4_H_sE_w zNhbjciRpcZ^j--4sw_bO=LbLZ?JY96vLHtwJ8{8<>ZgQ5X?g3S!h~WAxr$7Dj&|?x zv|>Z+b*~0hHM2r?5Cdx-QY42$g+Wx{tqQGrbomdE^sCelf3#D}6LUApnJ0kL4N@3r z3822{R#8r`pWV3PqPM8>CPRLpTWUu_cU#xDOjf||>c zCUe6rpUsHF_88|dJJGbf{@|YOVfn>3RRX+d>5&a)TZ6b$+&73nmEWI5r373%yf8Y@ z07UltAMdu_0rQY5bJ^#v9jVwKwo+=dLy)!?^fq8{*~VAxA6X<*gbLd*3TEECW4IU;{Kj)q6@J9iI$Q zg`Ou8U#zx2?k(CNV)A`!Q&TMvZPisO>IU3z-twvNeVEak-m^^*BXF1%jQi%6WT2PMb&4(ratEISdWeR;9nS zO!YKWd>Z4zf0pIVOhJMg=G%L9>iH(AICm@anOp@0fz|!JOsG3%))g*Y1=rXZk@WQG~$_p&g zDW4BhEYb~pIGFwd>?h~u00GZn)|iumVuA@)t03JGy3k}MJttpV(j72{Y+-?$HMZn7 z^+V$AqE|An9X-YGn>lko6XmuqLwTps5i#Lew-5s@29#oHKM?5HJktoea!(y z{IwAPyNiqG%*is-ze0RMKo@?Y{^201^lMvd+l$6KTB(oaeHXw%JLXwmd_WyCxM=7s zr%awy8SuNh4zs0yqW9Yu>RAq~N++4J4Ep}9v+N(uZCD?&(~pXK?H?N@SdGCUs2&lJ zh_QcXg1Kej*84epj8;4?=v!=k;%nAY4^Mp}ubv*XEkjRQ?+xUG;e0 z{#{Gc-$EI0K0MNBTMiU#GCokUk;#F1lt4JN7tvr59^nPt~LmqKv2hT-*1^jUy?(`woTl>#F3E4%%I? 
zUsb8uP6kxryr0kBz4<{f|x@rWpFh;t$4N0(CNcKe3|&?@&X&anI5mnjrPnN!#|c) zO07=bPLXs`V13Hsig1Hz4VW>*U%6?}yipQKIa2(oz1Q5jBee92;X)`HgL|1zx5=Kx z77=&JM8ooFwAY6vI&NF~B{tSyjvF4;4Z(G}`ifULrPrc-|B zH8R*+=nvmq;FTnau!eM=;+)S*swezBw)VrdWUrB*y~!?m3cz&(Q15bTBZ5ic3QBl^?e{_Qvn3)+~1r4 z?HpgNgn*;jV2Dp zcZ{*g!Ao(K%oUZgqxFGD4;P|eFn^FYhXvsgZXyjF++WD`6o|9gHVeUh;o)2>ubm4( zMZ}WWU=n8Qby=&HLljPUc78uH-XDz=tkGaEl-tHx#U}h$IrG~im=%3igFL+$X+eYO zSJHz}nukfg2(|jcKr%kI|8}ET`?UfHpOr~|)zyvbP>0LN%!Qa$_ZL2Ge7umo0>l6} z;-&4tPQd}G!8Of{Mz7!DFGuJ>ebIDNAGhVP2Y8EZU%HdFEwAsr255gQP+mPKO6Kw_ z&u(*u`X^wS1+27ZvKMp``4_~=yx&m~!|;Prg1&I8s+-b7<_uAQBmmFF zIro~2$;yI|%4X_LL1K_ftUi(|Int&`*-c*eP`R(mq-0*ngtHoG`!(FSs~I^l_e=WS!3; z7sOF3o83O>PG*2_4kD!%peyyuk@(1#onCD5{Z55b0*87S+3Mj!TE|%c+?V)&}7X50TnfLWtSOfr}_6pEtzUrU8RmlltWx6qZ1&4RnpG)OI-R@th6F_9@-kO+K;IUo4`4Q5T!7)DWfQ6t&k9dea@W z{jPB9LkWgCu@oRB4Z`h=T)5!^w|Ywvv_I`2VET#850`PlbajOnQe@^ow!rO$phJf- zylKnv;Al7O(HUcK7Oju3m;jqFB3TrKo%7jA0qBq-hnrE=mWJ$CLm&K%%{C1@Z*;y_ zv@niP?iQk68mXFEhqHd!?%dj%g7UE~+?&(Uo@7bUiF7wvq@S&RHj`1*H=nkpqXZ~Z z_ZaERMlzPk+2nBKG|iF6L?0FH;j|Y1vt!#TU!5{eqQ2n;X7y;++|!c0aOH*)6UlgY?1}}Bdf_^dHCGt=c5z<*yU0G@GBQBEIU;Ua2${)Kd$4ak zf|stOhx2&4gcAk70%z|7Tqx5wC9`x|bILCv>XV_)wVkS%!Upeh5cIbd)jCD}^o=Cd zp>V%E+7ODAZlUCEh$w|T;udusM+y{Y+PAFXJcM*3J&r9D_h*%~ZoC?w%m?)9t0keb z2}b#@wPuvt?Fc9m++1xg%5wcge2x4Z7Fmk4%(n&`GN$V5NuPSxtst3QPfJlE9~Y9W zuR^=oZ}RrGo|UhYf&Fx1o?h%B^hcY>q*&-3+}7bHweTl_(lEVpDf&ZQDR_CH3Z0^VE%aA zHqI@kaKF2>HG|E6_dmM3@~|e7EFLoGFp3HhVo+4%Mv;UNF7Hl&a0Lh?kOZ%Ugm8o_ z;dZO? 
zwYBU8TT3tdUD#g)Ud}%5P1_3Poo)ZQOYa$Q_b4rzaDcDAwcl2nJV>u7Uh4b2GJjobc8;z`hxduR;$nh%SDezj$_Hg$op(Fiots1_ zvo=1M*7K>LthZgds)@?(GGFbRL@RKJ`|;t1{%M@@-{+*?sqH+<0Iu8aU7*hzGn^0J z8aTaQXO?DF4OsG}T0Gkaw2!@5tmN7TlQrA)5_6?T`_4}uuQ7#B`q8$eWz!05OhcSn z^pVPs-j0)LODL3v6)vi~lTSUq|WY}(oVUcvHDl>GCjTNw-;5-o;hP&U1C*zUDD&)rU?`$VQd|wqXOwpy3K4# z_SD&TGAvI$W~&NWHvfm_+tX>z4dY_hwP)Vx57H&PtTY)rdm4+K`}3B83h<|e9mhS{ z?5^s)#Yu@bu2HPtWLJlOcT12yXYn7m@@2T#O@8ZY)ct7hoE?0Y%Ot`g(*&c7%?FC~ zPZPd>xOe~K`WcUWP3COnX+2#dvkO|%gDYF1bnHh=r}ER+7d0)nB(82)tgoNobuD&p z^OK)9lsMIC*Mz359O!+Rqp$z+>D@PJ%<%osonKhl z-1}wGqbs)v@A`1ROk-&{zSCNwdo|_Ii;9e+TE0`8Y^Mgbc06jl>1C!NFm7~SWRN9W z)LA#Bwf^_AidE4y>0}ovC9$e1?45B2jjzm2nQqyRX`5Yf*BQ4VwM=<^ceCxd(4?Cx z3k?JhU7L2@XeW7EYtIyYd~yo2VY_l((V3lLAu4&blXiknu5bRi0}R!lMY{tF@<<=Y zb_b{KRbHJ+W3vwz?_v}r>R!B7T1r|!X0!6@C7;YVOl=PKo_QNbP*&Q~?}9arjq>Pt z-5Zx`)>KeKysC3C^2Eu9)>tnc4P|IxRkHnWzZdbk2>WxH~#-YKHh%dRHlM|i*VNqs98rytDx)YM%PF_Dm1v6+?oY(lr! zLG|4>+fE*NWpVJ;hNR2SeotO-o7i_m~ok()d?^Vlkk>s80&|m2=hT|A2KRd8=5cnl}+-s^n(lL#h@RY+k5R+8r$> zSaxw_6RfTW9GYN#EB3_%n>$H*Z8kkQer>jWWu6Z(EuP9}A{d1oeV5~Pu3H<`)GvY*1w$#R$$(V0b`xn83~ z2gLe1a0c5Van2@{zu;Wv9({@Xa;&d%7KrtUymFZIS;MhDNHdq^#boEQefYV#-o6os zlV>c8d!@`uFVd-z}-7G@`9@N?=B$j zJAD5}@PRJ%?UF<7-rGa!7UFUj`rpah9{OwlmF-~-O36OqjT7igBhE}u)@C$Umfw`N zT0XcbyJT;4E3%UrOmA_d9+P&drg(p|kJa_d59HDg2F72Wm^g zwWIq=X3k7dIj1)(%|u+n7niam%vkx=t&$(q|t90YB39vTS#%R(Xz>C$~J;N3g9t&lle$`f2%^AN=zL zTb%s2(YM7J76hi1-!G8VniLv`j3bY?9Qpn4N!;d|}>uNx@+V4fD+r1nPW`H;HXYsQBjxu3Gi zC-UA_s+nQc<>l39!y^+9@@P6b*=xqXMauW!7rbHeh&p^6`2pYDj*)3$3d|+zC%g# zfD)K6jFvY3rn2UZq5PSN@_dWBX7pf{x08pn=`NZlCJh!REFCVcSw5V-xpFw0;y0YF z7Y%1YI9c<|gOvQ`q&T!#VMjAsFk&+?S|s1cJE9G|g%&Ls zxqB5YQ`|U;mW+t7k3fRgAO;+rsim!T(=g#X?ct**|L&oh_?~90CIR2wh}E>g7}k!J zBYfk-EAbzY2j`!{YVK0P817Wl{<;$k@i{GU2kK8_H5XwFPvEqN&6MIyDsI3X?chQD zK7fYDzeUlWYRJ+4q6LbOqn#IsZ$uz#n-GGnJUOI-2BBbE{^Oo_{57PvFHx|!OXXlA zv`>-SXj|Fw&k=4lXAo|CR>m9xdaMeufmCAJ8I?>meICIX{nN>Fuj=gbQcMe>FGNIqeY5-osc z%?xmd#9VI=HpS5;I!FXbr0!Ps;ut5X%vun_ 
z(2eXYVZ?LIxe^;o21^*|&JqT=vz#UFcH(%h4=;!Tv8^DoTa1tm(Y(ZAaSS`BI9C>A z>B?fn@F6!VM`oB4P3##auy7;MA$u2UOqdYjd5WolkUbScP=!1_ z-F@bv2A;4_g9s-}u_FXQOz;CI7YL$2b`azW7tzF^ggPXGs8mEA1J`pJmP2qQ_>M)JO0^27vYI z5o*ZgbG}ZnT!Ak<5OTh5mVOqV5&_Z@f`<#exU(#YL>t-gJ(U7W!UYy$z|k!l{53OD zAhAU_z+Cx0-ZCEO4=*G(n#%!TRJiM()ej1@5;{s^c<{3v1U3L)J}ATUZEuGRnhQAi zk!S-d4oxvkg=+3cb_nHp+r|s&-T?wo01D9yJ;>^t@^zEKd|@R+XZCZxG;3H6q(gk+ zs&c*pGLOXPILE^6QLTap!7gSFm>Qt;LxmzZ$M>H-n6cool}H41 zCafbe0oeh5fahQ9Q>@%W5nTczY9P^{NMyj3kdXpQ0Zmj0VL~GlJ%~h51l9tQz&eB= zA;&F%NA~s?gh+yrxk7XUzEDYXz|#G8eCg~i%iAP_=0)1;QB}qol>+HVvqQ5YAuI-laYihfx$egM9A9$GA9c5oWTN1 z*Cq_Eh#Gms{{K7wL*o0GOMo8>5O4rF7;6-NiRcr$>Ja~i7!mkEOg2ItxzJf)5^NAv z;s6^-jDss630>(9ayg$4{E>nTsesCZ;bN3Otb3pNL%IJXaee&Ys-xO-phtcXc)=Oc zC$J9W`h+YBlp11>U^FHh8CCRNJZLn*`xPuh=L>W-2IdEr0J6T23nE761N5JX@`oP< zQG@&(K|c@~GT%e(!EdDL|Fpm0feSGNt#H0HB1#lFS1Qwq7D;7+VWB!?3p0`#3A0!Y KgYf|WOXFX_Ldj|X literal 0 HcmV?d00001 diff --git a/test_helpers_end_to_end/src/client.rs b/test_helpers_end_to_end/src/client.rs index 0f4567a973..5017b0bbba 100644 --- a/test_helpers_end_to_end/src/client.rs +++ b/test_helpers_end_to_end/src/client.rs @@ -1,12 +1,12 @@ //! 
Client helpers for writing end to end ng tests use arrow::record_batch::RecordBatch; use futures::{stream::FuturesUnordered, StreamExt}; +use generated_types::influxdata::pbdata::v1::WriteResponse; use http::Response; use hyper::{Body, Client, Request}; use influxdb_iox_client::{ connection::Connection, flight::generated_types::ReadInfo, - write::generated_types::WriteResponse, write_info::generated_types::{merge_responses, GetWriteInfoResponse, ShardStatus}, }; use observability_deps::tracing::info; From b862ae6476aa485639f82cc717bec1f9334025b7 Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Mon, 3 Oct 2022 11:38:35 +1100 Subject: [PATCH 02/40] feat: `EXPLAIN` statement (#5763) --- influxdb_influxql_parser/src/explain.rs | 140 ++++++++++++++++++++++ influxdb_influxql_parser/src/lib.rs | 1 + influxdb_influxql_parser/src/statement.rs | 9 ++ 3 files changed, 150 insertions(+) create mode 100644 influxdb_influxql_parser/src/explain.rs diff --git a/influxdb_influxql_parser/src/explain.rs b/influxdb_influxql_parser/src/explain.rs new file mode 100644 index 0000000000..c9576aa3e8 --- /dev/null +++ b/influxdb_influxql_parser/src/explain.rs @@ -0,0 +1,140 @@ +#![allow(dead_code)] // Temporary + +use crate::internal::{expect, ParseResult}; +use crate::select::{select_statement, SelectStatement}; +use nom::branch::alt; +use nom::bytes::complete::tag_no_case; +use nom::character::complete::multispace1; +use nom::combinator::{map, opt, value}; +use nom::sequence::{preceded, tuple}; +use std::fmt::{Display, Formatter}; + +/// Represents various options for an `EXPLAIN` statement. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ExplainOption { + /// `EXPLAIN VERBOSE statement` + Verbose, + /// `EXPLAIN ANALYZE statement` + Analyze, + /// `EXPLAIN ANALYZE VERBOSE statement` + AnalyzeVerbose, +} + +impl Display for ExplainOption { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Verbose => f.write_str("VERBOSE"), + Self::Analyze => f.write_str("ANALYZE"), + Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"), + } + } +} + +/// Represents an `EXPLAIN` statement. +/// +/// ```text +/// explain ::= "EXPLAIN" explain_options? select_statement +/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? ) +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct ExplainStatement { + options: Option, + select: Box, +} + +impl Display for ExplainStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str("EXPLAIN ")?; + if let Some(options) = &self.options { + write!(f, "{} ", options)?; + } + Display::fmt(&self.select, f) + } +} + +/// Parse an `EXPLAIN` statement. 
+pub fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> { + map( + tuple(( + tag_no_case("EXPLAIN"), + opt(preceded( + multispace1, + alt(( + map( + preceded( + tag_no_case("ANALYZE"), + opt(preceded(multispace1, tag_no_case("VERBOSE"))), + ), + |v| match v { + // If the optional combinator is Some, then it matched VERBOSE + Some(_) => ExplainOption::AnalyzeVerbose, + _ => ExplainOption::Analyze, + }, + ), + value(ExplainOption::Verbose, tag_no_case("VERBOSE")), + )), + )), + multispace1, + expect( + "invalid EXPLAIN statement, expected SELECT statement", + select_statement, + ), + )), + |(_, options, _, select)| ExplainStatement { + options, + select: Box::new(select), + }, + )(i) +} + +#[cfg(test)] +mod test { + use crate::assert_expect_error; + use crate::explain::{explain_statement, ExplainOption}; + use assert_matches::assert_matches; + + #[test] + fn test_explain_statement() { + let (remain, got) = explain_statement("EXPLAIN SELECT val from temp").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(format!("{}", got), "EXPLAIN SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(format!("{}", got), "EXPLAIN VERBOSE SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(format!("{}", got), "EXPLAIN ANALYZE SELECT val FROM temp"); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!( + format!("{}", got), + "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp" + ); + + 
// Fallible cases + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SHOW DATABASES"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + // surfaces statement-specific errors + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"), + "invalid FROM clause, expected identifier, regular expression or subquery" + ); + } +} diff --git a/influxdb_influxql_parser/src/lib.rs b/influxdb_influxql_parser/src/lib.rs index 32842c0615..231e3fe0e9 100644 --- a/influxdb_influxql_parser/src/lib.rs +++ b/influxdb_influxql_parser/src/lib.rs @@ -29,6 +29,7 @@ mod test_util; mod common; mod delete; mod drop; +mod explain; mod expression; mod identifier; mod internal; diff --git a/influxdb_influxql_parser/src/statement.rs b/influxdb_influxql_parser/src/statement.rs index 0455051e81..3275685c54 100644 --- a/influxdb_influxql_parser/src/statement.rs +++ b/influxdb_influxql_parser/src/statement.rs @@ -1,5 +1,6 @@ use crate::delete::{delete_statement, DeleteStatement}; use crate::drop::{drop_statement, DropMeasurementStatement}; +use crate::explain::{explain_statement, ExplainStatement}; use crate::internal::ParseResult; use crate::select::{select_statement, SelectStatement}; use crate::show::{show_statement, ShowDatabasesStatement}; @@ -19,6 +20,8 @@ pub enum Statement { Delete(Box), /// Represents a `DROP MEASUREMENT` statement. DropMeasurement(Box), + /// Represents an `EXPLAIN` statement. + Explain(Box), /// Represents a `SELECT` statement. Select(Box), /// Represents a `SHOW DATABASES` statement. 
@@ -40,6 +43,7 @@ impl Display for Statement { match self { Self::Delete(s) => Display::fmt(s, f), Self::DropMeasurement(s) => Display::fmt(s, f), + Self::Explain(s) => Display::fmt(s, f), Self::Select(s) => Display::fmt(s, f), Self::ShowDatabases(s) => Display::fmt(s, f), Self::ShowMeasurements(s) => Display::fmt(s, f), @@ -56,6 +60,7 @@ pub fn statement(i: &str) -> ParseResult<&str, Statement> { alt(( map(delete_statement, |s| Statement::Delete(Box::new(s))), map(drop_statement, |s| Statement::DropMeasurement(Box::new(s))), + map(explain_statement, |s| Statement::Explain(Box::new(s))), map(select_statement, |s| Statement::Select(Box::new(s))), show_statement, ))(i) @@ -77,6 +82,10 @@ mod test { let (got, _) = statement("DROP MEASUREMENT foo").unwrap(); assert_eq!(got, ""); + // explain_statement combinator + let (got, _) = statement("EXPLAIN SELECT * FROM cpu").unwrap(); + assert_eq!(got, ""); + let (got, _) = statement("SELECT * FROM foo WHERE time > now() - 5m AND host = 'bar' GROUP BY TIME(5m) FILL(previous) ORDER BY time DESC").unwrap(); assert_eq!(got, ""); From 3ff48152c9c495ac23cd459453f583c79dfbb134 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Oct 2022 01:56:00 +0000 Subject: [PATCH 03/40] chore(deps): Bump smallvec from 1.9.0 to 1.10.0 Bumps [smallvec](https://github.com/servo/rust-smallvec) from 1.9.0 to 1.10.0. - [Release notes](https://github.com/servo/rust-smallvec/releases) - [Commits](https://github.com/servo/rust-smallvec/compare/v1.9.0...v1.10.0) --- updated-dependencies: - dependency-name: smallvec dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- influxdb_line_protocol/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f33043592..365df27e60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4653,9 +4653,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" diff --git a/influxdb_line_protocol/Cargo.toml b/influxdb_line_protocol/Cargo.toml index f82103288d..aae56dd1db 100644 --- a/influxdb_line_protocol/Cargo.toml +++ b/influxdb_line_protocol/Cargo.toml @@ -14,7 +14,7 @@ ffi = ["libc"] bytes = "1.2" libc = { version = "0.2", optional = true } nom = { version = "7", default-features = false, features = ["std"] } -smallvec = { version = "1.9.0", features = ["union"] } +smallvec = { version = "1.10.0", features = ["union"] } snafu = "0.7" observability_deps = { path = "../observability_deps" } workspace-hack = { path = "../workspace-hack"} From e7a32543787fe293d3cafc344cd0fcfe60cf48a6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Oct 2022 09:09:00 +0000 Subject: [PATCH 04/40] chore(deps): Bump ordered-float from 3.1.0 to 3.2.0 (#5784) Bumps [ordered-float](https://github.com/reem/rust-ordered-float) from 3.1.0 to 3.2.0. - [Release notes](https://github.com/reem/rust-ordered-float/releases) - [Commits](https://github.com/reem/rust-ordered-float/compare/v3.1.0...v3.2.0) --- updated-dependencies: - dependency-name: ordered-float dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 365df27e60..3fa55e8786 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,7 +1031,7 @@ dependencies = [ "influxdb_line_protocol", "iox_time", "observability_deps", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "percent-encoding", "schema", "serde", @@ -1075,7 +1075,7 @@ dependencies = [ "log", "num_cpus", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parking_lot 0.12.1", "parquet", "paste", @@ -1097,7 +1097,7 @@ source = "git+https://github.com/apache/arrow-datafusion.git?rev=c7f3a70a79ee840 dependencies = [ "arrow", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parquet", "sqlparser 0.23.0", ] @@ -1144,7 +1144,7 @@ dependencies = [ "hashbrown", "lazy_static", "md-5", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "paste", "rand", "regex", @@ -3191,9 +3191,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a" +checksum = "129d36517b53c461acc6e1580aeb919c8ae6708a4b1eae61c4463a615d4f0411" dependencies = [ "num-traits", ] From 20451921d0d76ec8c7227cba380d2a2181f563ce Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 3 Oct 2022 11:52:14 +0200 Subject: [PATCH 05/40] test: MockLifecycleHandle captures calls Changes the NoopLifecycleHandle to MockLifecycleCall, and adds code causing it to log all calls made to the log_write() method. This will allow tests to assert calls and their values in DML buffering tests. 
--- ingester/src/lifecycle/mock_handle.rs | 66 +++++++++++++++++++++------ query_tests/src/scenarios/util.rs | 4 +- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/ingester/src/lifecycle/mock_handle.rs b/ingester/src/lifecycle/mock_handle.rs index d5b889c4af..bec4af5ce0 100644 --- a/ingester/src/lifecycle/mock_handle.rs +++ b/ingester/src/lifecycle/mock_handle.rs @@ -1,26 +1,66 @@ //! A mock [`LifecycleHandle`] impl for testing. +use std::sync::Arc; + use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; +use parking_lot::Mutex; use super::LifecycleHandle; -/// Special [`LifecycleHandle`] that never persists and always accepts more data. -/// -/// This is useful to control persists manually. -#[derive(Debug, Default, Clone, Copy)] -pub struct NoopLifecycleHandle; +/// A set of arguments captured from a call to +/// [`MockLifecycleHandle::log_write()`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(missing_docs)] +pub struct MockLifecycleCall { + pub partition_id: PartitionId, + pub shard_id: ShardId, + pub namespace_id: NamespaceId, + pub table_id: TableId, + pub sequence_number: SequenceNumber, + pub bytes_written: usize, + pub rows_written: usize, +} -impl LifecycleHandle for NoopLifecycleHandle { +/// A mock [`LifecycleHandle`] implementation that records calls made to +/// [`Self::log_write()`] and never blocks ingest, always accepting more data. +/// +/// # Cloning +/// +/// Cloning a [`MockLifecycleHandle`] will clone the inner state - calls to all +/// cloned instances are reported in a call to [`Self::get_log_calls()`]. +#[derive(Debug, Default, Clone)] +pub struct MockLifecycleHandle { + log_calls: Arc>>, +} + +impl MockLifecycleHandle { + /// Returns the ordered [`Self::log_write()`] calls made to this mock. 
+ pub fn get_log_calls(&self) -> Vec { + self.log_calls.lock().clone() + } +} + +impl LifecycleHandle for MockLifecycleHandle { fn log_write( &self, - _partition_id: PartitionId, - _shard_id: ShardId, - _namespace_id: NamespaceId, - _table_id: TableId, - _sequence_number: SequenceNumber, - _bytes_written: usize, - _rows_written: usize, + partition_id: PartitionId, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + sequence_number: SequenceNumber, + bytes_written: usize, + rows_written: usize, ) -> bool { + self.log_calls.lock().push(MockLifecycleCall { + partition_id, + shard_id, + namespace_id, + table_id, + sequence_number, + bytes_written, + rows_written, + }); + // do NOT pause ingest false } diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index f9a687f03a..a46200101b 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -18,7 +18,7 @@ use ingester::{ partition::resolver::CatalogPartitionResolver, FlatIngesterQueryResponse, IngesterData, IngesterQueryResponse, Persister, }, - lifecycle::mock_handle::NoopLifecycleHandle, + lifecycle::mock_handle::MockLifecycleHandle, querier_handler::prepare_data_to_querier, }; use iox_catalog::interface::get_schema_by_name; @@ -722,7 +722,7 @@ impl MockIngester { /// Takes `&self mut` because our partioning implementation does not work with concurrent /// access. async fn buffer_operation(&mut self, dml_operation: DmlOperation) { - let lifecycle_handle = NoopLifecycleHandle {}; + let lifecycle_handle = MockLifecycleHandle::default(); let should_pause = self .ingester_data From b23ad317114ba66532c146e56481ec367c82e3c3 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 3 Oct 2022 12:08:33 +0200 Subject: [PATCH 06/40] fix: spurious memory accounting for failed write Fixes a case where the ingester may incorrectly record a write as having been buffered in memory, when in fact the buffering failed. 
This could cause the effective buffer size to be reduced over time as more and more data is spuriously "added" to the buffer, but never released back to the memory tracker as it is never persisted. --- ingester/src/data/table.rs | 152 ++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 3 deletions(-) diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 89127d04bf..186377269c 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -121,16 +121,19 @@ impl TableData { } } + let size = batch.size(); + let rows = batch.rows(); + partition_data.buffer_write(sequence_number, batch)?; + let should_pause = lifecycle_handle.log_write( partition_data.id(), self.shard_id, self.namespace_id, self.table_id, sequence_number, - batch.size(), - batch.rows(), + size, + rows, ); - partition_data.buffer_write(sequence_number, batch)?; Ok(should_pause) } @@ -207,3 +210,146 @@ impl TableData { self.table_id } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use assert_matches::assert_matches; + use data_types::{PartitionId, ShardIndex}; + use mutable_batch::writer; + use mutable_batch_lp::lines_to_batches; + use schema::{InfluxColumnType, InfluxFieldType}; + + use crate::{ + data::{ + partition::{resolver::MockPartitionProvider, PartitionData}, + Error, + }, + lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle}, + test_util::populate_catalog, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + const PARTITION_ID: PartitionId = PartitionId::new(0); + + #[tokio::test] + async fn test_bad_write_memory_counting() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + 
populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. + let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME, + shard_id, + ns_id, + None, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Initialise the mock lifecycle handle and use it to inspect the calls + // made to the lifecycle manager during buffering. + let handle = MockLifecycleHandle::default(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.get(&PARTITION_KEY.into()).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.get(&PARTITION_KEY.into()).is_some()); + + // And the lifecycle handle was called with the expected values + assert_eq!( + handle.get_log_calls(), + &[MockLifecycleCall { + partition_id: PARTITION_ID, + shard_id, + namespace_id: ns_id, + table_id, + sequence_number: SequenceNumber::new(42), + bytes_written: 1131, + rows_written: 1, + }] + ); + + // Attempt to buffer the second op that contains a type conflict - this + // should return an error, and not make a call to the lifecycle handle + // (as no data was buffered) + // + // Note the type of value was numeric previously, and here it is a string. 
+ let batch = lines_to_batches(r#"bananas,bat=man value="platanos" 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + let err = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect_err("type conflict should error"); + + // The buffer op should return a column type error + assert_matches!( + err, + Error::BufferWrite { + source: mutable_batch::Error::WriterError { + source: writer::Error::TypeMismatch { + existing: InfluxColumnType::Field(InfluxFieldType::Float), + inserted: InfluxColumnType::Field(InfluxFieldType::String), + column: col_name, + } + }, + } => { assert_eq!(col_name, "value") } + ); + + // And the lifecycle handle should not be called. + // + // It still contains the first call, so the desired length is 1 + // indicating no second call was made. + assert_eq!(handle.get_log_calls().len(), 1); + } +} From 7efd81a63a48c43af15c394123b07863edaecdaf Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 3 Oct 2022 12:23:30 +0200 Subject: [PATCH 07/40] docs: comment write record ordering --- ingester/src/data/table.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 186377269c..a8f295c5eb 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -125,6 +125,11 @@ impl TableData { let rows = batch.rows(); partition_data.buffer_write(sequence_number, batch)?; + // Record the write as having been buffered. + // + // This should happen AFTER the write is applied, because buffering the + // op may fail which would lead to a write being recorded, but not + // applied. 
let should_pause = lifecycle_handle.log_write( partition_data.id(), self.shard_id, From 9c0e4e98c4130ae9a86ca092de0cc8694fe5de32 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Fri, 30 Sep 2022 14:10:02 +0200 Subject: [PATCH 08/40] refactor: ref TableData by name & ID Changes the NamespaceData to hold a map of table name -> TableData, and table ID -> TableData simultaneously. This allows for cheap lookups when the caller holds an ID, and is part of preparatory work to transition away from using string names in the ingester for tables. This commit also switches from a BTreeMap to a HashMap as the backing collection, as maintaining key ordering doesn't appear to be necessary. --- ingester/src/data/namespace.rs | 161 ++++++++++++++++++++++++++++----- ingester/src/data/table.rs | 7 +- 2 files changed, 143 insertions(+), 25 deletions(-) diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 6a5ddb9581..515956f5f4 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -1,11 +1,8 @@ //! Namespace level data buffer structures. -use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId}; +use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId}; use dml::DmlOperation; use iox_catalog::interface::Catalog; use iox_query::exec::Executor; @@ -16,12 +13,38 @@ use write_summary::ShardProgress; #[cfg(test)] use super::triggers::TestTriggers; -use super::{ - partition::{resolver::PartitionProvider, PersistingBatch}, - table::TableData, -}; +use super::{partition::resolver::PartitionProvider, table::TableData}; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`TableData`] can be looked up by name, or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. 
+ by_name: HashMap, Arc>>, + by_id: HashMap>>, +} + +impl DoubleRef { + fn insert(&mut self, t: TableData) -> Arc> { + let name = Arc::clone(t.table_name()); + let id = t.table_id(); + + let t = Arc::new(tokio::sync::RwLock::new(t)); + self.by_name.insert(name, Arc::clone(&t)); + self.by_id.insert(id, Arc::clone(&t)); + t + } + + fn by_name(&self, name: &str) -> Option>> { + self.by_name.get(name).map(Arc::clone) + } + + #[cfg(test)] + fn by_id(&self, id: TableId) -> Option>> { + self.by_id.get(&id).map(Arc::clone) + } +} + /// Data of a Namespace that belongs to a given Shard #[derive(Debug)] pub(crate) struct NamespaceData { @@ -30,7 +53,7 @@ pub(crate) struct NamespaceData { /// The catalog ID of the shard this namespace is being populated from. shard_id: ShardId, - tables: RwLock>>>, + tables: RwLock, table_count: U64Counter, /// The resolver of `(shard_id, table_id, partition_key)` to @@ -198,7 +221,7 @@ impl NamespaceData { partition_key: &PartitionKey, ) -> Option<( Vec>, - Option>, + Option>, )> { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; @@ -221,7 +244,7 @@ impl NamespaceData { &self, table_name: &str, partition_key: &PartitionKey, - ) -> Option> { + ) -> Option> { if let Some(table_data) = self.table_data(table_name) { let mut table_data = table_data.write().await; @@ -240,7 +263,17 @@ impl NamespaceData { table_name: &str, ) -> Option>> { let t = self.tables.read(); - t.get(table_name).cloned() + t.by_name(table_name) + } + + /// Return the table data by ID. 
+ #[cfg(test)] + pub(crate) fn table_id( + &self, + table_id: TableId, + ) -> Option>> { + let t = self.tables.read(); + t.by_id(table_id) } /// Inserts the table or returns it if it happens to be inserted by some other thread @@ -259,23 +292,22 @@ impl NamespaceData { let mut t = self.tables.write(); - let data = match t.entry(table_name.to_string()) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new( + Ok(match t.by_name(table_name) { + Some(v) => v, + None => { + self.table_count.inc(1); + + // Insert the table and then return a ref to it. + t.insert(TableData::new( info.table_id, table_name, self.shard_id, self.namespace_id, info.tombstone_max_sequence_number, Arc::clone(&self.partition_provider), - )))); - self.table_count.inc(1); - Arc::clone(v) + )) } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - - Ok(data) + }) } /// Walks down the table and partition and clears the persisting batch. The sequence number is @@ -299,7 +331,7 @@ impl NamespaceData { /// Return progress from this Namespace pub(super) async fn progress(&self) -> ShardProgress { - let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect(); + let tables: Vec<_> = self.tables.read().by_id.values().map(Arc::clone).collect(); // Consolidate progtress across partitions. 
let mut progress = ShardProgress::new() @@ -357,3 +389,84 @@ impl<'a> Drop for ScopedSequenceNumber<'a> { *buffering_sequence_number = None; } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_namespace_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + let exec = Executor::new(1); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + None, + ), + )); + + let ns = NamespaceData::new(ns_id, shard_id, partition_provider, &*metrics); + + // Assert the namespace does not contain the test data + assert!(ns.table_data(TABLE_NAME).is_none()); + assert!(ns.table_id(table_id).is_none()); + + // Write some test data + ns.buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + &exec, + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(ns.table_data(TABLE_NAME).is_some()); + assert!(ns.table_id(table_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_tables_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index a8f295c5eb..7dc4536075 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -210,10 +210,15 @@ impl TableData { }) } - #[cfg(test)] + /// Returns the table ID for this partition. pub(super) fn table_id(&self) -> TableId { self.table_id } + + /// Returns the name of this table. + pub(crate) fn table_name(&self) -> &Arc { + &self.table_name + } } #[cfg(test)] From 66e05b5ea7682e5f0347312dd2d228f2180180db Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Fri, 30 Sep 2022 14:23:34 +0200 Subject: [PATCH 09/40] refactor: ref NamespaceData by name & ID Changes the ShardData to hold a map of namespace name -> NamespaceData, and namespace ID -> NamespaceData simultaneously. 
This allows for cheap lookups when the caller holds an ID, and is part of preparatory work to transition away from using string names in the ingester for tables. This commit also switches from a BTreeMap to a HashMap as the backing collection, as maintaining key ordering doesn't appear to be necessary. --- ingester/src/data/shard.rs | 172 ++++++++++++++++++++++++++++++++----- 1 file changed, 151 insertions(+), 21 deletions(-) diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 76fa44ab8b..3390b2aed8 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -1,11 +1,8 @@ //! Shard level data buffer structures. -use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{ShardId, ShardIndex}; +use data_types::{NamespaceId, ShardId, ShardIndex}; use dml::DmlOperation; use iox_catalog::interface::Catalog; use iox_query::exec::Executor; @@ -17,6 +14,34 @@ use write_summary::ShardProgress; use super::{namespace::NamespaceData, partition::resolver::PartitionProvider}; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`NamespaceData`] can be looked up by name, or +/// ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. 
+ by_name: HashMap>, + by_id: HashMap>, +} + +impl DoubleRef { + fn insert(&mut self, name: String, ns: NamespaceData) -> Arc { + let id = ns.namespace_id(); + + let ns = Arc::new(ns); + self.by_name.insert(name, Arc::clone(&ns)); + self.by_id.insert(id, Arc::clone(&ns)); + ns + } + + fn by_name(&self, name: &str) -> Option> { + self.by_name.get(name).map(Arc::clone) + } + + fn by_id(&self, id: NamespaceId) -> Option> { + self.by_id.get(&id).map(Arc::clone) + } +} + /// Data of a Shard #[derive(Debug)] pub(crate) struct ShardData { @@ -32,7 +57,7 @@ pub(crate) struct ShardData { partition_provider: Arc, // New namespaces can come in at any time so we need to be able to add new ones - namespaces: RwLock>>, + namespaces: RwLock, metrics: Arc, namespace_count: U64Counter, @@ -90,7 +115,17 @@ impl ShardData { /// Gets the namespace data out of the map pub(crate) fn namespace(&self, namespace: &str) -> Option> { let n = self.namespaces.read(); - n.get(namespace).cloned() + n.by_name(namespace) + } + + /// Gets the namespace data out of the map + pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option> { + // TODO: this should be the default once IDs are pushed over the wire. + // + // At which point the map should be indexed by IDs, instead of namespace + // names. + let n = self.namespaces.read(); + n.by_id(namespace_id) } /// Retrieves the namespace from the catalog and initializes an empty buffer, or @@ -110,26 +145,34 @@ impl ShardData { let mut n = self.namespaces.write(); - let data = match n.entry(namespace.name) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(NamespaceData::new( - namespace.id, - self.shard_id, - Arc::clone(&self.partition_provider), - &*self.metrics, - ))); + Ok(match n.by_name(&namespace.name) { + Some(v) => v, + None => { self.namespace_count.inc(1); - Arc::clone(v) - } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - Ok(data) + // Insert the table and then return a ref to it. 
+ n.insert( + namespace.name, + NamespaceData::new( + namespace.id, + self.shard_id, + Arc::clone(&self.partition_provider), + &*self.metrics, + ), + ) + } + }) } /// Return the progress of this shard pub(super) async fn progress(&self) -> ShardProgress { - let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect(); + let namespaces: Vec<_> = self + .namespaces + .read() + .by_id + .values() + .map(Arc::clone) + .collect(); let mut progress = ShardProgress::new(); @@ -144,3 +187,90 @@ impl ShardData { self.shard_index } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, PartitionKey, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_shard_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + let exec = Executor::new(1); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + None, + ), + )); + + let shard = ShardData::new( + SHARD_INDEX, + shard_id, + partition_provider, + Arc::clone(&metrics), + ); + + // Assert the namespace does not contain the test data + assert!(shard.namespace(NAMESPACE_NAME).is_none()); + assert!(shard.namespace_by_id(ns_id).is_none()); + + // Write some test data + shard + .buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + &exec, + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(shard.namespace(NAMESPACE_NAME).is_some()); + assert!(shard.namespace_by_id(ns_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_namespaces_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} From 0847cc54584c4fad4ed78576ea8f0eac9088da04 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Fri, 30 Sep 2022 14:35:38 +0200 Subject: [PATCH 10/40] refactor: PartitionData::id() -> partition_id() Consistent naming is consistent - all the others are thing_id(). 
--- ingester/src/data.rs | 4 ++-- ingester/src/data/partition.rs | 2 +- ingester/src/data/partition/resolver/cache.rs | 10 +++++----- ingester/src/data/partition/resolver/trait.rs | 2 +- ingester/src/data/shard.rs | 2 ++ ingester/src/data/table.rs | 4 ++-- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 7c4a48386f..fc25b906d4 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -811,7 +811,7 @@ mod tests { assert!(n.table_data("mem").is_some()); let mem_table = mem_table.write().await; let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); - p.id() + p.partition_id() }; data.persist(partition_id).await; @@ -955,7 +955,7 @@ mod tests { let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); table_id = mem_table.table_id(); - partition_id = p.id(); + partition_id = p.partition_id(); } { // verify the partition doesn't have a sort key before any data has been persisted diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 1ec531fdbc..009018820a 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -302,7 +302,7 @@ impl PartitionData { self.data.progress() } - pub(super) fn id(&self) -> PartitionId { + pub(super) fn partition_id(&self) -> PartitionId { self.id } diff --git a/ingester/src/data/partition/resolver/cache.rs b/ingester/src/data/partition/resolver/cache.rs index 0dda53f057..8755e73d90 100644 --- a/ingester/src/data/partition/resolver/cache.rs +++ b/ingester/src/data/partition/resolver/cache.rs @@ -221,7 +221,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -255,7 +255,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); 
assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -307,7 +307,7 @@ mod tests { ) .await; - assert_eq!(got.id(), other_key_id); + assert_eq!(got.partition_id(), other_key_id); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -346,7 +346,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), other_table); assert_eq!(got.table_name(), TABLE_NAME); @@ -385,7 +385,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), other_shard); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); diff --git a/ingester/src/data/partition/resolver/trait.rs b/ingester/src/data/partition/resolver/trait.rs index c18ccdf1a2..ab89d6753e 100644 --- a/ingester/src/data/partition/resolver/trait.rs +++ b/ingester/src/data/partition/resolver/trait.rs @@ -82,7 +82,7 @@ mod tests { Arc::clone(&table_name), ) .await; - assert_eq!(got.id(), partition); + assert_eq!(got.partition_id(), partition); assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); } diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 3390b2aed8..ff32804520 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -37,6 +37,7 @@ impl DoubleRef { self.by_name.get(name).map(Arc::clone) } + #[cfg(test)] fn by_id(&self, id: NamespaceId) -> Option> { self.by_id.get(&id).map(Arc::clone) } @@ -119,6 +120,7 @@ impl ShardData { } /// Gets the namespace data out of the map + #[cfg(test)] pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option> { // TODO: this should be the default once IDs are pushed over the wire. 
// diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 7dc4536075..9eb05bae27 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -131,7 +131,7 @@ impl TableData { // op may fail which would lead to a write being recorded, but not // applied. let should_pause = lifecycle_handle.log_write( - partition_data.id(), + partition_data.partition_id(), self.shard_id, self.namespace_id, self.table_id, @@ -182,7 +182,7 @@ impl TableData { self.partition_data .values() .map(|p| UnpersistedPartitionData { - partition_id: p.id(), + partition_id: p.partition_id(), non_persisted: p .get_non_persisting_data() .expect("get_non_persisting should always work"), From f9bf86927d6bfcd229fb43a7ff9350e5db2a8135 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Fri, 30 Sep 2022 15:14:33 +0200 Subject: [PATCH 11/40] refactor: ref PartitionData by key & ID Changes the TableData to hold a map of partition key -> PartitionData, and partition ID -> PartitionData simultaneously. This allows for cheap lookups when the caller holds an ID. This commit also manages to internalise the partition map within the TableData - one less pub / peeking! This commit also switches from a BTreeMap to a HashMap as the backing collection, as maintaining key ordering doesn't appear to be necessary. 
--- ingester/src/data.rs | 12 ++- ingester/src/data/namespace.rs | 7 +- ingester/src/data/table.rs | 147 +++++++++++++++++++++++++++++---- 3 files changed, 144 insertions(+), 22 deletions(-) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index fc25b906d4..9c26a09a20 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -810,7 +810,9 @@ mod tests { let mem_table = n.table_data("mem").unwrap(); assert!(n.table_data("mem").is_some()); let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); p.partition_id() }; @@ -952,7 +954,9 @@ mod tests { let mem_table = n.table_data("mem").unwrap(); assert!(n.table_data("cpu").is_some()); let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); table_id = mem_table.table_id(); partition_id = p.partition_id(); @@ -1352,7 +1356,7 @@ mod tests { { let table_data = data.table_data("mem").unwrap(); let table = table_data.read().await; - let p = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( p.max_persisted_sequence_number(), Some(SequenceNumber::new(1)) @@ -1368,7 +1372,7 @@ mod tests { let table_data = data.table_data("mem").unwrap(); let table = table_data.read().await; - let partition = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( partition.data.buffer.as_ref().unwrap().min_sequence_number, SequenceNumber::new(2) diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 515956f5f4..6c0be9bc6b 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -226,7 +226,7 @@ impl NamespaceData { if let Some(t) = 
self.table_data(table_name) { let mut t = t.write().await; - return t.partition_data.get_mut(partition_key).map(|p| { + return t.get_partition_by_key_mut(partition_key).map(|p| { p.data .generate_snapshot() .expect("snapshot on mutable batch should never fail"); @@ -249,8 +249,7 @@ impl NamespaceData { let mut table_data = table_data.write().await; return table_data - .partition_data - .get_mut(partition_key) + .get_partition_by_key_mut(partition_key) .and_then(|partition_data| partition_data.snapshot_to_persisting_batch()); } @@ -321,7 +320,7 @@ impl NamespaceData { ) { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; - let partition = t.partition_data.get_mut(partition_key); + let partition = t.get_partition_by_key_mut(partition_key); if let Some(p) = partition { p.mark_persisted(sequence_number); diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 9eb05bae27..000d4d1973 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -1,9 +1,10 @@ //! Table level data buffer structures. -use std::{collections::BTreeMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc}; use data_types::{ - DeletePredicate, NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId, Timestamp, + DeletePredicate, NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, + Timestamp, }; use iox_catalog::interface::Catalog; use iox_query::exec::Executor; @@ -16,6 +17,39 @@ use super::partition::{ }; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`PartitionData`] can be looked up by +/// [`PartitionKey`], or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. 
+ by_key: HashMap, + by_id: HashMap, +} + +impl DoubleRef { + fn insert(&mut self, ns: PartitionData) { + let id = ns.partition_id(); + let key = ns.partition_key().clone(); + + assert!(self.by_key.insert(key.clone(), ns).is_none()); + assert!(self.by_id.insert(id, key).is_none()); + } + + #[cfg(test)] + fn by_key(&self, key: &PartitionKey) -> Option<&PartitionData> { + self.by_key.get(key) + } + + fn by_key_mut(&mut self, key: &PartitionKey) -> Option<&mut PartitionData> { + self.by_key.get_mut(key) + } + + fn by_id_mut(&mut self, id: PartitionId) -> Option<&mut PartitionData> { + let key = self.by_id.get(&id)?.clone(); + self.by_key_mut(&key) + } +} + /// Data of a Table in a given Namesapce that belongs to a given Shard #[derive(Debug)] pub(crate) struct TableData { @@ -34,8 +68,8 @@ pub(crate) struct TableData { /// `(key, shard, table)` triplet. partition_provider: Arc, - // Map pf partition key to its data - pub(super) partition_data: BTreeMap, + // Map of partition key to its data + partition_data: DoubleRef, } impl TableData { @@ -71,6 +105,7 @@ impl TableData { /// Return parquet_max_sequence_number pub(super) fn parquet_max_sequence_number(&self) -> Option { self.partition_data + .by_key .values() .map(|p| p.max_persisted_sequence_number()) .max() @@ -92,7 +127,7 @@ impl TableData { partition_key: PartitionKey, lifecycle_handle: &dyn LifecycleHandle, ) -> Result { - let partition_data = match self.partition_data.get_mut(&partition_key) { + let partition_data = match self.partition_data.by_key.get_mut(&partition_key) { Some(p) => p, None => { let p = self @@ -105,12 +140,9 @@ impl TableData { Arc::clone(&self.table_name), ) .await; - // Add the partition to the map. - assert!(self - .partition_data - .insert(partition_key.clone(), p) - .is_none()); - self.partition_data.get_mut(&partition_key).unwrap() + // Add the double-referenced partition to the map. 
+ self.partition_data.insert(p); + self.partition_data.by_key_mut(&partition_key).unwrap() } }; @@ -171,15 +203,42 @@ impl TableData { self.tombstone_max_sequence_number = Some(sequence_number); // modify one partition at a time - for data in self.partition_data.values_mut() { + for data in self.partition_data.by_key.values_mut() { data.buffer_tombstone(executor, tombstone.clone()).await; } Ok(()) } + /// Return the [`PartitionData`] for the specified ID. + #[allow(unused)] + pub(crate) fn get_partition( + &mut self, + partition_id: PartitionId, + ) -> Option<&mut PartitionData> { + self.partition_data.by_id_mut(partition_id) + } + + /// Return the [`PartitionData`] for the specified partition key. + #[cfg(test)] + pub(crate) fn get_partition_by_key( + &self, + partition_key: &PartitionKey, + ) -> Option<&PartitionData> { + self.partition_data.by_key(partition_key) + } + + /// Return the [`PartitionData`] for the specified partition key. + pub(crate) fn get_partition_by_key_mut( + &mut self, + partition_key: &PartitionKey, + ) -> Option<&mut PartitionData> { + self.partition_data.by_key_mut(partition_key) + } + pub(crate) fn unpersisted_partition_data(&self) -> Vec { self.partition_data + .by_key .values() .map(|p| UnpersistedPartitionData { partition_id: p.partition_id(), @@ -204,6 +263,7 @@ impl TableData { }; self.partition_data + .by_key .values() .fold(progress, |progress, partition_data| { progress.combine(partition_data.progress()) @@ -248,6 +308,65 @@ mod tests { const PARTITION_KEY: &str = "platanos"; const PARTITION_ID: PartitionId = PartitionId::new(0); + #[tokio::test] + async fn test_partition_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock 
partition provider to return a partition for this + // table ID. + let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME, + shard_id, + ns_id, + None, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_some()); + } + #[tokio::test] async fn test_bad_write_memory_counting() { let metrics = Arc::new(metric::Registry::default()); @@ -291,7 +410,7 @@ mod tests { let handle = MockLifecycleHandle::default(); // Assert the table does not contain the test partition - assert!(table.partition_data.get(&PARTITION_KEY.into()).is_none()); + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); // Write some test data let pause = table @@ -306,7 +425,7 @@ mod tests { assert!(!pause); // Referencing the partition should succeed - assert!(table.partition_data.get(&PARTITION_KEY.into()).is_some()); + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); // And the lifecycle handle was called with the expected values assert_eq!( From 1a7eb47b8109deb0eef5366ae758640902c2b5ac Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Fri, 30 Sep 
2022 15:18:50 +0200 Subject: [PATCH 12/40] refactor: persist() passes all necessary IDs This commit changes the persist() call so that it passes through all relevant IDs so that the impl can locate the partition in the buffer tree - this will enable elimination of many queries against the catalog in the future. This commit also cleans up the persist() impl, deferring queries until the result will be used to avoid unnecessary load, improves logging & error handling, and documents a TOCTOU bug in code: https://github.com/influxdata/influxdb_iox/issues/5777 --- ingester/src/compact.rs | 17 +- ingester/src/data.rs | 422 +++++++++++++++--------------- ingester/src/data/namespace.rs | 3 +- ingester/src/data/shard.rs | 2 - ingester/src/lifecycle.rs | 38 ++- query_tests/src/scenarios/util.rs | 27 +- 6 files changed, 281 insertions(+), 228 deletions(-) diff --git a/ingester/src/compact.rs b/ingester/src/compact.rs index 040a1c983c..ce516ffe85 100644 --- a/ingester/src/compact.rs +++ b/ingester/src/compact.rs @@ -86,11 +86,8 @@ pub(crate) async fn compact_persisting_batch( namespace_id: i64, partition_info: &PartitionInfo, batch: Arc, -) -> Result> { - // Nothing to compact - if batch.data.data.is_empty() { - return Ok(None); - } +) -> Result { + assert!(!batch.data.data.is_empty()); let namespace_name = &partition_info.namespace_name; let table_name = &partition_info.table_name; @@ -141,11 +138,11 @@ pub(crate) async fn compact_persisting_batch( sort_key: Some(metadata_sort_key), }; - Ok(Some(CompactedStream { + Ok(CompactedStream { stream, iox_metadata, sort_key_update, - })) + }) } /// Compact a given Queryable Batch @@ -254,7 +251,6 @@ mod tests { let CompactedStream { stream, .. 
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -328,7 +324,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -426,7 +421,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -527,7 +521,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -629,7 +622,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -739,7 +731,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 9c26a09a20..89486ed2df 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -6,7 +6,7 @@ use arrow::{error::ArrowError, record_batch::RecordBatch}; use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use async_trait::async_trait; use backoff::{Backoff, BackoffConfig}; -use data_types::{PartitionId, SequenceNumber, ShardId, ShardIndex}; +use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId}; use datafusion::physical_plan::SendableRecordBatchStream; use dml::DmlOperation; use 
futures::{Stream, StreamExt}; @@ -220,7 +220,13 @@ impl IngesterData { #[async_trait] pub trait Persister: Send + Sync + 'static { /// Persits the partition ID. Will retry forever until it succeeds. - async fn persist(&self, partition_id: PartitionId); + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ); /// Updates the shard's `min_unpersisted_sequence_number` in the catalog. /// This number represents the minimum that might be unpersisted, which is the @@ -235,7 +241,69 @@ pub trait Persister: Send + Sync + 'static { #[async_trait] impl Persister for IngesterData { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + // lookup the state from the ingester data. If something isn't found, + // it's unexpected. Crash so someone can take a look. + let shard_data = self + .shards + .get(&shard_id) + .unwrap_or_else(|| panic!("shard state for {shard_id} not in ingester data")); + let namespace = shard_data + .namespace_by_id(namespace_id) + .unwrap_or_else(|| panic!("namespace {namespace_id} not in shard {shard_id} state")); + + let partition_key; + let batch; + { + let table_data = namespace.table_id(table_id).unwrap_or_else(|| { + panic!("table {table_id} in namespace {namespace_id} not in shard {shard_id} state") + }); + + let mut guard = table_data.write().await; + let partition = guard.get_partition(partition_id).unwrap_or_else(|| { + panic!( + "partition {partition_id} in table {table_id} in namespace {namespace_id} not in shard {shard_id} state" + ) + }); + + partition_key = partition.partition_key().clone(); + batch = partition.snapshot_to_persisting_batch(); + }; + + debug!(%shard_id, %namespace_id, %table_id, %partition_id, %partition_key, "persisting partition"); + + // Check if there is any data to persist. 
+ let batch = match batch { + Some(v) if !v.data.data.is_empty() => v, + _ => { + warn!( + %shard_id, + %namespace_id, + %table_id, + %partition_id, + %partition_key, + "partition marked for persistence contains no data" + ); + return; + } + }; + + // lookup column IDs from catalog + // TODO: this can be removed once the ingester uses column IDs internally as well + let table_schema = Backoff::new(&self.backoff_config) + .retry_all_errors("get table schema", || async { + let mut repos = self.catalog.repositories().await; + get_table_schema_by_id(table_id, repos.as_mut()).await + }) + .await + .expect("retry forever"); + // lookup the partition_info from the catalog let partition_info = Backoff::new(&self.backoff_config) .retry_all_errors("get partition_info_by_id", || async { @@ -243,217 +311,158 @@ impl Persister for IngesterData { repos.partitions().partition_info_by_id(partition_id).await }) .await - .expect("retry forever"); + .expect("retry forever").unwrap_or_else(|| panic!("partition {partition_id} in table {table_id} in namespace {namespace_id} in shard {shard_id} has no partition info in catalog")); - // lookup the state from the ingester data. If something isn't found, it's unexpected. Crash - // so someone can take a look. 
- let partition_info = partition_info - .unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id)); - let shard_data = self - .shards - .get(&partition_info.partition.shard_id) - .unwrap_or_else(|| { - panic!( - "shard state for {} not in ingester data", - partition_info.partition.shard_id - ) - }); //{ - let namespace = shard_data - .namespace(&partition_info.namespace_name) - .unwrap_or_else(|| { - panic!( - "namespace {} not in shard {} state", - partition_info.namespace_name, partition_info.partition.shard_id - ) - }); - debug!(?partition_id, ?partition_info, "persisting partition"); + // do the CPU intensive work of compaction, de-duplication and sorting + let CompactedStream { + stream: record_stream, + iox_metadata, + sort_key_update, + } = compact_persisting_batch( + Arc::new(SystemProvider::new()), + &self.exec, + namespace.namespace_id().get(), + &partition_info, + Arc::clone(&batch), + ) + .await + .expect("unable to compact persisting batch"); - // lookup column IDs from catalog - // TODO: this can be removed once the ingester uses column IDs internally as well - let table_schema = Backoff::new(&self.backoff_config) - .retry_all_errors("get table schema", || async { - let mut repos = self.catalog.repositories().await; - let table = repos - .tables() - .get_by_namespace_and_name(namespace.namespace_id(), &partition_info.table_name) - .await? - .expect("table not found in catalog"); - get_table_schema_by_id(table.id, repos.as_mut()).await - }) + // Save the compacted data to a parquet file in object storage. + // + // This call retries until it completes. 
+ let (md, file_size) = self + .store + .upload(record_stream, &iox_metadata) .await - .expect("retry forever"); + .expect("unexpected fatal persist error"); - let persisting_batch = namespace - .snapshot_to_persisting( - &partition_info.table_name, - &partition_info.partition.partition_key, - ) - .await; - - if let Some(persisting_batch) = persisting_batch { - // do the CPU intensive work of compaction, de-duplication and sorting - let compacted_stream = match compact_persisting_batch( - Arc::new(SystemProvider::new()), - &self.exec, - namespace.namespace_id().get(), - &partition_info, - Arc::clone(&persisting_batch), - ) - .await - { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to compact persisting batch with error: {:?}", e); - } - Ok(Some(r)) => r, - Ok(None) => { - warn!("persist called with no data"); - return; - } - }; - let CompactedStream { - stream: record_stream, - iox_metadata, - sort_key_update, - } = compacted_stream; - - // Save the compacted data to a parquet file in object storage. - // - // This call retries until it completes. - let (md, file_size) = self - .store - .upload(record_stream, &iox_metadata) - .await - .expect("unexpected fatal persist error"); - - // Update the sort key in the catalog if there are - // additional columns BEFORE adding parquet file to the - // catalog. If the order is reversed, the querier or - // compactor may see a parquet file with an inconsistent - // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 - if let Some(new_sort_key) = sort_key_update { - let sort_key = new_sort_key.to_columns().collect::>(); - Backoff::new(&self.backoff_config) - .retry_all_errors("update_sort_key", || async { - let mut repos = self.catalog.repositories().await; - let _partition = repos - .partitions() - .update_sort_key(partition_id, &sort_key) - .await?; - // compiler insisted on getting told the type of the error :shrug: - Ok(()) as Result<(), iox_catalog::interface::Error> - }) - .await - .expect("retry forever"); - debug!( - ?partition_id, - table = partition_info.table_name, - ?new_sort_key, - "adjusted sort key during batch compact & persist" - ); - } - - // Add the parquet file to the catalog until succeed - let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { - table_schema.columns.get(name).expect("Unknown column").id - }); - - // Assert partitions are persisted in-order. - // - // It is an invariant that partitions are persisted in order so that - // both the per-shard, and per-partition watermarks are correctly - // advanced and accurate. - if let Some(last_persist) = partition_info.partition.persisted_sequence_number { - assert!( - parquet_file.max_sequence_number > last_persist, - "out of order partition persistence, persisting {}, previously persisted {}", - parquet_file.max_sequence_number.get(), - last_persist.get(), - ); - } - - // Add the parquet file to the catalog. - // - // This has the effect of allowing the queriers to "discover" the - // parquet file by polling / querying the catalog. + // Update the sort key in the catalog if there are + // additional columns BEFORE adding parquet file to the + // catalog. If the order is reversed, the querier or + // compactor may see a parquet file with an inconsistent + // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 + if let Some(new_sort_key) = sort_key_update { + let sort_key = new_sort_key.to_columns().collect::>(); Backoff::new(&self.backoff_config) - .retry_all_errors("add parquet file to catalog", || async { + .retry_all_errors("update_sort_key", || async { let mut repos = self.catalog.repositories().await; - let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; - debug!( - ?partition_id, - table_id=?parquet_file.table_id, - parquet_file_id=?parquet_file.id, - table_name=%iox_metadata.table_name, - "parquet file written to catalog" - ); + let _partition = repos + .partitions() + .update_sort_key(partition_id, &sort_key) + .await?; // compiler insisted on getting told the type of the error :shrug: Ok(()) as Result<(), iox_catalog::interface::Error> }) .await .expect("retry forever"); - - // Update the per-partition persistence watermark, so that new - // ingester instances skip the just-persisted ops during replay. - // - // This could be transactional with the above parquet insert to - // maintain catalog consistency, though in practice it is an - // unnecessary overhead - the system can tolerate replaying the ops - // that lead to this parquet file being generated, and tolerate - // creating a parquet file containing duplicate data (remedied by - // compaction). - // - // This means it is possible to observe a parquet file with a - // max_persisted_sequence_number > - // partition.persisted_sequence_number, either in-between these - // catalog updates, or for however long it takes a crashed ingester - // to restart and replay the ops, and re-persist a file containing - // the same (or subset of) data. - // - // The above is also true of the per-shard persist marker that - // governs the ingester's replay start point, which is - // non-transactionally updated after all partitions have persisted. 
- Backoff::new(&self.backoff_config) - .retry_all_errors("set partition persist marker", || async { - self.catalog - .repositories() - .await - .partitions() - .update_persisted_sequence_number( - parquet_file.partition_id, - parquet_file.max_sequence_number, - ) - .await - }) - .await - .expect("retry forever"); - - // Record metrics - let attributes = Attributes::from([( - "shard_id", - format!("{}", partition_info.partition.shard_id).into(), - )]); - self.persisted_file_size_bytes - .recorder(attributes) - .record(file_size as u64); - - // and remove the persisted data from memory - namespace - .mark_persisted( - &partition_info.table_name, - &partition_info.partition.partition_key, - iox_metadata.max_sequence_number, - ) - .await; debug!( ?partition_id, - table_name=%partition_info.table_name, - partition_key=%partition_info.partition.partition_key, - max_sequence_number=%iox_metadata.max_sequence_number.get(), - "marked partition as persisted" + table = partition_info.table_name, + ?new_sort_key, + "adjusted sort key during batch compact & persist" ); } + + // Add the parquet file to the catalog until succeed + let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { + table_schema.columns.get(name).expect("Unknown column").id + }); + + // Assert partitions are persisted in-order. + // + // It is an invariant that partitions are persisted in order so that + // both the per-shard, and per-partition watermarks are correctly + // advanced and accurate. + if let Some(last_persist) = partition_info.partition.persisted_sequence_number { + assert!( + parquet_file.max_sequence_number > last_persist, + "out of order partition persistence, persisting {}, previously persisted {}", + parquet_file.max_sequence_number.get(), + last_persist.get(), + ); + } + + // Add the parquet file to the catalog. + // + // This has the effect of allowing the queriers to "discover" the + // parquet file by polling / querying the catalog. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("add parquet file to catalog", || async { + let mut repos = self.catalog.repositories().await; + let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; + debug!( + ?partition_id, + table_id=?parquet_file.table_id, + parquet_file_id=?parquet_file.id, + table_name=%iox_metadata.table_name, + "parquet file written to catalog" + ); + // compiler insisted on getting told the type of the error :shrug: + Ok(()) as Result<(), iox_catalog::interface::Error> + }) + .await + .expect("retry forever"); + + // Update the per-partition persistence watermark, so that new + // ingester instances skip the just-persisted ops during replay. + // + // This could be transactional with the above parquet insert to + // maintain catalog consistency, though in practice it is an + // unnecessary overhead - the system can tolerate replaying the ops + // that lead to this parquet file being generated, and tolerate + // creating a parquet file containing duplicate data (remedied by + // compaction). + // + // This means it is possible to observe a parquet file with a + // max_persisted_sequence_number > + // partition.persisted_sequence_number, either in-between these + // catalog updates, or for however long it takes a crashed ingester + // to restart and replay the ops, and re-persist a file containing + // the same (or subset of) data. + // + // The above is also true of the per-shard persist marker that + // governs the ingester's replay start point, which is + // non-transactionally updated after all partitions have persisted. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("set partition persist marker", || async { + self.catalog + .repositories() + .await + .partitions() + .update_persisted_sequence_number( + parquet_file.partition_id, + parquet_file.max_sequence_number, + ) + .await + }) + .await + .expect("retry forever"); + + // Record metrics + let attributes = Attributes::from([( + "shard_id", + format!("{}", partition_info.partition.shard_id).into(), + )]); + self.persisted_file_size_bytes + .recorder(attributes) + .record(file_size as u64); + + // and remove the persisted data from memory + namespace + .mark_persisted( + &partition_info.table_name, + &partition_info.partition.partition_key, + iox_metadata.max_sequence_number, + ) + .await; + debug!( + ?partition_id, + table_name=%partition_info.table_name, + partition_key=%partition_info.partition.partition_key, + max_sequence_number=%iox_metadata.max_sequence_number.get(), + "marked partition as persisted" + ); } async fn update_min_unpersisted_sequence_number( @@ -804,7 +813,7 @@ mod tests { // limits) assert!(!should_pause); - let partition_id = { + let (table_id, partition_id) = { let sd = data.shards.get(&shard1.id).unwrap(); let n = sd.namespace("foo").unwrap(); let mem_table = n.table_data("mem").unwrap(); @@ -813,10 +822,11 @@ mod tests { let p = mem_table .get_partition_by_key(&"1970-01-01".into()) .unwrap(); - p.partition_id() + (mem_table.table_id(), p.partition_id()) }; - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store @@ -953,12 +963,13 @@ mod tests { { let mem_table = n.table_data("mem").unwrap(); assert!(n.table_data("cpu").is_some()); + let mem_table = mem_table.write().await; + table_id = mem_table.table_id(); + let p = mem_table .get_partition_by_key(&"1970-01-01".into()) .unwrap(); - - table_id = mem_table.table_id(); partition_id = p.partition_id(); } { @@ 
-973,7 +984,8 @@ mod tests { assert!(partition_info.partition.sort_key.is_empty()); } - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 6c0be9bc6b..987dad6c27 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -39,7 +39,6 @@ impl DoubleRef { self.by_name.get(name).map(Arc::clone) } - #[cfg(test)] fn by_id(&self, id: TableId) -> Option>> { self.by_id.get(&id).map(Arc::clone) } @@ -240,6 +239,7 @@ impl NamespaceData { /// Snapshots the mutable buffer for the partition, which clears it out and then moves all /// snapshots over to a persisting batch, which is returned. If there is no data to snapshot /// or persist, None will be returned. + #[cfg(test)] // Only used in tests pub(crate) async fn snapshot_to_persisting( &self, table_name: &str, @@ -266,7 +266,6 @@ impl NamespaceData { } /// Return the table data by ID. - #[cfg(test)] pub(crate) fn table_id( &self, table_id: TableId, diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index ff32804520..3390b2aed8 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -37,7 +37,6 @@ impl DoubleRef { self.by_name.get(name).map(Arc::clone) } - #[cfg(test)] fn by_id(&self, id: NamespaceId) -> Option> { self.by_id.get(&id).map(Arc::clone) } @@ -120,7 +119,6 @@ impl ShardData { } /// Gets the namespace data out of the map - #[cfg(test)] pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option> { // TODO: this should be the default once IDs are pushed over the wire. 
// diff --git a/ingester/src/lifecycle.rs b/ingester/src/lifecycle.rs index b46b84dde7..01b9ff2f33 100644 --- a/ingester/src/lifecycle.rs +++ b/ingester/src/lifecycle.rs @@ -234,7 +234,7 @@ struct LifecycleStats { } /// The stats for a partition -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] struct PartitionLifecycleStats { /// The shard this partition is under shard_id: ShardId, @@ -469,6 +469,18 @@ impl LifecycleManager { let persist_tasks: Vec<_> = to_persist .into_iter() .map(|s| { + // BUG: TOCTOU: memory usage released may be incorrect. + // + // Here the amount of memory to be reduced is acquired, but this + // code does not prevent continued writes adding more data to + // the partition in another thread. + // + // This may lead to more actual data being persisted than the + // call below returns to the server pool - this would slowly + // starve the ingester of memory it thinks it has. + // + // See https://github.com/influxdata/influxdb_iox/issues/5777 + // Mark this partition as being persisted, and remember the // memory allocation it had accumulated. let partition_memory_usage = self @@ -483,7 +495,9 @@ impl LifecycleManager { let state = Arc::clone(&self.state); tokio::task::spawn(async move { - persister.persist(s.partition_id).await; + persister + .persist(s.shard_id, s.namespace_id, s.table_id, s.partition_id) + .await; // Now the data has been uploaded and the memory it was // using has been freed, released the memory capacity back // the ingester. 
@@ -602,7 +616,13 @@ mod tests { #[async_trait] impl Persister for TestPersister { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + _shard_id: ShardId, + _namespace_id: NamespaceId, + _table_id: TableId, + partition_id: PartitionId, + ) { let mut p = self.persist_called.lock(); p.insert(partition_id); } @@ -662,8 +682,16 @@ mod tests { #[async_trait] impl Persister for PausablePersister { - async fn persist(&self, partition_id: PartitionId) { - self.inner.persist(partition_id).await; + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + self.inner + .persist(shard_id, namespace_id, table_id, partition_id) + .await; if let Some(event) = self.event(partition_id) { event.before.wait().await; event.after.wait().await; diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index a46200101b..6b2249dc20 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -752,7 +752,32 @@ impl MockIngester { .map(|f| f.id) .collect(); - self.ingester_data.persist(*partition_id).await; + let p = self + .catalog + .catalog + .repositories() + .await + .partitions() + .get_by_id(*partition_id) + .await + .unwrap() + .expect("partition not found"); + + let namespace_id = self + .catalog + .catalog + .repositories() + .await + .tables() + .get_by_id(p.table_id) + .await + .unwrap() + .expect("table does not exist") + .namespace_id; + + self.ingester_data + .persist(p.shard_id, namespace_id, p.table_id, *partition_id) + .await; result.extend( self.catalog From a11aafe25bd1f944a396a3aa3197be98e462300f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 4 Oct 2022 08:53:17 -0400 Subject: [PATCH 13/40] chore: Update SQL repl to refer to `namespace` rather than `database` (#5788) --- influxdb_iox/src/commands/sql/repl.rs | 22 +++---- influxdb_iox/src/commands/sql/repl_command.rs | 62 +++++++++---------- 2 files 
changed, 42 insertions(+), 42 deletions(-) diff --git a/influxdb_iox/src/commands/sql/repl.rs b/influxdb_iox/src/commands/sql/repl.rs index cf4cb2c8dd..129367b906 100644 --- a/influxdb_iox/src/commands/sql/repl.rs +++ b/influxdb_iox/src/commands/sql/repl.rs @@ -53,7 +53,7 @@ pub enum Error { pub type Result = std::result::Result; enum QueryEngine { - /// Run queries against the named database on the remote server + /// Run queries against the namespace on the remote server Remote(String), /// Run queries against a local `Observer` instance @@ -177,7 +177,7 @@ pub struct Repl { /// Client for running sql flight_client: influxdb_iox_client::flight::Client, - /// database name against which SQL commands are run + /// namespace name against which SQL commands are run query_engine: Option, /// Formatter to use to format query results @@ -239,8 +239,8 @@ impl Repl { .map_err(|e| println!("{}", e)) .ok(); } - ReplCommand::UseDatabase { db_name } => { - self.use_database(db_name); + ReplCommand::UseNamespace { db_name } => { + self.use_namespace(db_name); } ReplCommand::SqlCommand { sql } => { self.run_sql(sql).await.map_err(|e| println!("{}", e)).ok(); @@ -302,18 +302,18 @@ impl Repl { self.print_results(&[record_batch]) } - // Run a command against the currently selected remote database + // Run a command against the currently selected remote namespace async fn run_sql(&mut self, sql: String) -> Result<()> { let start = Instant::now(); let batches = match &mut self.query_engine { None => { - println!("Error: no database selected."); - println!("Hint: Run USE DATABASE to select database"); + println!("Error: no namespace selected."); + println!("Hint: Run USE NAMESPACE to select namespace"); return Ok(()); } Some(QueryEngine::Remote(db_name)) => { - info!(%db_name, %sql, "Running sql on remote database"); + info!(%db_name, %sql, "Running sql on remote namespace"); scrape_query(&mut self.flight_client, db_name, &sql).await? 
} @@ -349,9 +349,9 @@ impl Repl { } } - fn use_database(&mut self, db_name: String) { - info!(%db_name, "setting current database"); - println!("You are now in remote mode, querying database {}", db_name); + fn use_namespace(&mut self, db_name: String) { + info!(%db_name, "setting current namespace"); + println!("You are now in remote mode, querying namespace {}", db_name); self.set_query_engine(QueryEngine::Remote(db_name)); } diff --git a/influxdb_iox/src/commands/sql/repl_command.rs b/influxdb_iox/src/commands/sql/repl_command.rs index 37fa4fb843..56f310ed7f 100644 --- a/influxdb_iox/src/commands/sql/repl_command.rs +++ b/influxdb_iox/src/commands/sql/repl_command.rs @@ -7,7 +7,7 @@ pub enum ReplCommand { ShowNamespaces, Observer, SetFormat { format: String }, - UseDatabase { db_name: String }, + UseNamespace { db_name: String }, SqlCommand { sql: String }, Exit, } @@ -64,18 +64,18 @@ impl TryFrom<&str> for ReplCommand { ["observer"] => Ok(Self::Observer), ["exit"] => Ok(Self::Exit), ["quit"] => Ok(Self::Exit), - ["use", "database"] => { - Err("name not specified. Usage: USE DATABASE ".to_string()) - } // USE DATABASE - ["use", "database", _name] => { - // USE DATABASE - Ok(Self::UseDatabase { + ["use", "namespace"] => { + Err("name not specified. 
Usage: USE NAMESPACE ".to_string()) + } // USE NAMESPACE + ["use", "namespace", _name] => { + // USE namespace + Ok(Self::UseNamespace { db_name: raw_commands[2].to_string(), }) } ["use", _command] => { // USE - Ok(Self::UseDatabase { + Ok(Self::UseNamespace { db_name: raw_commands[1].to_string(), }) } @@ -98,9 +98,9 @@ impl ReplCommand { Available commands (not case sensitive): HELP (this one) -SHOW NAMESPACES: List databases available on the server +SHOW NAMESPACES: List namespaces available on the server -USE [DATABASE|NAMESPACE] : Set the current remote database to name +USE NAMESPACE : Set the current remote namespace to name SET FORMAT : Set the output format to Pretty, csv or json @@ -108,9 +108,9 @@ OBSERVER: Locally query unified queryable views of remote system tables [EXIT | QUIT]: Quit this session and exit the program -# Examples: use remote database foo -SHOW DATABASES; -USE DATABASE foo; +# Examples: use remote namespace foo +SHOW NAMESPACES; +USE foo; # Basic IOx SQL Primer @@ -199,35 +199,35 @@ mod tests { } #[test] - fn use_database() { - let expected = Ok(ReplCommand::UseDatabase { + fn use_namespace() { + let expected = Ok(ReplCommand::UseNamespace { db_name: "Foo".to_string(), }); assert_eq!("use Foo".try_into(), expected); - assert_eq!("use Database Foo;".try_into(), expected); - assert_eq!("use Database Foo ;".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); + assert_eq!("use Namespace Foo;".try_into(), expected); + assert_eq!("use Namespace Foo ;".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); - // ensure that database name is case sensitive - let expected = Ok(ReplCommand::UseDatabase { + // ensure that namespace name is case sensitive + let expected = Ok(ReplCommand::UseNamespace { db_name: "FOO".to_string(), }); assert_eq!("use FOO".try_into(), expected); - 
assert_eq!("use DATABASE FOO;".try_into(), expected); - assert_eq!("USE DATABASE FOO;".try_into(), expected); + assert_eq!("use NAMESPACE FOO;".try_into(), expected); + assert_eq!("USE NAMESPACE FOO;".try_into(), expected); let expected: Result = - Err("name not specified. Usage: USE DATABASE ".to_string()); - assert_eq!("use Database;".try_into(), expected); - assert_eq!("use DATABASE".try_into(), expected); - assert_eq!("use database".try_into(), expected); + Err("name not specified. Usage: USE NAMESPACE ".to_string()); + assert_eq!("use Namespace;".try_into(), expected); + assert_eq!("use NAMESPACE".try_into(), expected); + assert_eq!("use namespace".try_into(), expected); - let expected = sql_cmd("use database foo bar"); - assert_eq!("use database foo bar".try_into(), expected); + let expected = sql_cmd("use namespace foo bar"); + assert_eq!("use namespace foo bar".try_into(), expected); - let expected = sql_cmd("use database foo BAR"); - assert_eq!("use database foo BAR".try_into(), expected); + let expected = sql_cmd("use namespace foo BAR"); + assert_eq!("use namespace foo BAR".try_into(), expected); } #[test] From 9bbbf86116c7464a534968f38b1073db5213feff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 09:04:18 +0000 Subject: [PATCH 14/40] chore(deps): Bump sqlparser from 0.24.0 to 0.25.0 (#5795) Bumps [sqlparser](https://github.com/sqlparser-rs/sqlparser-rs) from 0.24.0 to 0.25.0. - [Release notes](https://github.com/sqlparser-rs/sqlparser-rs/releases) - [Changelog](https://github.com/sqlparser-rs/sqlparser-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/sqlparser-rs/sqlparser-rs/compare/v0.24.0...v0.25.0) --- updated-dependencies: - dependency-name: sqlparser dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- influxrpc_parser/Cargo.toml | 2 +- predicate/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3fa55e8786..7bc4b92f85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2166,7 +2166,7 @@ version = "0.1.0" dependencies = [ "generated_types", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "workspace-hack", ] @@ -3565,7 +3565,7 @@ dependencies = [ "schema", "serde_json", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "test_helpers", "workspace-hack", ] @@ -4732,9 +4732,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac9c312566fdfc45a38ecf1924013c82af2a7d5315e46f67b1cc987f12be260" +checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd" dependencies = [ "log", ] diff --git a/influxrpc_parser/Cargo.toml b/influxrpc_parser/Cargo.toml index 152c099d2d..7a886cf4e7 100644 --- a/influxrpc_parser/Cargo.toml +++ b/influxrpc_parser/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sqlparser = "0.24.0" +sqlparser = "0.25.0" snafu = "0.7.1" generated_types = { path = "../generated_types" } diff --git a/predicate/Cargo.toml b/predicate/Cargo.toml index 9bf303b6c1..743cc8301b 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -15,7 +15,7 @@ query_functions = { path = "../query_functions"} schema = { path = "../schema" } serde_json = "1.0.83" snafu = "0.7" -sqlparser = "0.24.0" +sqlparser = "0.25.0" workspace-hack = { path = "../workspace-hack"} [dev-dependencies] From c9a2445fd4b766f7817c2f58e1abb5ceb4fc0876 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Oct 2022 09:48:11 +0000 Subject: [PATCH 
15/40] chore(deps): Bump handlebars from 4.3.4 to 4.3.5 (#5797) * chore(deps): Bump handlebars from 4.3.4 to 4.3.5 Bumps [handlebars](https://github.com/sunng87/handlebars-rust) from 4.3.4 to 4.3.5. - [Release notes](https://github.com/sunng87/handlebars-rust/releases) - [Changelog](https://github.com/sunng87/handlebars-rust/blob/v4.3.5/CHANGELOG.md) - [Commits](https://github.com/sunng87/handlebars-rust/compare/v4.3.4...v4.3.5) --- updated-dependencies: - dependency-name: handlebars dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] * chore: Run cargo hakari tasks Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: CircleCI[bot] Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- iox_data_generator/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7bc4b92f85..488aae235c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1722,9 +1722,9 @@ dependencies = [ [[package]] name = "handlebars" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56b224eaa4987c03c30b251de7ef0c15a6a59f34222905850dbc3026dfb24d5f" +checksum = "433e4ab33f1213cdc25b5fa45c76881240cfe79284cf2b395e8b9e312a30a2fd" dependencies = [ "log", "pest", diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index 1e6625a033..ec62aad243 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -11,7 +11,7 @@ chrono = { version = "0.4", default-features = false } chrono-english = "0.1.4" clap = { version = "3", features = ["derive", "env", "cargo"] } futures = "0.3" -handlebars = "4.3.4" +handlebars = "4.3.5" humantime = "2.1.0" influxdb2_client = { path = "../influxdb2_client" } itertools = "0.10.5" From abb9122e2c60688a54516ce25e0f3f9ee2418fb5 Mon Sep 17 
00:00:00 2001 From: Dom Dwyer Date: Tue, 4 Oct 2022 15:41:25 +0200 Subject: [PATCH 16/40] refactor: carry namespace name in NamespaceData Changes the ingester's NamespaceData to carry a ref-counted string identifier as well as the ID. The backing storage for the name in NamespaceData is shared with the index map in ShardData, so it is effectively free! --- ingester/src/data.rs | 18 ++++++++----- ingester/src/data/namespace.rs | 45 ++++++++++++++++++++++++++++++++- ingester/src/data/shard.rs | 26 +++++++++++-------- ingester/src/handler.rs | 6 +++-- ingester/src/querier_handler.rs | 7 ++--- ingester/src/test_util.rs | 8 +++--- 6 files changed, 84 insertions(+), 26 deletions(-) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 89486ed2df..5bbd422ca7 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -815,7 +815,7 @@ mod tests { let (table_id, partition_id) = { let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + let n = sd.namespace(&"foo".into()).unwrap(); let mem_table = n.table_data("mem").unwrap(); assert!(n.table_data("mem").is_some()); let mem_table = mem_table.write().await; @@ -957,7 +957,7 @@ mod tests { assert_progress(&data, shard_index, expected_progress).await; let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + let n = sd.namespace(&"foo".into()).unwrap(); let partition_id; let table_id; { @@ -1193,7 +1193,7 @@ mod tests { // Get the namespace let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + let n = sd.namespace(&"foo".into()).unwrap(); let expected_progress = ShardProgress::new().with_buffered(SequenceNumber::new(1)); assert_progress(&data, shard_index, expected_progress).await; @@ -1356,7 +1356,13 @@ mod tests { let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))); - let data = NamespaceData::new(namespace.id, shard.id, partition_provider, &*metrics); + let data = 
NamespaceData::new( + namespace.id, + "foo".into(), + shard.id, + partition_provider, + &*metrics, + ); // w1 should be ignored because the per-partition replay offset is set // to 1 already, so it shouldn't be buffered and the buffer should @@ -1473,7 +1479,7 @@ mod tests { assert_eq!( data.shard(shard1.id) .unwrap() - .namespace(&namespace.name) + .namespace(&namespace.name.clone().into()) .unwrap() .table_data("mem") .unwrap() @@ -1505,7 +1511,7 @@ mod tests { assert_eq!( data.shard(shard1.id) .unwrap() - .namespace(&namespace.name) + .namespace(&namespace.name.into()) .unwrap() .table_data("mem") .unwrap() diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 987dad6c27..418b38c6db 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -44,11 +44,37 @@ impl DoubleRef { } } +/// The string name / identifier of a Namespace. +/// +/// A reference-counted, cheap clone-able string. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) struct NamespaceName(Arc); + +impl From for NamespaceName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::ops::Deref for NamespaceName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// Data of a Namespace that belongs to a given Shard #[derive(Debug)] pub(crate) struct NamespaceData { namespace_id: NamespaceId, + #[allow(dead_code)] + namespace_name: NamespaceName, + /// The catalog ID of the shard this namespace is being populated from. 
shard_id: ShardId, @@ -111,6 +137,7 @@ impl NamespaceData { /// Initialize new tables with default partition template of daily pub fn new( namespace_id: NamespaceId, + namespace_name: NamespaceName, shard_id: ShardId, partition_provider: Arc, metrics: &metric::Registry, @@ -124,6 +151,7 @@ impl NamespaceData { Self { namespace_id, + namespace_name, shard_id, tables: Default::default(), table_count, @@ -353,6 +381,12 @@ impl NamespaceData { pub(super) fn table_count(&self) -> &U64Counter { &self.table_count } + + /// Returns the [`NamespaceName`] for this namespace. + #[cfg(test)] + pub(crate) fn namespace_name(&self) -> &NamespaceName { + &self.namespace_name + } } /// RAAI struct that sets buffering sequence number on creation and clears it on free @@ -432,7 +466,16 @@ mod tests { ), )); - let ns = NamespaceData::new(ns_id, shard_id, partition_provider, &*metrics); + let ns = NamespaceData::new( + ns_id, + NAMESPACE_NAME.into(), + shard_id, + partition_provider, + &*metrics, + ); + + // Assert the namespace name was stored + assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME); // Assert the namespace does not contain the test data assert!(ns.table_data(TABLE_NAME).is_none()); diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 3390b2aed8..11432f688c 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -11,7 +11,10 @@ use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; use write_summary::ShardProgress; -use super::{namespace::NamespaceData, partition::resolver::PartitionProvider}; +use super::{ + namespace::{NamespaceData, NamespaceName}, + partition::resolver::PartitionProvider, +}; use crate::lifecycle::LifecycleHandle; /// A double-referenced map where [`NamespaceData`] can be looked up by name, or @@ -19,12 +22,12 @@ use crate::lifecycle::LifecycleHandle; #[derive(Debug, Default)] struct DoubleRef { // TODO(4880): this can be removed when IDs are sent over the wire. 
- by_name: HashMap>, + by_name: HashMap>, by_id: HashMap>, } impl DoubleRef { - fn insert(&mut self, name: String, ns: NamespaceData) -> Arc { + fn insert(&mut self, name: NamespaceName, ns: NamespaceData) -> Arc { let id = ns.namespace_id(); let ns = Arc::new(ns); @@ -33,7 +36,7 @@ impl DoubleRef { ns } - fn by_name(&self, name: &str) -> Option> { + fn by_name(&self, name: &NamespaceName) -> Option> { self.by_name.get(name).map(Arc::clone) } @@ -99,7 +102,7 @@ impl ShardData { lifecycle_handle: &dyn LifecycleHandle, executor: &Executor, ) -> Result { - let namespace_data = match self.namespace(dml_operation.namespace()) { + let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) { Some(d) => d, None => { self.insert_namespace(dml_operation.namespace(), &**catalog) @@ -113,7 +116,7 @@ impl ShardData { } /// Gets the namespace data out of the map - pub(crate) fn namespace(&self, namespace: &str) -> Option> { + pub(crate) fn namespace(&self, namespace: &NamespaceName) -> Option> { let n = self.namespaces.read(); n.by_name(namespace) } @@ -136,6 +139,8 @@ impl ShardData { catalog: &dyn Catalog, ) -> Result, super::Error> { let mut repos = catalog.repositories().await; + + let ns_name = NamespaceName::from(namespace); let namespace = repos .namespaces() .get_by_name(namespace) @@ -145,16 +150,17 @@ impl ShardData { let mut n = self.namespaces.write(); - Ok(match n.by_name(&namespace.name) { + Ok(match n.by_name(&ns_name) { Some(v) => v, None => { self.namespace_count.inc(1); // Insert the table and then return a ref to it. 
n.insert( - namespace.name, + ns_name.clone(), NamespaceData::new( namespace.id, + ns_name, self.shard_id, Arc::clone(&self.partition_provider), &*self.metrics, @@ -240,7 +246,7 @@ mod tests { ); // Assert the namespace does not contain the test data - assert!(shard.namespace(NAMESPACE_NAME).is_none()); + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_none()); assert!(shard.namespace_by_id(ns_id).is_none()); // Write some test data @@ -261,7 +267,7 @@ mod tests { .expect("buffer op should succeed"); // Both forms of referencing the table should succeed - assert!(shard.namespace(NAMESPACE_NAME).is_some()); + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_some()); assert!(shard.namespace_by_id(ns_id).is_some()); // And the table counter metric should increase diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index dde159dc52..7f51190102 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -499,11 +499,12 @@ mod tests { // give the writes some time to go through the buffer. Exit once we've verified there's // data in there from both writes. tokio::time::timeout(Duration::from_secs(2), async { + let ns_name = ingester.namespace.name.into(); loop { let mut has_measurement = false; if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) { - if let Some(data) = data.namespace(&ingester.namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await { if let Some(b) = b.first() { @@ -740,11 +741,12 @@ mod tests { // give the writes some time to go through the buffer. 
Exit once we've verified there's // data in there tokio::time::timeout(Duration::from_secs(1), async move { + let ns_name = namespace.name.into(); loop { let mut has_measurement = false; if let Some(data) = ingester.data.shard(shard.id) { - if let Some(data) = data.namespace(&namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await { if let Some(b) = b.first() { diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index d3c8e37e19..cf58daab0c 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -12,8 +12,8 @@ use snafu::{ensure, Snafu}; use crate::{ data::{ - partition::UnpersistedPartitionData, IngesterData, IngesterQueryPartition, - IngesterQueryResponse, + namespace::NamespaceName, partition::UnpersistedPartitionData, IngesterData, + IngesterQueryPartition, IngesterQueryResponse, }, query::QueryableBatch, }; @@ -57,7 +57,8 @@ pub async fn prepare_data_to_querier( let mut found_namespace = false; for (shard_id, shard_data) in ingest_data.shards() { debug!(shard_id=%shard_id.get()); - let namespace_data = match shard_data.namespace(&request.namespace) { + let namespace_name = NamespaceName::from(&request.namespace); + let namespace_data = match shard_data.namespace(&namespace_name) { Some(namespace_data) => { debug!(namespace=%request.namespace, "found namespace"); found_namespace = true; diff --git a/ingester/src/test_util.rs b/ingester/src/test_util.rs index 09045083e8..ed3f8b6348 100644 --- a/ingester/src/test_util.rs +++ b/ingester/src/test_util.rs @@ -655,7 +655,7 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) .await; @@ -664,7 +664,7 @@ pub(crate) 
async fn make_ingester_data(two_partitions: bool, loc: DataLocation) let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) .await; @@ -824,7 +824,7 @@ async fn make_one_partition_with_tombstones( let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) .await; @@ -833,7 +833,7 @@ async fn make_one_partition_with_tombstones( let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) .await; From c4c83e084040714ca4a45ee6172f5608cbba24e4 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 6 Oct 2022 10:54:01 +0200 Subject: [PATCH 17/40] fix: query error propagation (#5801) - treat OOM protection as "resource exhausted" - use `DataFusionError` in more places instead of opaque `Box` - improve conversion from/into `DataFusionError` to preserve more semantics Overall, this improves our error handling. DF can now return errors like "resource exhausted" and gRPC should now automatically generate a sensible status code for it. Fixes #5799. 
--- compactor/src/query.rs | 12 +- .../tests/end_to_end_cases/querier.rs | 84 ++++++++++- ingester/src/query.rs | 28 ++-- iox_query/src/frontend/influxrpc.rs | 68 +++++---- iox_query/src/lib.rs | 18 +-- iox_query/src/test.rs | 26 ++-- querier/src/chunk/query_access.rs | 34 +++-- querier/src/ingester/mod.rs | 15 +- querier/src/namespace/query_access.rs | 7 +- querier/src/table/mod.rs | 12 ++ querier/src/table/query_access/mod.rs | 3 +- querier/src/table/state_reconciler.rs | 1 + service_common/src/planner.rs | 14 +- service_grpc_flight/src/lib.rs | 52 ++++--- service_grpc_influxrpc/src/service.rs | 142 +++++++++--------- test_helpers_end_to_end/src/config.rs | 5 + 16 files changed, 326 insertions(+), 195 deletions(-) diff --git a/compactor/src/query.rs b/compactor/src/query.rs index ea6e219d4e..20a8d068cc 100644 --- a/compactor/src/query.rs +++ b/compactor/src/query.rs @@ -4,10 +4,10 @@ use data_types::{ ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber, TableSummary, Timestamp, TimestampMinMax, Tombstone, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::trace; use parquet_file::chunk::ParquetChunk; @@ -194,7 +194,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -208,7 +208,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -230,7 +230,7 @@ impl QueryChunk for QueryableParquetChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) 
-> Result { + ) -> Result { ctx.set_metadata("storage", "compactor"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); @@ -238,7 +238,7 @@ impl QueryChunk for QueryableParquetChunk { self.data .read_filter(predicate, selection) .context(ReadParquetSnafu) - .map_err(|e| Box::new(e) as _) + .map_err(|e| DataFusionError::External(Box::new(e))) } /// Returns chunk type diff --git a/influxdb_iox/tests/end_to_end_cases/querier.rs b/influxdb_iox/tests/end_to_end_cases/querier.rs index d5f1cfbe0e..b189098a64 100644 --- a/influxdb_iox/tests/end_to_end_cases/querier.rs +++ b/influxdb_iox/tests/end_to_end_cases/querier.rs @@ -7,7 +7,8 @@ use futures::FutureExt; use predicates::prelude::*; use test_helpers::assert_contains; use test_helpers_end_to_end::{ - maybe_skip_integration, run_query, MiniCluster, Step, StepTest, StepTestState, TestConfig, + maybe_skip_integration, run_query, try_run_query, GrpcRequestBuilder, MiniCluster, Step, + StepTest, StepTestState, TestConfig, }; #[tokio::test] @@ -454,6 +455,87 @@ async fn issue_4631_b() { .await } +#[tokio::test] +async fn oom_protection() { + test_helpers::maybe_start_logging(); + let database_url = maybe_skip_integration!(); + + let table_name = "the_table"; + + // Set up the cluster ==================================== + let router_config = TestConfig::new_router(&database_url); + let ingester_config = TestConfig::new_ingester(&router_config); + let querier_config = + TestConfig::new_querier(&ingester_config).with_querier_max_table_query_bytes(1); + let mut cluster = MiniCluster::new() + .with_router(router_config) + .await + .with_ingester(ingester_config) + .await + .with_querier(querier_config) + .await; + + StepTest::new( + &mut cluster, + vec![ + Step::WriteLineProtocol(format!("{},tag1=A,tag2=B val=42i 123457", table_name)), + Step::WaitForReadable, + Step::AssertNotPersisted, + // SQL query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let sql = 
format!("select * from {}", table_name); + let err = try_run_query( + sql, + state.cluster().namespace(), + state.cluster().querier().querier_grpc_connection(), + ) + .await + .unwrap_err(); + + if let influxdb_iox_client::flight::Error::GrpcError(status) = err { + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } else { + panic!("Not a gRPC error: {err}"); + } + } + .boxed() + })), + // InfluxRPC/storage query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let mut storage_client = state.cluster().querier_storage_client(); + + let read_filter_request = GrpcRequestBuilder::new() + .source(state.cluster()) + .build_read_filter(); + + let status = storage_client + .read_filter(read_filter_request) + .await + .unwrap_err(); + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } + .boxed() + })), + ], + ) + .run() + .await +} + /// This structure holds information for tests that need to force a parquet file to be persisted struct ForcePersistenceSetup { // Set up a cluster that will will persist quickly diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 747ff4666c..1829ecf4ae 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -8,14 +8,17 @@ use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary, TimestampMinMax, Tombstone, }; -use datafusion::physical_plan::{ - common::SizedRecordBatchStream, - metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, - SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + common::SizedRecordBatchStream, + metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, + SendableRecordBatchStream, + }, }; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, 
QueryChunkMeta, }; use observability_deps::tracing::trace; use predicate::{ @@ -185,7 +188,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -199,7 +202,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -210,12 +213,16 @@ impl QueryChunk for QueryableBatch { mut ctx: IOxSessionContext, _predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { ctx.set_metadata("storage", "ingester"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); - let schema = self.schema().select(selection).context(SchemaSnafu)?; + let schema = self + .schema() + .select(selection) + .context(SchemaSnafu) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Get all record batches from their snapshots let batches = self @@ -234,7 +241,8 @@ impl QueryChunk for QueryableBatch { .map(Arc::new); Some(batch) }) - .collect::, _>>()?; + .collect::, _>>() + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Return stream of data let dummy_metrics = ExecutionPlanMetricsSet::new(); diff --git a/iox_query/src/frontend/influxrpc.rs b/iox_query/src/frontend/influxrpc.rs index 0940aff71b..1018eed91e 100644 --- a/iox_query/src/frontend/influxrpc.rs +++ b/iox_query/src/frontend/influxrpc.rs @@ -37,33 +37,11 @@ const CONCURRENT_TABLE_JOBS: usize = 10; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("gRPC planner got error making table_name plan for chunk: {}", source))] - TableNamePlan { - source: Box, - }, - - #[snafu(display("gRPC planner got error listing partition keys: {}", source))] - ListingPartitions { - source: Box, - }, - #[snafu(display("gRPC planner got error finding column names: {}", source))] - FindingColumnNames { - 
source: Box, - }, + FindingColumnNames { source: DataFusionError }, #[snafu(display("gRPC planner got error finding column values: {}", source))] - FindingColumnValues { - source: Box, - }, - - #[snafu(display( - "gRPC planner got internal error making table_name with default predicate: {}", - source - ))] - InternalTableNamePlanForDefault { - source: Box, - }, + FindingColumnValues { source: DataFusionError }, #[snafu(display( "gRPC planner got error fetching chunks for table '{}': {}", @@ -72,7 +50,7 @@ pub enum Error { ))] GettingChunks { table_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -82,21 +60,17 @@ pub enum Error { ))] CheckingChunkPredicate { chunk_id: ChunkId, - source: Box, + source: DataFusionError, }, #[snafu(display("gRPC planner got error creating string set plan: {}", source))] CreatingStringSet { source: StringSetError }, #[snafu(display("gRPC planner got error creating predicates: {}", source))] - CreatingPredicates { - source: datafusion::error::DataFusionError, - }, + CreatingPredicates { source: DataFusionError }, #[snafu(display("gRPC planner got error building plan: {}", source))] - BuildingPlan { - source: datafusion::error::DataFusionError, - }, + BuildingPlan { source: DataFusionError }, #[snafu(display( "gRPC planner error: column '{}' is not a tag, it is {:?}", @@ -148,7 +122,7 @@ pub enum Error { CastingAggregates { agg: Aggregate, field_name: String, - source: datafusion::error::DataFusionError, + source: DataFusionError, }, #[snafu(display("Internal error: unexpected aggregate request for None aggregate",))] @@ -163,6 +137,34 @@ pub enum Error { pub type Result = std::result::Result; +impl Error { + pub fn to_df_error(self, method: &'static str) -> DataFusionError { + let msg = self.to_string(); + + match self { + Self::GettingChunks { source, .. } + | Self::CreatingPredicates { source, .. } + | Self::BuildingPlan { source, .. } + | Self::CheckingChunkPredicate { source, .. 
} + | Self::FindingColumnNames { source, .. } + | Self::FindingColumnValues { source, .. } + | Self::CastingAggregates { source, .. } => { + DataFusionError::Context(format!("{method}: {msg}"), Box::new(source)) + } + e @ (Self::CreatingStringSet { .. } + | Self::TableRemoved { .. } + | Self::InvalidTagColumn { .. } + | Self::InternalInvalidTagType { .. } + | Self::DuplicateGroupColumn { .. } + | Self::GroupColumnNotFound { .. } + | Self::CreatingAggregates { .. } + | Self::CreatingScan { .. } + | Self::InternalUnexpectedNoneAggregate {} + | Self::InternalAggregateNotSelector { .. }) => DataFusionError::External(Box::new(e)), + } + } +} + impl From for Error { fn from(source: super::common::Error) -> Self { Self::CreatingScan { source } diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index a0bd37a68b..b9d09544a3 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -14,7 +14,7 @@ use async_trait::async_trait; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use exec::{stringset::StringSet, IOxSessionContext}; use hashbrown::HashMap; use observability_deps::tracing::{debug, trace}; @@ -141,9 +141,6 @@ impl Drop for QueryCompletedToken { /// This avoids storing potentially large strings pub type QueryText = Box; -/// Error type for [`QueryDatabase`] operations. -pub type QueryDatabaseError = Box; - /// A `Database` is the main trait implemented by the IOx subsystems /// that store actual data. 
/// @@ -159,7 +156,7 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { table_name: &str, predicate: &Predicate, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError>; + ) -> Result>, DataFusionError>; /// Record that particular type of query was run / planned fn record_query( @@ -175,9 +172,6 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { fn as_meta(&self) -> &dyn QueryDatabaseMeta; } -/// Error type for [`QueryChunk`] operations. -pub type QueryChunkError = Box; - /// Collection of data that shares the same partition key pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { /// returns the Id of this chunk. Ids are unique within a @@ -200,7 +194,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { Ok(self .summary() .map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow())) @@ -216,7 +210,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Return a set of Strings containing the distinct values in the /// specified columns. If the predicate can be evaluated entirely @@ -228,7 +222,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Provides access to raw `QueryChunk` data as an /// asynchronous stream of `RecordBatch`es filtered by a *required* @@ -248,7 +242,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result; + ) -> Result; /// Returns chunk type. Useful in tests and debug logs. 
fn chunk_type(&self) -> &str; diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index dee2d1120b..256cd069ea 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -8,8 +8,8 @@ use crate::{ stringset::{StringSet, StringSetRef}, ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext, }, - Predicate, PredicateMatch, QueryChunk, QueryChunkError, QueryChunkMeta, QueryCompletedToken, - QueryDatabase, QueryDatabaseError, QueryText, + Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase, + QueryText, }; use arrow::{ array::{ @@ -24,7 +24,7 @@ use data_types::{ ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues, Statistics, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use datafusion_util::stream_from_batches; use futures::StreamExt; use hashbrown::HashSet; @@ -109,7 +109,7 @@ impl QueryDatabase for TestDatabase { table_name: &str, predicate: &Predicate, _ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { // save last predicate *self.chunks_predicate.lock() = predicate.clone(); @@ -327,9 +327,9 @@ impl TestChunk { } /// Checks the saved error, and returns it if any, otherwise returns OK - fn check_error(&self) -> Result<(), QueryChunkError> { + fn check_error(&self) -> Result<(), DataFusionError> { if let Some(message) = self.saved_error.as_ref() { - Err(message.clone().into()) + Err(DataFusionError::External(message.clone().into())) } else { Ok(()) } @@ -921,13 +921,17 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate self.predicates.lock().push(predicate.clone()); - let batches = match self.schema.df_projection(selection)? 
{ + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? + { None => self.table_data.clone(), Some(projection) => self .table_data @@ -948,7 +952,7 @@ impl QueryChunk for TestChunk { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate @@ -967,7 +971,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // Model not being able to get column values from metadata Ok(None) } @@ -977,7 +981,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { self.check_error()?; // save the predicate diff --git a/querier/src/chunk/query_access.rs b/querier/src/chunk/query_access.rs index 0edf477ec7..dc94a55b69 100644 --- a/querier/src/chunk/query_access.rs +++ b/querier/src/chunk/query_access.rs @@ -7,13 +7,16 @@ use arrow::{ use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::{ - stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, + }, }; use futures::{Stream, TryStreamExt}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::debug; use predicate::Predicate; @@ -114,7 +117,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("projection", format!("{}", columns)); 
ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -161,10 +164,10 @@ impl QueryChunk for QuerierChunk { None } Err(other) => { - return Err(Box::new(Error::RBChunk { + return Err(DataFusionError::External(Box::new(Error::RBChunk { source: other, chunk_id: self.id(), - })) + }))) } }; @@ -178,7 +181,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("column_name", column_name.to_string()); ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -205,11 +208,13 @@ impl QueryChunk for QuerierChunk { }; ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate)); - let mut values = rb_chunk.column_values( - rb_predicate, - Selection::Some(&[column_name]), - BTreeMap::new(), - )?; + let mut values = rb_chunk + .column_values( + rb_predicate, + Selection::Some(&[column_name]), + BTreeMap::new(), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // The InfluxRPC frontend only supports getting column values // for one column at a time (this is a restriction on the Influx @@ -221,7 +226,8 @@ impl QueryChunk for QuerierChunk { .context(ColumnNameNotFoundSnafu { chunk_id: self.id(), column_name, - })?; + }) + .map_err(|e| DataFusionError::External(Box::new(e)))?; ctx.set_metadata("output_values", values.len() as i64); Ok(Some(values)) @@ -234,7 +240,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { let span_recorder = SpanRecorder::new( ctx.span() .map(|span| span.child("QuerierChunk::read_filter")), diff --git a/querier/src/ingester/mod.rs b/querier/src/ingester/mod.rs index aac2635c29..86946d2c54 100644 --- a/querier/src/ingester/mod.rs +++ b/querier/src/ingester/mod.rs @@ -11,6 +11,7 @@ use data_types::{ ChunkId, ChunkOrder, IngesterMapping, PartitionId, SequenceNumber, ShardId, ShardIndex, TableSummary, 
TimestampMinMax, }; +use datafusion::error::DataFusionError; use datafusion_util::MemoryStream; use futures::{stream::FuturesUnordered, TryStreamExt}; use generated_types::{ @@ -24,7 +25,7 @@ use influxdb_iox_client::flight::{ use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, util::compute_timenanosecond_min_max, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use iox_time::{Time, TimeProvider}; use metric::{DurationHistogram, Metric}; @@ -1097,7 +1098,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1107,7 +1108,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1117,11 +1118,15 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data"); // Apply selection to in-memory batch - let batches = match self.schema.df_projection(selection)? { + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? 
+ { None => self.batches.clone(), Some(projection) => self .batches diff --git a/querier/src/namespace/query_access.rs b/querier/src/namespace/query_access.rs index b7451000b3..e7a3856554 100644 --- a/querier/src/namespace/query_access.rs +++ b/querier/src/namespace/query_access.rs @@ -11,10 +11,11 @@ use data_types::NamespaceId; use datafusion::{ catalog::{catalog::CatalogProvider, schema::SchemaProvider}, datasource::TableProvider, + error::DataFusionError, }; use iox_query::{ exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext}, - QueryChunk, QueryCompletedToken, QueryDatabase, QueryDatabaseError, QueryText, DEFAULT_SCHEMA, + QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA, }; use observability_deps::tracing::{debug, trace}; use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; @@ -41,7 +42,7 @@ impl QueryDatabase for QuerierNamespace { table_name: &str, predicate: &Predicate, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { debug!(%table_name, %predicate, "Finding chunks for table"); // get table metadata let table = match self.tables.get(table_name).map(Arc::clone) { @@ -627,7 +628,7 @@ mod tests { .unwrap_err(); assert_eq!( err.to_string(), - format!("Cannot build plan: External error: Chunk pruning failed: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."), + format!("Cannot build plan: Resources exhausted: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. 
Try adjusting your compactor settings or increasing the per query memory limit."), ); } diff --git a/querier/src/table/mod.rs b/querier/src/table/mod.rs index 19835fde6f..767fa6c83a 100644 --- a/querier/src/table/mod.rs +++ b/querier/src/table/mod.rs @@ -8,6 +8,7 @@ use crate::{ IngesterConnection, }; use data_types::{ColumnId, PartitionId, ShardIndex, TableId, TimestampMinMax}; +use datafusion::error::DataFusionError; use futures::{join, StreamExt}; use iox_query::pruning::prune_summaries; use iox_query::{exec::Executor, provider, provider::ChunkPruner, QueryChunk}; @@ -65,6 +66,17 @@ pub enum Error { pub type Result = std::result::Result; +impl From for DataFusionError { + fn from(err: Error) -> Self { + match err { + Error::ChunkPruning { + source: err @ provider::Error::TooMuchData { .. }, + } => Self::ResourcesExhausted(err.to_string()), + _ => Self::External(Box::new(err) as _), + } + } +} + /// Args to create a [`QuerierTable`]. pub struct QuerierTableArgs { pub sharder: Arc>>, diff --git a/querier/src/table/query_access/mod.rs b/querier/src/table/query_access/mod.rs index 5665f79171..e16830577b 100644 --- a/querier/src/table/query_access/mod.rs +++ b/querier/src/table/query_access/mod.rs @@ -66,8 +66,7 @@ impl TableProvider for QuerierTable { ctx.child_span("querier table chunks"), projection, ) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; + .await?; for chunk in chunks { builder = builder.add_chunk(chunk); diff --git a/querier/src/table/state_reconciler.rs b/querier/src/table/state_reconciler.rs index baa2935911..d5fe4cced6 100644 --- a/querier/src/table/state_reconciler.rs +++ b/querier/src/table/state_reconciler.rs @@ -23,6 +23,7 @@ use crate::{ use self::interface::{IngesterPartitionInfo, ParquetFileInfo, TombstoneInfo}; #[derive(Snafu, Debug)] +#[allow(missing_copy_implementations)] pub enum ReconcileError { #[snafu(display("Compactor processed file that the querier would need to split apart which is not yet implemented"))] 
CompactorConflict, diff --git a/service_common/src/planner.rs b/service_common/src/planner.rs index 6431963aad..e1bc5adf71 100644 --- a/service_common/src/planner.rs +++ b/service_common/src/planner.rs @@ -60,7 +60,7 @@ impl Planner { planner .table_names(database, predicate) .await - .map_err(|e| Error::Plan(format!("table_names error: {}", e))) + .map_err(|e| e.to_df_error("table_names")) }) .await } @@ -82,7 +82,7 @@ impl Planner { planner .tag_keys(database, predicate) .await - .map_err(|e| Error::Plan(format!("tag_keys error: {}", e))) + .map_err(|e| e.to_df_error("tag_keys")) }) .await } @@ -106,7 +106,7 @@ impl Planner { planner .tag_values(database, &tag_name, predicate) .await - .map_err(|e| Error::Plan(format!("tag_values error: {}", e))) + .map_err(|e| e.to_df_error("tag_values")) }) .await } @@ -128,7 +128,7 @@ impl Planner { planner .field_columns(database, predicate) .await - .map_err(|e| Error::Plan(format!("field_columns error: {}", e))) + .map_err(|e| e.to_df_error("field_columns")) }) .await } @@ -150,7 +150,7 @@ impl Planner { planner .read_filter(database, predicate) .await - .map_err(|e| Error::Plan(format!("read_filter error: {}", e))) + .map_err(|e| e.to_df_error("read_filter")) }) .await } @@ -174,7 +174,7 @@ impl Planner { planner .read_group(database, predicate, agg, &group_columns) .await - .map_err(|e| Error::Plan(format!("read_group error: {}", e))) + .map_err(|e| e.to_df_error("read_group")) }) .await } @@ -199,7 +199,7 @@ impl Planner { planner .read_window_aggregate(database, predicate, agg, every, offset) .await - .map_err(|e| Error::Plan(format!("read_window_aggregate error: {}", e))) + .map_err(|e| e.to_df_error("read_window_aggregate")) }) .await } diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index f88ce0d184..f4d84266e6 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -9,7 +9,7 @@ use arrow_flight::{ use arrow_util::optimize::{optimize_record_batch, 
optimize_schema}; use bytes::{Bytes, BytesMut}; use data_types::{DatabaseName, DatabaseNameError}; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan}; use futures::{SinkExt, Stream, StreamExt}; use generated_types::influxdata::iox::querier::v1 as proto; use iox_query::{ @@ -54,7 +54,7 @@ pub enum Error { ))] Query { database_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Invalid database name: {}", source))] @@ -91,29 +91,40 @@ impl From for tonic::Status { Error::Optimize { .. } | Error::Planning { .. } | Error::Serialization { .. } => warn!(?err, msg), } - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - use tonic::Status; - match &self { - Self::InvalidTicket { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidTicketLegacy { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidQuery { .. } => Status::invalid_argument(self.to_string()), - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::Query { .. } => Status::internal(self.to_string()), - Self::InvalidDatabaseName { .. } => Status::invalid_argument(self.to_string()), - Self::Planning { - source: service_common::planner::Error::External(_), - } => Status::internal(self.to_string()), - Self::Planning { .. } => Status::invalid_argument(self.to_string()), - Self::Optimize { .. } => Status::internal(self.to_string()), - Self::Serialization { .. } => Status::internal(self.to_string()), - } + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::InvalidTicket { .. } + | Self::InvalidTicketLegacy { .. } + | Self::InvalidQuery { .. } + | Self::InvalidDatabaseName { .. 
} => tonic::Code::InvalidArgument, + Self::Planning { source, .. } | Self::Query { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } + } + Self::Optimize { .. } | Self::Serialization { .. } => tonic::Code::Internal, + }; + + tonic::Status::new(code, msg) } } @@ -334,7 +345,6 @@ impl GetStream { let mut stream_record_batches = ctx .execute_stream(Arc::clone(&physical_plan)) .await - .map_err(|e| Box::new(e) as _) .context(QuerySnafu { database_name: &database_name, })?; @@ -382,7 +392,7 @@ impl GetStream { // failure sending here is OK because we're cutting the stream anyways tx.send(Err(Error::Query { database_name: database_name.clone(), - source: Box::new(e), + source: DataFusionError::ArrowError(e), } .into())) .await diff --git a/service_grpc_influxrpc/src/service.rs b/service_grpc_influxrpc/src/service.rs index f8d54e2d05..8ad7bbbbfb 100644 --- a/service_grpc_influxrpc/src/service.rs +++ b/service_grpc_influxrpc/src/service.rs @@ -12,6 +12,7 @@ use crate::{ StorageService, }; use data_types::{org_and_bucket_to_database, DatabaseName}; +use datafusion::error::DataFusionError; use futures::Stream; use generated_types::{ google::protobuf::Empty, literal_or_regex::Value as RegexOrLiteralValue, @@ -54,43 +55,43 @@ pub enum Error { #[snafu(display("Error listing tables in database '{}': {}", db_name, source))] ListingTables { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing columns in database '{}': {}", db_name, source))] ListingColumns { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing fields in database '{}': {}", 
db_name, source))] ListingFields { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating series plans for database '{}': {}", db_name, source))] PlanningFilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating group plans for database '{}': {}", db_name, source))] PlanningGroupSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running series plans for database '{}': {}", db_name, source))] FilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running grouping plans for database '{}': {}", db_name, source))] GroupingSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -102,7 +103,7 @@ pub enum Error { ListingTagValues { db_name: String, tag_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error converting Predicate '{}: {}", rpc_predicate_string, source))] @@ -177,44 +178,56 @@ impl From for tonic::Status { /// status fn from(err: Error) -> Self { error!("Error handling gRPC request: {}", err); - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - match &self { - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::ListingTables { .. } => Status::internal(self.to_string()), - Self::ListingColumns { .. } => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::ListingTables { source, .. } + | Self::ListingColumns { source, .. } + | Self::ListingFields { source, .. } + | Self::PlanningFilteringSeries { source, .. 
} + | Self::PlanningGroupSeries { source, .. } + | Self::FilteringSeries { source, .. } + | Self::GroupingSeries { source, .. } + | Self::ListingTagValues { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } } - Self::ListingFields { .. } => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + Self::ConvertingPredicate { .. } + | Self::ConvertingReadGroupAggregate { .. } + | Self::ConvertingReadGroupType { .. } + | Self::ConvertingWindowAggregate { .. } + | Self::ConvertingTagKeyInTagValues { .. } + | Self::ComputingGroupedSeriesSet { .. } + | Self::ConvertingFieldList { .. } + | Self::MeasurementLiteralOrRegex { .. } + | Self::MissingTagKeyPredicate {} + | Self::InvalidTagKeyRegex { .. } => tonic::Code::InvalidArgument, + Self::SendingResults { .. } | Self::InternalHintsFieldNotSupported { .. } => { + tonic::Code::Internal } - Self::PlanningFilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::PlanningGroupSeries { .. } => Status::invalid_argument(self.to_string()), - Self::FilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::GroupingSeries { .. } => Status::invalid_argument(self.to_string()), - Self::ListingTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingPredicate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupAggregate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupType { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingWindowAggregate { .. 
} => Status::invalid_argument(self.to_string()), - Self::ConvertingTagKeyInTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ComputingGroupedSeriesSet { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingFieldList { .. } => Status::invalid_argument(self.to_string()), - Self::SendingResults { .. } => Status::internal(self.to_string()), - Self::InternalHintsFieldNotSupported { .. } => Status::internal(self.to_string()), - Self::NotYetImplemented { .. } => Status::internal(self.to_string()), - Self::MeasurementLiteralOrRegex { .. } => Status::invalid_argument(self.to_string()), - Self::MissingTagKeyPredicate {} => Status::invalid_argument(self.to_string()), - Self::InvalidTagKeyRegex { .. } => Status::invalid_argument(self.to_string()), - } + Self::NotYetImplemented { .. } => tonic::Code::Unimplemented, + }; + + tonic::Status::new(code, msg) } } @@ -341,7 +354,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -423,7 +436,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -489,7 +502,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -560,7 +573,7 @@ where operation: "tag_value for a measurement, with general predicate" .to_string(), } - .to_status()); + .into_status()); } measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx).await @@ -593,7 +606,7 @@ where } }; - let response = response.map_err(|e| e.to_status()); + let response = response.map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -652,7 +665,7 @@ where let results = tag_values_grouped_by_measurement_and_tag_key_impl(Arc::clone(&db), db_name, req, &ctx) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? 
.into_iter() .map(Ok) .collect::>(); @@ -762,7 +775,7 @@ where let response = measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -833,7 +846,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -907,7 +920,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -981,9 +994,9 @@ where .map(|fieldlist| { fieldlist_to_measurement_fields_response(fieldlist) .context(ConvertingFieldListSnafu) - .map_err(|e| e.to_status()) + .map_err(|e| e.into_status()) }) - .map_err(|e| e.to_status())?; + .map_err(|e| e.into_status())?; if response.is_ok() { query_completed_token.set_success(); @@ -1048,13 +1061,11 @@ where let plan = Planner::new(ctx) .table_names(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTablesSnafu { db_name })?; let table_names = ctx .to_string_set(plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingTablesSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1095,13 +1106,11 @@ where let tag_key_plan = Planner::new(ctx) .tag_keys(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; let tag_keys = ctx .to_string_set(tag_key_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1142,13 +1151,11 @@ where let tag_value_plan = Planner::new(ctx) .tag_values(db, tag_name, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, tag_name })?; let tag_values = ctx .to_string_set(tag_value_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, 
tag_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1266,14 +1273,12 @@ where let series_plan = Planner::new(ctx) .read_filter(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(PlanningFilteringSeriesSnafu { db_name })?; // Execute the plans. let series_or_groups = ctx .to_series_and_groups(series_plan) .await - .map_err(|e| Box::new(e) as _) .context(FilteringSeriesSnafu { db_name }) .log_if_error("Running series set plan")?; @@ -1319,9 +1324,8 @@ where .await } }; - let grouped_series_set_plan = grouped_series_set_plan - .map_err(|e| Box::new(e) as _) - .context(PlanningGroupSeriesSnafu { db_name })?; + let grouped_series_set_plan = + grouped_series_set_plan.context(PlanningGroupSeriesSnafu { db_name })?; // PERF - This used to send responses to the client before execution had // completed, but now it doesn't. We may need to revisit this in the future @@ -1331,7 +1335,6 @@ where let series_or_groups = ctx .to_series_and_groups(grouped_series_set_plan) .await - .map_err(|e| Box::new(e) as _) .context(GroupingSeriesSnafu { db_name }) .log_if_error("Running Grouped SeriesSet Plan")?; @@ -1370,13 +1373,11 @@ where let field_list_plan = Planner::new(ctx) .field_columns(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; let field_list = ctx .to_field_list(field_list_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; trace!(field_names=?field_list, "Field names response"); @@ -1878,7 +1879,7 @@ mod tests { let response = fixture.storage_client.tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "TagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for measurement_tag_keys-- @@ -1984,7 +1985,7 @@ mod tests { let response = 
fixture.storage_client.measurement_tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "This is an error"); - grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for tag_values -- specifically that @@ -2173,7 +2174,8 @@ mod tests { "Error converting tag_key to UTF-8 in tag_values request" ); - grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 2); + grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1); } #[tokio::test] @@ -2524,7 +2526,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "server_error", 1); } #[tokio::test] @@ -2730,7 +2732,7 @@ mod tests { let response = fixture.storage_client.read_filter(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadFilter", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadFilter", "server_error", 1); } #[tokio::test] @@ -2822,7 +2824,7 @@ mod tests { .to_string(); assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadGroup", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadGroup", "server_error", 1); } #[tokio::test] @@ -2988,7 +2990,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "server_error", 1); } #[tokio::test] diff --git a/test_helpers_end_to_end/src/config.rs b/test_helpers_end_to_end/src/config.rs 
index b3dc091a93..d4597e8584 100644 --- a/test_helpers_end_to_end/src/config.rs +++ b/test_helpers_end_to_end/src/config.rs @@ -290,6 +290,11 @@ impl TestConfig { self.with_env("INFLUXDB_IOX_FLIGHT_DO_GET_PANIC", times.to_string()) } + /// Configure maximum per-table query bytes for the querier. + pub fn with_querier_max_table_query_bytes(self, bytes: usize) -> Self { + self.with_env("INFLUXDB_IOX_MAX_TABLE_QUERY_BYTES", bytes.to_string()) + } + /// Changes the log to JSON for easier parsing. pub fn with_json_logs(self) -> Self { self.with_env("LOG_FORMAT", "json") From c022ab6786b95439c4ac499d6ffac941d3ed080e Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 3 Oct 2022 15:19:14 +0200 Subject: [PATCH 18/40] feat: deferred partition sort key fetcher Adds a new DeferredSortKey type that fetches a partition's sort key from the catalog in the background, or on-demand if not yet pre-fetched. From the caller's perspective, little has changed compared to reading it from the catalog directly - the sort key is always returned when calling get(), regardless of the mechanism, and retries are handled transparently. Internally the sort key MAY have been pre-fetched in the background between the DeferredSortKey being initialised, and the call to get(). The background task waits a (uniformly) random duration of time before issuing the catalog query to pre-fetch the sort key. This allows large numbers of DeferredSortKey to (randomly) smear the lookup queries over a large duration of time. This allows a large number of DeferredSortKey to be initialised in a short period of time, without creating an equally large spike in queries against the catalog in the same time period. 
--- Cargo.lock | 1 + ingester/Cargo.toml | 1 + ingester/src/data/partition/resolver/mod.rs | 3 + .../src/data/partition/resolver/sort_key.rs | 317 ++++++++++++++++++ 4 files changed, 322 insertions(+) create mode 100644 ingester/src/data/partition/resolver/sort_key.rs diff --git a/Cargo.lock b/Cargo.lock index 488aae235c..cdb9a2bd37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2206,6 +2206,7 @@ dependencies = [ "pin-project", "predicate", "prost 0.11.0", + "rand", "schema", "snafu", "test_helpers", diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index beb94c37e9..51e01c3def 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -45,6 +45,7 @@ write_buffer = { path = "../write_buffer" } write_summary = { path = "../write_summary" } tokio-util = { version = "0.7.4" } trace = { path = "../trace" } +rand = "0.8.5" [dev-dependencies] assert_matches = "1.5.0" diff --git a/ingester/src/data/partition/resolver/mod.rs b/ingester/src/data/partition/resolver/mod.rs index fcb5e5fb6a..904eb781f5 100644 --- a/ingester/src/data/partition/resolver/mod.rs +++ b/ingester/src/data/partition/resolver/mod.rs @@ -11,6 +11,9 @@ pub use r#trait::*; mod catalog; pub use catalog::*; +mod sort_key; +pub(crate) use sort_key::*; + #[cfg(test)] mod mock; #[cfg(test)] diff --git a/ingester/src/data/partition/resolver/sort_key.rs b/ingester/src/data/partition/resolver/sort_key.rs new file mode 100644 index 0000000000..c0c5555963 --- /dev/null +++ b/ingester/src/data/partition/resolver/sort_key.rs @@ -0,0 +1,317 @@ +//! A optimised resolver of a partition [`SortKey`]. + +use std::{sync::Arc, time::Duration}; + +use backoff::{Backoff, BackoffConfig}; +use data_types::PartitionId; +use iox_catalog::interface::Catalog; +use parking_lot::Mutex; +use rand::Rng; +use schema::sort::SortKey; +use tokio::task::JoinHandle; + +/// The states of a [`DeferredSortKey`] instance. +#[derive(Debug)] +enum State { + /// The value has not yet been fetched by the background task. 
+ Unresolved, + /// The value was fetched by the background task and is ready to be consumed. + Resolved(Option), +} + +/// A resolver of [`SortKey`] from the catalog for a given partition. +/// +/// This implementation combines lazy / deferred loading of the [`SortKey`] from +/// the [`Catalog`], and a background timer that pre-fetches the [`SortKey`] +/// after some random duration of time. Combined, these behaviours smear the +/// [`SortKey`] queries across the allowable time range, avoiding a large number +/// of queries from executing when multiple [`SortKey`] are needed in the system +/// at one point in time. +/// +/// If the [`DeferredSortKey`] is dropped and the background task is still +/// incomplete (sleeping / actively fetching the [`SortKey`]) it is aborted +/// immediately. The background task exits once it has successfully fetched the +/// [`SortKey`]. +/// +/// # Stale Cached Values +/// +/// This is effectively a cache that is pre-warmed in the background - this +/// necessitates that the caller can tolerate, or determine, stale values. +#[derive(Debug)] +pub(crate) struct DeferredSortKey { + value: Arc>, + partition_id: PartitionId, + + handle: JoinHandle<()>, + + backoff_config: BackoffConfig, + catalog: Arc, +} + +impl DeferredSortKey { + /// Construct a [`DeferredSortKey`] instance that fetches the [`SortKey`] + /// for the specified `partition_id`. + /// + /// The background task will wait a uniformly random duration of time + /// between `[0, max_smear)` before attempting to pre-fetch the [`SortKey`] + /// from `catalog`. + pub(crate) fn new( + partition_id: PartitionId, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self { + // Init the value container the background thread populates. + let value = Arc::new(Mutex::new(State::Unresolved)); + + // Select random duration from a uniform distribution, up to the + // configured maximum.
+ let wait_for = rand::thread_rng().gen_range(Duration::ZERO..max_smear); + + // Spawn the background task, sleeping for the random duration of time + // before fetching the sort key. + let handle = tokio::spawn({ + let value = Arc::clone(&value); + let catalog = Arc::clone(&catalog); + let backoff_config = backoff_config.clone(); + async move { + // Sleep for the random duration + tokio::time::sleep(wait_for).await; + // Fetch the sort key from the catalog + let v = fetch(partition_id, &*catalog, &backoff_config).await; + // And attempt to + let mut state = value.lock(); + *state = match *state { + State::Unresolved => State::Resolved(v), + State::Resolved(_) => return, + }; + } + }); + + Self { + value, + partition_id, + handle, + backoff_config, + catalog, + } + } + + /// Read the [`SortKey`] for the partition. + /// + /// If the [`SortKey`] was pre-fetched in the background, it is returned + /// immediately. If the [`SortKey`] has not yet been resolved, this call + /// blocks while it is read from the [`Catalog`]. + pub(crate) async fn get(&self) -> Option { + { + let state = self.value.lock(); + + // If there is a resolved value, return it. + if let State::Resolved(v) = &*state { + return v.clone(); + } + } + + // Otherwise resolve the value immediately, aborting the background + // task. + let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await; + + { + let mut state = self.value.lock(); + self.handle.abort(); + *state = State::Resolved(sort_key.clone()); + } + + sort_key + } +} + +impl Drop for DeferredSortKey { + fn drop(&mut self) { + // Attempt to abort the background task, regardless of it having + // completed or not. + self.handle.abort() + } +} + +/// Fetch the [`SortKey`] from the [`Catalog`] for `partition_id`, retrying +/// endlessly when errors occur. 
+async fn fetch( + partition_id: PartitionId, + catalog: &dyn Catalog, + backoff_config: &BackoffConfig, +) -> Option { + Backoff::new(backoff_config) + .retry_all_errors("fetch partition sort key", || async { + let s = catalog + .repositories() + .await + .partitions() + .get_by_id(partition_id) + .await? + .expect("resolving sort key for non-existent partition") + .sort_key(); + + Result::<_, iox_catalog::interface::Error>::Ok(s) + }) + .await + .expect("retry forever") +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::ShardIndex; + use test_helpers::timeout::FutureTimeout; + + use crate::test_util::populate_catalog; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + + // A test that (most likely) exercises the "read on demand" code path. + // + // The background task is configured to run some time between now, and + // 10,000 hours in the future - it most likely doesn't get to complete + // before the get() call is issued. + // + // If this test flakes, it is POSSIBLE but UNLIKELY that the background task + // has completed and the get() call reads a pre-fetched value. 
+ #[tokio::test] + async fn test_read_demand() { + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetched = DeferredSortKey::new( + partition_id, + Duration::from_secs(36_000_000), + Arc::clone(&catalog), + backoff_config.clone(), + ) + .get() + .await; + assert!(fetched.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetched = DeferredSortKey::new( + partition_id, + Duration::from_secs(10_000), + Arc::clone(&catalog), + backoff_config, + ) + .get() + .await; + + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } + + // A test that deterministically exercises the "background pre-fetch" code path. 
+ #[tokio::test] + async fn test_read_pre_fetched() { + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. + async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + assert!(fetcher.get().await.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. 
+ async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + let fetched = fetcher.get().await; + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } +} From afcb96ae47a86572b1ce833ac3be70f3c596cb13 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Tue, 4 Oct 2022 14:40:12 +0200 Subject: [PATCH 19/40] perf(ingester): deferred sort key lookup queries This commit carries the SortKey in the PartitionData, and configures the ingester to use deferred sort key lookups, smearing the lookups across a fixed period of time after initialising the PartitionData, instead of querying for the sort key at persist time. This allows large numbers of PartitionData to be initialised without causing a equally large spike in catalog load to resolve the sort key - instead this load is spread out randomly to reduce peak query rps. --- ingester/src/data/namespace.rs | 3 +- ingester/src/data/partition.rs | 53 +++++++++++- ingester/src/data/partition/resolver/cache.rs | 83 ++++++++++++++++--- .../src/data/partition/resolver/catalog.rs | 4 +- ingester/src/data/partition/resolver/trait.rs | 3 +- ingester/src/data/shard.rs | 3 +- ingester/src/data/table.rs | 4 +- ingester/src/handler.rs | 15 +++- iox_catalog/src/interface.rs | 5 +- 9 files changed, 153 insertions(+), 20 deletions(-) diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 418b38c6db..4b67e9642c 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -430,7 +430,7 @@ mod tests { use metric::{Attributes, Metric}; use crate::{ - data::partition::{resolver::MockPartitionProvider, PartitionData}, + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, lifecycle::mock_handle::MockLifecycleHandle, test_util::{make_write_op, populate_catalog}, }; @@ -462,6 +462,7 @@ mod tests { ns_id, table_id, TABLE_NAME.into(), + 
SortKeyState::Provided(None), None, ), )); diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 009018820a..7707f8301f 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -8,12 +8,15 @@ use data_types::{ }; use iox_query::exec::Executor; use mutable_batch::MutableBatch; -use schema::selection::Selection; +use schema::{selection::Selection, sort::SortKey}; use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; -use self::buffer::{BufferBatch, DataBuffer}; +use self::{ + buffer::{BufferBatch, DataBuffer}, + resolver::DeferredSortKey, +}; use crate::{data::query_dedup::query, query::QueryableBatch}; mod buffer; @@ -132,7 +135,28 @@ impl SnapshotBatch { } } -/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard +/// The load state of the [`SortKey`] for a given partition. +#[derive(Debug)] +pub(crate) enum SortKeyState { + /// The [`SortKey`] has not yet been fetched from the catalog, and will be + /// lazy loaded (or loaded in the background) by a call to + /// [`DeferredSortKey::get()`]. + Deferred(DeferredSortKey), + /// The sort key is known and specified. + Provided(Option), +} + +impl SortKeyState { + async fn get(&self) -> Option { + match self { + Self::Deferred(v) => v.get().await, + Self::Provided(v) => v.clone(), + } + } +} + +/// Data of an IOx Partition of a given Table of a Namespace that belongs to a +/// given Shard #[derive(Debug)] pub struct PartitionData { /// The catalog ID of the partition this buffer is for. @@ -140,6 +164,17 @@ pub struct PartitionData { /// The string partition key for this partition. partition_key: PartitionKey, + /// The sort key of this partition. 
+ /// + /// This can known, in which case this field will contain a + /// [`SortKeyState::Provided`] with the [`SortKey`], or unknown with a value + /// of [`SortKeyState::Deferred`] causing it to be loaded from the catalog + /// (potentially) in the background or at read time. + /// + /// Callers should use [`Self::sort_key()`] to be abstracted away from these + /// fetch details. + sort_key: SortKeyState, + /// The shard, namespace & table IDs for this partition. shard_id: ShardId, namespace_id: NamespaceId, @@ -156,6 +191,7 @@ pub struct PartitionData { impl PartitionData { /// Initialize a new partition data buffer + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: PartitionId, partition_key: PartitionKey, @@ -163,11 +199,13 @@ impl PartitionData { namespace_id: NamespaceId, table_id: TableId, table_name: Arc, + sort_key: SortKeyState, max_persisted_sequence_number: Option, ) -> Self { Self { id, partition_key, + sort_key, shard_id, namespace_id, table_id, @@ -347,6 +385,13 @@ impl PartitionData { pub fn namespace_id(&self) -> NamespaceId { self.namespace_id } + + /// Return the [`SortKey`] for this partition. + /// + /// NOTE: this MAY involve querying the catalog with unbounded retries. 
+ pub async fn sort_key(&self) -> Option { + self.sort_key.get().await + } } #[cfg(test)] @@ -366,6 +411,7 @@ mod tests { NamespaceId::new(42), TableId::new(1), "foo".into(), + SortKeyState::Provided(None), None, ); @@ -413,6 +459,7 @@ mod tests { NamespaceId::new(42), TableId::new(t_id), "restaurant".into(), + SortKeyState::Provided(None), None, ); let exec = Executor::new(1); diff --git a/ingester/src/data/partition/resolver/cache.rs b/ingester/src/data/partition/resolver/cache.rs index 8755e73d90..a9dd897444 100644 --- a/ingester/src/data/partition/resolver/cache.rs +++ b/ingester/src/data/partition/resolver/cache.rs @@ -1,13 +1,15 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use async_trait::async_trait; +use backoff::BackoffConfig; use data_types::{ NamespaceId, Partition, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, }; +use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::partition::{resolver::DeferredSortKey, PartitionData, SortKeyState}; use super::r#trait::PartitionProvider; @@ -43,6 +45,18 @@ struct Entry { /// Each cache hit _removes_ the entry from the cache - this eliminates the /// memory overhead for items that were hit. This is the expected (only valid!) /// usage pattern. +/// +/// # Deferred Sort Key Loading +/// +/// This cache does NOT cache the [`SortKey`] for each [`PartitionData`], as the +/// sort key can be large and is likely unique per table, and thus not +/// share-able across instances / prohibitively expensive to cache. +/// +/// Instead cached instances are returned with a deferred sort key resolver +/// which attempts to fetch the sort key in the background some time after +/// construction. +/// +/// [`SortKey`]: schema::sort::SortKey #[derive(Debug)] pub(crate) struct PartitionCache { // The inner delegate called for a cache miss. 
@@ -59,13 +73,31 @@ pub(crate) struct PartitionCache { /// a faster search for cache misses. #[allow(clippy::type_complexity)] entries: Mutex>>>, + + /// Data needed to construct the [`DeferredSortKey`] for cached entries. + catalog: Arc, + backoff_config: BackoffConfig, + /// The maximum amount of time a [`DeferredSortKey`] may wait until + /// pre-fetching the sort key in the background. + max_smear: Duration, } impl PartitionCache { /// Initialise a [`PartitionCache`] containing the specified partitions. /// /// Any cache miss is passed through to `inner`. - pub(crate) fn new

(inner: T, partitions: P) -> Self + /// + /// Any cache hit returns a [`PartitionData`] configured with a + /// [`SortKeyState::Deferred`] for deferred key loading in the background. + /// The [`DeferredSortKey`] is initialised with the given `catalog`, + /// `backoff_config`, and `max_smear` maximal load wait duration. + pub(crate) fn new

( + inner: T, + partitions: P, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self where P: IntoIterator, { @@ -97,6 +129,9 @@ impl PartitionCache { Self { entries: Mutex::new(entries), inner, + catalog, + backoff_config, + max_smear, } } @@ -171,6 +206,12 @@ where namespace_id, table_id, table_name, + SortKeyState::Deferred(DeferredSortKey::new( + cached.partition_id, + self.max_smear, + Arc::clone(&__self.catalog), + self.backoff_config.clone(), + )), cached.max_sequence_number, ); } @@ -186,6 +227,8 @@ where #[cfg(test)] mod tests { + use iox_catalog::mem::MemCatalog; + use crate::data::partition::resolver::MockPartitionProvider; use super::*; @@ -197,6 +240,22 @@ mod tests { const TABLE_ID: TableId = TableId::new(3); const TABLE_NAME: &str = "platanos"; + fn new_cache

( + inner: MockPartitionProvider, + partitions: P, + ) -> PartitionCache + where + P: IntoIterator, + { + PartitionCache::new( + inner, + partitions, + Duration::from_secs(10_000_000), + Arc::new(MemCatalog::new(Arc::new(metric::Registry::default()))), + BackoffConfig::default(), + ) + } + #[tokio::test] async fn test_miss() { let data = PartitionData::new( @@ -206,11 +265,12 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ); let inner = MockPartitionProvider::default().with_partition(data); - let cache = PartitionCache::new(inner, []); + let cache = new_cache(inner, []); let got = cache .get_partition( PARTITION_KEY.into(), @@ -238,11 +298,11 @@ mod tests { shard_id: SHARD_ID, table_id: TABLE_ID, partition_key: stored_partition_key.clone(), - sort_key: Default::default(), + sort_key: vec!["dos".to_string(), "bananas".to_string()], persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let callers_partition_key = PartitionKey::from(PARTITION_KEY); let got = cache @@ -274,7 +334,7 @@ mod tests { } #[tokio::test] - async fn test_miss_partition_jey() { + async fn test_miss_partition_key() { let other_key = PartitionKey::from("test"); let other_key_id = PartitionId::new(99); let inner = MockPartitionProvider::default().with_partition(PartitionData::new( @@ -284,6 +344,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -296,7 +357,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( other_key.clone(), @@ -323,6 +384,7 @@ mod tests { NAMESPACE_ID, other_table, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -335,7 +397,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = 
PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), @@ -362,6 +424,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -374,7 +437,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), diff --git a/ingester/src/data/partition/resolver/catalog.rs b/ingester/src/data/partition/resolver/catalog.rs index 8035546be6..128b9a5614 100644 --- a/ingester/src/data/partition/resolver/catalog.rs +++ b/ingester/src/data/partition/resolver/catalog.rs @@ -9,7 +9,7 @@ use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId}; use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; -use crate::data::partition::PartitionData; +use crate::data::partition::{PartitionData, SortKeyState}; use super::r#trait::PartitionProvider; @@ -78,6 +78,7 @@ impl PartitionProvider for CatalogPartitionResolver { namespace_id, table_id, table_name, + SortKeyState::Provided(p.sort_key()), p.persisted_sequence_number, ) } @@ -144,6 +145,7 @@ mod tests { .await; assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); + assert_eq!(got.sort_key().await, None); assert_eq!(got.max_persisted_sequence_number(), None); assert!(got.partition_key.ptr_eq(&callers_partition_key)); diff --git a/ingester/src/data/partition/resolver/trait.rs b/ingester/src/data/partition/resolver/trait.rs index ab89d6753e..a8bf3134e4 100644 --- a/ingester/src/data/partition/resolver/trait.rs +++ b/ingester/src/data/partition/resolver/trait.rs @@ -49,7 +49,7 @@ mod tests { use data_types::PartitionId; - use crate::data::partition::resolver::MockPartitionProvider; + use crate::data::partition::{resolver::MockPartitionProvider, SortKeyState}; use super::*; @@ -68,6 
+68,7 @@ mod tests { namespace_id, table_id, Arc::clone(&table_name), + SortKeyState::Provided(None), None, ); diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 11432f688c..041001126b 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -202,7 +202,7 @@ mod tests { use metric::{Attributes, Metric}; use crate::{ - data::partition::{resolver::MockPartitionProvider, PartitionData}, + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, lifecycle::mock_handle::MockLifecycleHandle, test_util::{make_write_op, populate_catalog}, }; @@ -234,6 +234,7 @@ mod tests { ns_id, table_id, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ), )); diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 000d4d1973..9a49b5f291 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -293,7 +293,7 @@ mod tests { use crate::{ data::{ - partition::{resolver::MockPartitionProvider, PartitionData}, + partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, Error, }, lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle}, @@ -328,6 +328,7 @@ mod tests { ns_id, table_id, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ), )); @@ -387,6 +388,7 @@ mod tests { ns_id, table_id, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ), )); diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index 7f51190102..1f51ce194d 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -41,6 +41,13 @@ use crate::{ }, }; +/// The maximum duration of time between creating a [`PartitionData`] and its +/// [`SortKey`] being fetched from the catalog. 
+/// +/// [`PartitionData`]: crate::data::partition::PartitionData +/// [`SortKey`]: schema::sort::SortKey +const SORT_KEY_PRE_FETCH: Duration = Duration::from_secs(30); + #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] pub enum Error { @@ -160,7 +167,13 @@ impl IngestHandlerImpl { // Build the partition provider. let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog)); - let partition_provider = PartitionCache::new(partition_provider, recent_partitions); + let partition_provider = PartitionCache::new( + partition_provider, + recent_partitions, + SORT_KEY_PRE_FETCH, + Arc::clone(&catalog), + BackoffConfig::default(), + ); let partition_provider: Arc = Arc::new(partition_provider); // build the initial ingester data state diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 431c22cdb7..3aae75747d 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -463,7 +463,10 @@ pub trait PartitionRepo: Send + Sync { partition_id: PartitionId, ) -> Result>; - /// Update the sort key for the partition + /// Update the sort key for the partition. + /// + /// NOTE: it is expected that ONLY the ingesters update sort keys for + /// existing partitions. async fn update_sort_key( &mut self, partition_id: PartitionId, From 95ed41f14063f53e37c10614a77a3d69c6c26188 Mon Sep 17 00:00:00 2001 From: Nga Tran Date: Thu, 6 Oct 2022 13:21:23 -0400 Subject: [PATCH 20/40] feat: Projection pushdown for querier -> ingester for rpc queries (#5782) * feat: initial step to identify where the projection should be provided * feat: start getting columns of all expressions * chore: format * test: test for the table_chunk_stream * fix: fix a compile error. 
Thanks @alamb * test: full tests for table_chunk_stream * chore: cleanup * fix: do not cut any columns in case all fields are needed * test: add one more test case of reading all columns * refactor: move code that identify columbs ot push down to a function. Add the use of field_columns * chore: cleanup * refactor: make sream_from_batch support empty batches * chore: cleanup * chore: fix clippy after auto merge Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- datafusion_util/src/lib.rs | 15 +- iox_query/src/exec/seriesset/converter.rs | 5 +- iox_query/src/frontend/influxrpc.rs | 949 +++++++++++++++++----- iox_query/src/lib.rs | 3 + iox_query/src/provider/adapter.rs | 10 +- iox_query/src/test.rs | 107 ++- querier/src/cache/read_buffer.rs | 4 +- querier/src/namespace/query_access.rs | 3 +- query_tests/src/table_schema.rs | 2 +- 9 files changed, 859 insertions(+), 239 deletions(-) diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs index 75fd250dd0..38a9c8cd05 100644 --- a/datafusion_util/src/lib.rs +++ b/datafusion_util/src/lib.rs @@ -15,7 +15,7 @@ use datafusion::execution::context::TaskContext; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::common::SizedRecordBatchStream; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion::{ arrow::{ @@ -236,12 +236,19 @@ where } /// Create a SendableRecordBatchStream a RecordBatch -pub fn stream_from_batch(batch: RecordBatch) -> SendableRecordBatchStream { - stream_from_batches(vec![Arc::new(batch)]) +pub fn stream_from_batch(schema: Arc, batch: RecordBatch) -> SendableRecordBatchStream { + stream_from_batches(schema, vec![Arc::new(batch)]) } /// Create a SendableRecordBatchStream from Vec of RecordBatches with the same 
schema -pub fn stream_from_batches(batches: Vec>) -> SendableRecordBatchStream { +pub fn stream_from_batches( + schema: Arc, + batches: Vec>, +) -> SendableRecordBatchStream { + if batches.is_empty() { + return Box::pin(EmptyRecordBatchStream::new(schema)); + } + let dummy_metrics = ExecutionPlanMetricsSet::new(); let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0); let stream = SizedRecordBatchStream::new(batches[0].schema(), batches, mem_metrics); diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs index 6c85358e4f..ca6be3acde 100644 --- a/iox_query/src/exec/seriesset/converter.rs +++ b/iox_query/src/exec/seriesset/converter.rs @@ -762,7 +762,7 @@ mod tests { .unwrap(); // Input has one row that has no value (NULL value) for tag_b, which is its own series - let input = stream_from_batch(batch); + let input = stream_from_batch(batch.schema(), batch); let table_name = "foo"; let tag_columns = ["tag_a", "tag_b"]; @@ -873,7 +873,8 @@ mod tests { .collect(); // stream from those batches - stream_from_batches(batches) + assert!(!batches.is_empty()); + stream_from_batches(batches[0].schema(), batches) }) .collect() } diff --git a/iox_query/src/frontend/influxrpc.rs b/iox_query/src/frontend/influxrpc.rs index 1018eed91e..1a8750c779 100644 --- a/iox_query/src/frontend/influxrpc.rs +++ b/iox_query/src/frontend/influxrpc.rs @@ -17,12 +17,14 @@ use arrow::datatypes::DataType; use data_types::ChunkId; use datafusion::{ error::DataFusionError, + logical_expr::utils::exprlist_to_columns, logical_plan::{col, when, DFSchemaRef, Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder}, + prelude::Column, }; use datafusion_util::AsExpr; use futures::{Stream, StreamExt, TryStreamExt}; use hashbrown::HashSet; -use observability_deps::tracing::{debug, trace}; +use observability_deps::tracing::{debug, trace, warn}; use predicate::{rpc_predicate::InfluxRpcPredicate, Predicate, PredicateMatch}; use query_functions::{ 
group_by::{Aggregate, WindowDuration}, @@ -31,6 +33,7 @@ use query_functions::{ }; use schema::{selection::Selection, InfluxColumnType, Schema, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::collections::HashSet as StdHashSet; use std::{cmp::Reverse, collections::BTreeSet, sync::Arc}; const CONCURRENT_TABLE_JOBS: usize = 10; @@ -72,6 +75,11 @@ pub enum Error { #[snafu(display("gRPC planner got error building plan: {}", source))] BuildingPlan { source: DataFusionError }, + #[snafu(display("gRPC planner got error reading columns from expression: {}", source))] + ReadColumns { + source: datafusion::error::DataFusionError, + }, + #[snafu(display( "gRPC planner error: column '{}' is not a tag, it is {:?}", tag_name, @@ -145,6 +153,7 @@ impl Error { Self::GettingChunks { source, .. } | Self::CreatingPredicates { source, .. } | Self::BuildingPlan { source, .. } + | Self::ReadColumns { source, .. } | Self::CheckingChunkPredicate { source, .. } | Self::FindingColumnNames { source, .. } | Self::FindingColumnValues { source, .. 
} @@ -229,49 +238,50 @@ impl InfluxRpcPlanner { let table_predicates = rpc_predicate .table_predicates(database.as_meta()) .context(CreatingPredicatesSnafu)?; - let tables: Vec<_> = table_chunk_stream(Arc::clone(&database), &table_predicates, &ctx) - .try_filter_map(|(table_name, predicate, chunks)| async move { - // Identify which chunks can answer from its metadata and then record its table, - // and which chunks needs full plan and group them into their table - let mut chunks_full = vec![]; - for chunk in cheap_chunk_first(chunks) { - trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); + let tables: Vec<_> = + table_chunk_stream(Arc::clone(&database), false, &table_predicates, &ctx) + .try_filter_map(|(table_name, predicate, chunks)| async move { + // Identify which chunks can answer from its metadata and then record its table, + // and which chunks needs full plan and group them into their table + let mut chunks_full = vec![]; + for chunk in cheap_chunk_first(chunks) { + trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); - // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate - // deleted data before we can determine if its table participates in the requested predicate. - if chunk.has_delete_predicates() { - chunks_full.push(chunk); - } else { - // Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate + // deleted data before we can determine if its table participates in the requested predicate. 
+ if chunk.has_delete_predicates() { + chunks_full.push(chunk); + } else { + // Try and apply the predicate using only metadata + let pred_result = chunk + .apply_predicate_to_metadata(predicate) + .context(CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + })?; - match pred_result { - PredicateMatch::AtLeastOneNonNullField => { - trace!("Metadata predicate: table matches"); - // Meta data of the table covers predicates of the request - return Ok(Some((table_name, None))); + match pred_result { + PredicateMatch::AtLeastOneNonNullField => { + trace!("Metadata predicate: table matches"); + // Meta data of the table covers predicates of the request + return Ok(Some((table_name, None))); + } + PredicateMatch::Unknown => { + trace!("Metadata predicate: unknown match"); + // We cannot match the predicate to get answer from meta data, let do full plan + chunks_full.push(chunk); + } + PredicateMatch::Zero => { + trace!("Metadata predicate: zero rows match"); + } // this chunk's table does not participate in the request } - PredicateMatch::Unknown => { - trace!("Metadata predicate: unknown match"); - // We cannot match the predicate to get answer from meta data, let do full plan - chunks_full.push(chunk); - } - PredicateMatch::Zero => { - trace!("Metadata predicate: zero rows match"); - } // this chunk's table does not participate in the request } } - } - Ok((!chunks_full.is_empty()) - .then_some((table_name, Some((predicate, chunks_full))))) - }) - .try_collect() - .await?; + Ok((!chunks_full.is_empty()) + .then_some((table_name, Some((predicate, chunks_full))))) + }) + .try_collect() + .await?; // Feed builder let mut builder = StringSetPlanBuilder::new(); @@ -343,84 +353,88 @@ impl InfluxRpcPlanner { } } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_need_chunks, &ctx) - .and_then(|(table_name, predicate, chunks)| { - let mut ctx = ctx.child_ctx("table"); - ctx.set_metadata("table", table_name.to_owned()); + let tables: Vec<_> = 
table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_need_chunks, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| { + let mut ctx = ctx.child_ctx("table"); + ctx.set_metadata("table", table_name.to_owned()); - async move { - let mut chunks_full = vec![]; - let mut known_columns = BTreeSet::new(); + async move { + let mut chunks_full = vec![]; + let mut known_columns = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // Try and apply the predicate using only metadata - let pred_result = chunk - .apply_predicate_to_metadata(predicate) - .context(CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - })?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // get only tag columns from metadata + let schema = chunk.schema(); + + let column_names: Vec<&str> = schema + .tags_iter() + .map(|f| f.name().as_str()) + .collect::>(); + + let selection = Selection::Some(&column_names); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag keys + if chunk.has_delete_predicates() { + debug!( + %table_name, + chunk_id=%chunk.id().get(), + "column names need full plan" + ); + chunks_full.push(chunk); + } else { + // filter the columns further from the predicate + let maybe_names = chunk + .column_names( + ctx.child_ctx("column_names execution"), + predicate, + selection, + ) + .context(FindingColumnNamesSnafu)?; + + match maybe_names { + Some(mut names) => { + debug!( + %table_name, + names=?names, + chunk_id=%chunk.id().get(), + "column names found from metadata", + ); + known_columns.append(&mut names); } - - // get only tag columns from metadata 
- let schema = chunk.schema(); - - let column_names: Vec<&str> = schema - .tags_iter() - .map(|f| f.name().as_str()) - .collect::>(); - - let selection = Selection::Some(&column_names); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag keys - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "column names need full plan" ); chunks_full.push(chunk); - } else { - // filter the columns further from the predicate - let maybe_names = chunk - .column_names( - ctx.child_ctx("column_names execution"), - predicate, - selection, - ) - .context(FindingColumnNamesSnafu)?; - - match maybe_names { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "column names found from metadata", - ); - known_columns.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "column names need full plan" - ); - chunks_full.push(chunk); - } - } } } - - Ok((table_name, predicate, chunks_full, known_columns)) } - }) - .try_collect() - .await?; + } + + Ok((table_name, predicate, chunks_full, known_columns)) + } + }) + .try_collect() + .await?; // At this point, we have a set of column names we know pass // in `known_columns`, and potentially some tables in chunks @@ -494,100 +508,104 @@ impl InfluxRpcPlanner { table_predicates_filtered.push((table_name, predicate)); } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_filtered, &ctx) - .and_then(|(table_name, predicate, chunks)| async move { - let mut chunks_full = vec![]; - let mut known_values = BTreeSet::new(); + let tables: Vec<_> = table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_filtered, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| async move { + let mut chunks_full = vec![]; + let mut known_values = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // 
Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // use schema to validate column type + let schema = chunk.schema(); + + // Skip this table if the tag_name is not a column in this chunk + // Note: This may happen even when the table contains the tag_name, because some chunks may not + // contain all columns. + let idx = if let Some(idx) = schema.find_index_of(tag_name) { + idx + } else { + continue; + }; + + // Validate that this really is a Tag column + let (influx_column_type, field) = schema.field(idx); + ensure!( + matches!(influx_column_type, Some(InfluxColumnType::Tag)), + InvalidTagColumnSnafu { + tag_name, + influx_column_type, + } + ); + ensure!( + influx_column_type + .unwrap() + .valid_arrow_type(field.data_type()), + InternalInvalidTagTypeSnafu { + tag_name, + data_type: field.data_type().clone(), + } + ); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag values + if chunk.has_delete_predicates() { + debug!( + %table_name, + chunk_id=%chunk.id().get(), + "need full plan to find tag values" + ); + + chunks_full.push(chunk); + } else { + // try and get the list of values directly from metadata + let mut ctx = self.ctx.child_ctx("tag_values execution"); + ctx.set_metadata("table", table_name.to_owned()); + + let maybe_values = chunk + .column_values(ctx, tag_name, predicate) + .context(FindingColumnValuesSnafu)?; + + match maybe_values { + Some(mut names) => { + debug!( + 
%table_name, + names=?names, + chunk_id=%chunk.id().get(), + "tag values found from metadata", + ); + known_values.append(&mut names); } - - // use schema to validate column type - let schema = chunk.schema(); - - // Skip this table if the tag_name is not a column in this chunk - // Note: This may happen even when the table contains the tag_name, because some chunks may not - // contain all columns. - let idx = if let Some(idx) = schema.find_index_of(tag_name) { - idx - } else { - continue; - }; - - // Validate that this really is a Tag column - let (influx_column_type, field) = schema.field(idx); - ensure!( - matches!(influx_column_type, Some(InfluxColumnType::Tag)), - InvalidTagColumnSnafu { - tag_name, - influx_column_type, - } - ); - ensure!( - influx_column_type - .unwrap() - .valid_arrow_type(field.data_type()), - InternalInvalidTagTypeSnafu { - tag_name, - data_type: field.data_type().clone(), - } - ); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag values - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "need full plan to find tag values" ); - chunks_full.push(chunk); - } else { - // try and get the list of values directly from metadata - let mut ctx = self.ctx.child_ctx("tag_values execution"); - ctx.set_metadata("table", table_name.to_owned()); - - let maybe_values = chunk - .column_values(ctx, tag_name, predicate) - .context(FindingColumnValuesSnafu)?; - - match maybe_values { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "tag values found from metadata", - ); - known_values.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "need full plan to find tag values" - ); - chunks_full.push(chunk); - } - } } } + } + } - Ok((table_name, predicate, chunks_full, known_values)) - }) - .try_collect() - .await?; + Ok((table_name, predicate, 
chunks_full, known_values)) + }) + .try_collect() + .await?; let mut builder = StringSetPlanBuilder::new(); @@ -1314,8 +1332,18 @@ impl InfluxRpcPlanner { } /// Stream of chunks for table predicates. +/// This function is used by influx grpc meta queries that want to know which table/tags/fields +/// that match the given predicates. +/// `need_fields` means the grpc queries will need to return field columns. If `need_fields` +/// is false, the grpc query does not need to return field columns but it still filters data on the +/// field columns in the predicate +/// +/// This function is directly invoked by `table_name, `tag_keys` and `tag_values` where need_fields should be false. +/// This function is indirectly invoked by `field_columns`, `read_filter`, `read_group` and `read_window_aggregate` +/// through the function `create_plans` where need_fields should be true. fn table_chunk_stream<'a>( database: Arc, + need_fields: bool, table_predicates: &'a [(String, Predicate)], ctx: &'a IOxSessionContext, ) -> impl Stream>)>> + 'a { @@ -1326,9 +1354,22 @@ fn table_chunk_stream<'a>( let database = Arc::clone(&database); + let table_schema = database.table_schema(table_name); + let projection = match table_schema { + Some(table_schema) => { + columns_in_predicates(need_fields, table_schema, table_name, predicate) + } + None => None, + }; + async move { let chunks = database - .chunks(table_name, predicate, ctx.child_ctx("table chunks")) + .chunks( + table_name, + predicate, + &projection, + ctx.child_ctx("table chunks"), + ) .await .context(GettingChunksSnafu { table_name })?; @@ -1338,6 +1379,89 @@ fn table_chunk_stream<'a>( .buffered(CONCURRENT_TABLE_JOBS) } +// Return all columns in predicate's field_columns, exprs and val_exprs. +// Return None means nothing is filtered in this function and all field columns should be used. 
+// None is returned when: +// - we cannot determine at least one column in the predicate +// - need_fields is true and the predicate does not have any field_columns. +// This signal that all fields are needed. +// Note that the returned columns can also include tag and time columns if they happen to be +// in the predicate. +fn columns_in_predicates( + need_fields: bool, + table_schema: Arc, + table_name: &String, + predicate: &Predicate, +) -> Option> { + let mut columns = StdHashSet::new(); + + // columns in field_columns + match &predicate.field_columns { + Some(field_columns) => { + for field in field_columns { + columns.insert(Column { + relation: None, + name: (*field).clone(), + }); + } + } + None => { + if need_fields { + // fields wanted and `field_columns` is empty mean al fields will be needed + return None; + } + } + } + + // columns in exprs + let expr_cols_result = + exprlist_to_columns(&predicate.exprs, &mut columns).context(ReadColumnsSnafu); + + // columns in val_exprs + let exprs: Vec = predicate + .value_expr + .iter() + .map(|e| Expr::from((*e).clone())) + .collect(); + let val_exprs_cols_result = exprlist_to_columns(&exprs, &mut columns).context(ReadColumnsSnafu); + + let projection = if expr_cols_result.is_err() || val_exprs_cols_result.is_err() { + if expr_cols_result.is_err() { + let error_message = expr_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.exprs, ?error_message, "cannot determine columns in predicate.exprs"); + } + if val_exprs_cols_result.is_err() { + let error_message = val_exprs_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.value_expr, ?error_message, "cannot determine columns in predicate.value_expr"); + } + + None + } else { + // convert the column names into their corresponding indexes in the schema + let cols = columns + .iter() + .map(|c| table_schema.find_index_of(&c.name)) + .collect::>(); + + if cols.contains(&None) || cols.is_empty() { + // At least one column has no 
matching index, we do not know which + // columns to filter. Read all columns + warn!( + ?table_name, + ?predicate, + ?table_schema, + "cannot find index for at least one column in the table schema" + ); + None + } else { + // We know which columns to filter, read only those columns + Some(cols.into_iter().flatten().collect::>()) + } + }; + + projection +} + /// Create plans that fetch the data specified in table_predicates. /// /// table_predicates contains `(table_name, predicate_specialized_for_that_table)` @@ -1366,7 +1490,7 @@ where + Sync, P: Send, { - table_chunk_stream(Arc::clone(&database), table_predicates, &ctx) + table_chunk_stream(Arc::clone(&database), true, table_predicates, &ctx) .and_then(|(table_name, predicate, chunks)| async move { let chunks = prune_chunks_metadata(chunks, predicate)?; Ok((table_name, predicate, chunks)) @@ -1764,15 +1888,462 @@ fn cheap_chunk_first(mut chunks: Vec>) -> Vec all columns will be selected + let need_fields = true; + + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because we asked it return all fileds (and implicit PK) even though the predicate is on `foo` only + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + //////////////////////////// + // Test 2: no need_fields --> only PK + columns 
in predicate are return + let need_fields = false; + + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes only 3 columns of the table PK + cols in predicate + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 3); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_empty_pred() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // empty predicate + let predicate = Predicate::new(); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + ///////////// + // Test 1: empty predicate with need_fields + let need_fields = true; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // 
returned chunks + + // chunk schema includes all 5 columns of the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: empty predicate without need_fields + let need_fields = false; + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_on_tag_no_data() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column(), // no row added for this chunk on purpose + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on a tag column `foo` + let expr = 
col("foo").eq(lit("some_thing")); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // Since no data, we do not do pushdown in the test chunk. + // the no-data returned chunk will include all columns of the table + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_and_field_columns() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + let need_fields = false; + + ///////////// + // Test 1: predicate on field `i64_field_2` and `field_columns` is empty + // predicate on field column + let expr = col("i64_field_2").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let result = 
table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_2 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(3).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: predicate on tag `foo` and `field_columns` is not empty + let expr = col("bar").eq(lit(10)); + let predicate = Predicate::new() + .with_expr(expr) + .with_field_columns(vec!["i64_field".to_string()]); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_1 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), 
TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_on_unknown_field() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on unknown column + let expr = col("unknown_name").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns since we hit the unknown columnd + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + #[tokio::test] async fn test_predicate_rewrite_table_names() { run_test(|test_db, rpc_predicate| { diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index b9d09544a3..7863e9750f 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -151,10 +151,13 @@ pub trait QueryDatabase: QueryDatabaseMeta + 
Debug + Send + Sync { /// Returns a set of chunks within the partition with data that may match /// the provided predicate. If possible, chunks which have no rows that can /// possibly match the predicate may be omitted. + /// If projection is None, returned chunks will include all columns of its original data. Otherwise, + /// returned chunks will includs PK columns (tags and time) and columns specified in the projection. async fn chunks( &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, ) -> Result>, DataFusionError>; diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs index 23cb2e2f6a..cf143dcb57 100644 --- a/iox_query/src/provider/adapter.rs +++ b/iox_query/src/provider/adapter.rs @@ -262,7 +262,7 @@ mod tests { let batch = make_batch(); let output_schema = batch.schema(); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -291,7 +291,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -321,7 +321,7 @@ mod tests { Field::new("d", DataType::Float32, true), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -349,7 +349,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = 
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!( @@ -368,7 +368,7 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which had type 'Float32'"); diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index 256cd069ea..e7a0503f1c 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -108,18 +108,54 @@ impl QueryDatabase for TestDatabase { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, _ctx: IOxSessionContext, ) -> Result>, DataFusionError> { // save last predicate *self.chunks_predicate.lock() = predicate.clone(); - let partitions = self.partitions.lock(); - Ok(partitions + let partitions = self.partitions.lock().clone(); + let chunks = partitions .values() .flat_map(|x| x.values()) .filter(|x| x.table_name == table_name) - .map(|x| Arc::clone(x) as _) - .collect()) + .map(|x| Arc::clone(x) as Arc) + .collect::>(); + + // Return chunks with fewer columns if a projection is specified + let mut new_chunks = Vec::with_capacity(chunks.len()); + for c in chunks { + let schema = c.schema(); + let cols = schema.select_given_and_pk_columns(projection); + let cols = cols.iter().map(|c| c.as_str()).collect::>(); + let selection = Selection::Some(&cols); + + let read_result = + c.read_filter(IOxSessionContext::with_testing(), predicate, selection); + if read_result.is_err() { + return Err(read_result.err().unwrap()); + } + let mut stream = read_result.unwrap(); + + let mut new_chunk = TestChunk::new(c.table_name()); + while let Some(b) = stream.next().await { + let b = b.expect("Error in stream"); + 
new_chunk.table_data.push(Arc::new(b)); + } + + let new_chunk = if !new_chunk.table_data.is_empty() { + let new_schema = Schema::try_from(new_chunk.table_data[0].schema()).unwrap(); + let new_chunk = new_chunk.add_schema_to_table(new_schema, true, None); + Arc::new(new_chunk) as _ + } else { + // No data, return the original empty chunk with the original schema + c + }; + + new_chunks.push(new_chunk); + } + + Ok(new_chunks) } fn record_query( @@ -509,12 +545,8 @@ impl TestChunk { mut self, new_column_schema: Schema, add_column_summary: bool, - stats: Option, + input_stats: Option, ) -> Self { - // assume the new schema has exactly a single table - assert_eq!(new_column_schema.len(), 1); - let (col_type, new_field) = new_column_schema.field(0); - let mut merger = SchemaMerger::new(); merger = merger.merge(&new_column_schema).unwrap(); merger = merger @@ -522,34 +554,38 @@ impl TestChunk { .expect("merging was successful"); self.schema = merger.build(); - if add_column_summary { - let influxdb_type = col_type.map(|t| match t { - InfluxColumnType::Tag => InfluxDbType::Tag, - InfluxColumnType::Field(_) => InfluxDbType::Field, - InfluxColumnType::Timestamp => InfluxDbType::Timestamp, - }); + for i in 0..new_column_schema.len() { + let (col_type, new_field) = new_column_schema.field(i); + if add_column_summary { + let influxdb_type = col_type.map(|t| match t { + InfluxColumnType::Tag => InfluxDbType::Tag, + InfluxColumnType::Field(_) => InfluxDbType::Field, + InfluxColumnType::Timestamp => InfluxDbType::Timestamp, + }); - let stats = stats.unwrap_or_else(|| match new_field.data_type() { - DataType::Boolean => Statistics::Bool(StatValues::default()), - DataType::Int64 => Statistics::I64(StatValues::default()), - DataType::UInt64 => Statistics::U64(StatValues::default()), - DataType::Utf8 => Statistics::String(StatValues::default()), - DataType::Dictionary(_, value_type) => { - assert!(matches!(**value_type, DataType::Utf8)); - Statistics::String(StatValues::default()) - } 
- DataType::Float64 => Statistics::F64(StatValues::default()), - DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), - _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), - }); + let stats = input_stats.clone(); + let stats = stats.unwrap_or_else(|| match new_field.data_type() { + DataType::Boolean => Statistics::Bool(StatValues::default()), + DataType::Int64 => Statistics::I64(StatValues::default()), + DataType::UInt64 => Statistics::U64(StatValues::default()), + DataType::Utf8 => Statistics::String(StatValues::default()), + DataType::Dictionary(_, value_type) => { + assert!(matches!(**value_type, DataType::Utf8)); + Statistics::String(StatValues::default()) + } + DataType::Float64 => Statistics::F64(StatValues::default()), + DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), + _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), + }); - let column_summary = ColumnSummary { - name: new_field.name().clone(), - influxdb_type, - stats, - }; + let column_summary = ColumnSummary { + name: new_field.name().clone(), + influxdb_type, + stats, + }; - self.table_summary.columns.push(column_summary); + self.table_summary.columns.push(column_summary); + } } self @@ -942,7 +978,8 @@ impl QueryChunk for TestChunk { }) .collect::, ArrowError>>()?, }; - Ok(stream_from_batches(batches)) + + Ok(stream_from_batches(self.schema().as_arrow(), batches)) } fn chunk_type(&self) -> &str { diff --git a/querier/src/cache/read_buffer.rs b/querier/src/cache/read_buffer.rs index 4c68bcac9d..63138e242a 100644 --- a/querier/src/cache/read_buffer.rs +++ b/querier/src/cache/read_buffer.rs @@ -470,9 +470,9 @@ mod tests { .into_iter() .map(lp_to_record_batch) .map(Arc::new) - .collect(); + .collect::>(); - let stream = stream_from_batches(batches); + let stream = stream_from_batches(batches[0].schema(), batches); let metric_registry = metric::Registry::new(); diff --git a/querier/src/namespace/query_access.rs 
b/querier/src/namespace/query_access.rs index e7a3856554..30b9975a06 100644 --- a/querier/src/namespace/query_access.rs +++ b/querier/src/namespace/query_access.rs @@ -41,6 +41,7 @@ impl QueryDatabase for QuerierNamespace { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, ) -> Result>, DataFusionError> { debug!(%table_name, %predicate, "Finding chunks for table"); @@ -58,7 +59,7 @@ impl QueryDatabase for QuerierNamespace { .chunks( predicate, ctx.span().map(|span| span.child("querier table chunks")), - &None, // todo: pushdown projection to chunks + projection, ) .await?; diff --git a/query_tests/src/table_schema.rs b/query_tests/src/table_schema.rs index f01a1b8b7d..359ba1ce49 100644 --- a/query_tests/src/table_schema.rs +++ b/query_tests/src/table_schema.rs @@ -38,7 +38,7 @@ async fn run_table_schema_test_case( let ctx = db.new_query_context(None); let chunks = db - .chunks(table_name, &Default::default(), ctx) + .chunks(table_name, &Default::default(), &None, ctx) .await .expect("error getting chunks"); for chunk in chunks { From 5bd6b43666adc8a4c5be28e056a6c1ee775bcc5b Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Fri, 7 Oct 2022 11:01:22 +1100 Subject: [PATCH 21/40] fix: Correct representation of 3-part measurement name (#5794) Closes #5662 --- influxdb_influxql_parser/src/common.rs | 196 +++++++++++++----- influxdb_influxql_parser/src/delete.rs | 5 + influxdb_influxql_parser/src/internal.rs | 6 +- influxdb_influxql_parser/src/select.rs | 13 +- .../src/show_measurements.rs | 111 +++++----- .../src/simple_from_clause.rs | 77 +++---- 6 files changed, 229 insertions(+), 179 deletions(-) diff --git a/influxdb_influxql_parser/src/common.rs b/influxdb_influxql_parser/src/common.rs index 51266177d6..a6b245c397 100644 --- a/influxdb_influxql_parser/src/common.rs +++ b/influxdb_influxql_parser/src/common.rs @@ -2,6 +2,7 @@ use crate::expression::conditional::{conditional_expression, ConditionalExpressi use 
crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use crate::literal::unsigned_integer; +use crate::string::{regex, Regex}; use core::fmt; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; @@ -11,73 +12,82 @@ use nom::multi::separated_list1; use nom::sequence::{pair, preceded, terminated}; use std::fmt::{Display, Formatter}; -/// Represents a fully-qualified measurement name. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct MeasurementNameExpression { +/// Represents a measurement name as either an identifier or a regular expression. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MeasurementName { + /// A measurement name expressed as an [`Identifier`]. + Name(Identifier), + + /// A measurement name expressed as a [`Regex`]. + Regex(Regex), +} + +impl Parser for MeasurementName { + /// Parse a measurement name, which may be an identifier or a regular expression. + fn parse(i: &str) -> ParseResult<&str, Self> { + alt(( + map(identifier, MeasurementName::Name), + map(regex, MeasurementName::Regex), + ))(i) + } +} + +impl Display for MeasurementName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Name(ident) => fmt::Display::fmt(ident, f), + Self::Regex(regex) => fmt::Display::fmt(regex, f), + } + } +} + +/// Represents a fully-qualified, 3-part measurement name. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QualifiedMeasurementName { pub database: Option, pub retention_policy: Option, - pub name: Identifier, + pub name: MeasurementName, } -impl MeasurementNameExpression { - /// Constructs a new `MeasurementNameExpression` with the specified `name`. - pub fn new(name: Identifier) -> Self { - Self { - database: None, - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. 
- pub fn new_db(name: Identifier, database: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. - pub fn new_db_rp(name: Identifier, database: Identifier, retention_policy: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: Some(retention_policy), - name, - } - } -} - -impl fmt::Display for MeasurementNameExpression { +impl Display for QualifiedMeasurementName { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self { database: None, retention_policy: None, name, - } => write!(f, "{}", name)?, + } => write!(f, "{}", name), Self { database: Some(db), retention_policy: None, name, - } => write!(f, "{}..{}", db, name)?, + } => write!(f, "{}..{}", db, name), Self { database: None, retention_policy: Some(rp), name, - } => write!(f, "{}.{}", rp, name)?, + } => write!(f, "{}.{}", rp, name), Self { database: Some(db), retention_policy: Some(rp), name, - } => write!(f, "{}.{}.{}", db, rp, name)?, - }; - Ok(()) + } => write!(f, "{}.{}.{}", db, rp, name), + } } } -/// Match a 3-part measurement name expression. -pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementNameExpression> { +/// Match a fully-qualified, 3-part measurement name. +/// +/// ```text +/// qualified_measurement_name ::= measurement_name | +/// ( policy_name "." measurement_name ) | +/// ( db_name "." policy_name? "." measurement_name ) +/// +/// db_name ::= identifier +/// policy_name ::= identifier +/// measurement_name ::= identifier | regex_lit +/// ``` +pub fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> { let (remaining_input, (opt_db_rp, name)) = pair( opt(alt(( // database "." retention_policy "." @@ -93,7 +103,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName // retention_policy "." 
map(terminated(identifier, tag(".")), |rp| (None, Some(rp))), ))), - identifier, + MeasurementName::parse, )(i)?; // Extract possible `database` and / or `retention_policy` @@ -104,7 +114,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName Ok(( remaining_input, - MeasurementNameExpression { + QualifiedMeasurementName { database, retention_policy, name, @@ -290,35 +300,107 @@ mod tests { use crate::assert_expect_error; use nom::character::complete::alphanumeric1; - #[test] - fn test_measurement_name_expression() { - let (_, got) = measurement_name_expression("diskio").unwrap(); - assert_eq!( - got, - MeasurementNameExpression { + impl From<&str> for MeasurementName { + /// Convert a `str` to [`MeasurementName::Name`]. + fn from(s: &str) -> Self { + Self::Name(Identifier(s.into())) + } + } + + impl QualifiedMeasurementName { + /// Constructs a new `MeasurementNameExpression` with the specified `name`. + pub fn new(name: MeasurementName) -> Self { + Self { database: None, retention_policy: None, - name: "diskio".into(), + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. + pub fn new_db(name: MeasurementName, database: Identifier) -> Self { + Self { + database: Some(database), + retention_policy: None, + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. 
+ pub fn new_db_rp( + name: MeasurementName, + database: Identifier, + retention_policy: Identifier, + ) -> Self { + Self { + database: Some(database), + retention_policy: Some(retention_policy), + name, + } + } + } + + #[test] + fn test_qualified_measurement_name() { + use MeasurementName::*; + + let (_, got) = qualified_measurement_name("diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf.autogen.diskio").unwrap(); + let (_, got) = qualified_measurement_name("/diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: Some("autogen".into()), - name: "diskio".into(), + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf..diskio").unwrap(); + let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: None, - name: "diskio".into(), + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Regex("diskio".into()), } ); } diff --git a/influxdb_influxql_parser/src/delete.rs b/influxdb_influxql_parser/src/delete.rs 
index 3613e027ea..6d8a8c7cad 100644 --- a/influxdb_influxql_parser/src/delete.rs +++ b/influxdb_influxql_parser/src/delete.rs @@ -73,9 +73,14 @@ mod test { // Validate via the Display trait, as we don't need to validate the contents of the // FROM and / or WHERE clauses, given they are tested in their on modules. + // Measurement name expressed as an identifier let (_, got) = delete_statement("DELETE FROM foo").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo"); + // Measurement name expressed as a regular expression + let (_, got) = delete_statement("DELETE FROM /foo/").unwrap(); + assert_eq!(format!("{}", got), "DELETE FROM /foo/"); + let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo WHERE time > 10"); diff --git a/influxdb_influxql_parser/src/internal.rs b/influxdb_influxql_parser/src/internal.rs index f9a2b2dcdc..a18c6f5a10 100644 --- a/influxdb_influxql_parser/src/internal.rs +++ b/influxdb_influxql_parser/src/internal.rs @@ -22,12 +22,10 @@ impl Display for Error { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Syntax { input: _, message } => { - write!(f, "Syntax error: {}", message)?; + write!(f, "Syntax error: {}", message) } - Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind)?, + Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind), } - - Ok(()) } } diff --git a/influxdb_influxql_parser/src/select.rs b/influxdb_influxql_parser/src/select.rs index 111c0c869c..7b9764c182 100644 --- a/influxdb_influxql_parser/src/select.rs +++ b/influxdb_influxql_parser/src/select.rs @@ -1,6 +1,6 @@ use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, order_by_clause, where_clause, - MeasurementNameExpression, OneOrMore, OrderByClause, Parser, + limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause, + OneOrMore, OrderByClause, Parser, QualifiedMeasurementName, }; use 
crate::expression::arithmetic::Expr::Wildcard; use crate::expression::arithmetic::{ @@ -164,8 +164,7 @@ pub fn select_statement(i: &str) -> ParseResult<&str, SelectStatement> { /// Represents a single measurement selection found in a `FROM` clause. #[derive(Clone, Debug, PartialEq)] pub enum MeasurementSelection { - Name(MeasurementNameExpression), - Regex(Regex), + Name(QualifiedMeasurementName), Subquery(Box), } @@ -173,7 +172,6 @@ impl Display for MeasurementSelection { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self::Name(ref name) => fmt::Display::fmt(name, f), - Self::Regex(ref re) => fmt::Display::fmt(re, f), Self::Subquery(ref subquery) => write!(f, "({})", subquery), } } @@ -182,8 +180,7 @@ impl Display for MeasurementSelection { impl Parser for MeasurementSelection { fn parse(i: &str) -> ParseResult<&str, Self> { alt(( - map(measurement_name_expression, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), + map(qualified_measurement_name, MeasurementSelection::Name), map( delimited( preceded(multispace0, char('(')), @@ -812,7 +809,7 @@ mod test { assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("/regex/").unwrap(); - assert_matches!(got, MeasurementSelection::Regex(_)); + assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("(SELECT foo FROM bar)").unwrap(); assert_matches!(got, MeasurementSelection::Subquery(_)); diff --git a/influxdb_influxql_parser/src/show_measurements.rs b/influxdb_influxql_parser/src/show_measurements.rs index 582d562df8..d5277fad9b 100644 --- a/influxdb_influxql_parser/src/show_measurements.rs +++ b/influxdb_influxql_parser/src/show_measurements.rs @@ -2,24 +2,21 @@ //! //! 
[sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-schema/#show-measurements +use crate::common::{ + limit_clause, offset_clause, qualified_measurement_name, where_clause, QualifiedMeasurementName, +}; +use crate::expression::conditional::ConditionalExpression; +use crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; -use nom::character::complete::{char, multispace0, multispace1}; +use nom::character::complete::{multispace0, multispace1}; use nom::combinator::{map, opt, value}; use nom::sequence::tuple; use nom::sequence::{pair, preceded, terminated}; use std::fmt; use std::fmt::Formatter; -use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, where_clause, - MeasurementNameExpression, -}; -use crate::expression::conditional::ConditionalExpression; -use crate::identifier::{identifier, Identifier}; -use crate::string::{regex, Regex}; - /// OnExpression represents an InfluxQL database or retention policy name /// or a wildcard. 
#[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -110,18 +107,16 @@ impl fmt::Display for ShowMeasurementsStatement { #[derive(Clone, Debug, Eq, PartialEq)] pub enum MeasurementExpression { - Equals(MeasurementNameExpression), - Regex(Regex), + Equals(QualifiedMeasurementName), + Regex(QualifiedMeasurementName), } impl fmt::Display for MeasurementExpression { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Self::Equals(ref name) => write!(f, "= {}", name)?, - Self::Regex(ref re) => write!(f, "=~ {}", re)?, - }; - - Ok(()) + Self::Equals(ref name) => write!(f, "= {}", name), + Self::Regex(ref re) => write!(f, "=~ {}", re), + } } } @@ -140,23 +135,15 @@ fn with_measurement_clause(i: &str) -> ParseResult<&str, MeasurementExpression> "expected = or =~", alt(( map( - tuple(( - tag("=~"), - multispace0, - expect("expected regular expression literal", regex), - )), - |(_, _, regex)| MeasurementExpression::Regex(regex), + preceded(pair(tag("=~"), multispace0), qualified_measurement_name), + MeasurementExpression::Regex, ), map( - tuple(( - char('='), - multispace0, - expect( - "expected measurement name or wildcard", - measurement_name_expression, - ), - )), - |(_, _, name)| MeasurementExpression::Equals(name), + preceded( + pair(tag("="), multispace0), + expect("expected measurement name", qualified_measurement_name), + ), + MeasurementExpression::Equals, ), )), ), @@ -200,6 +187,7 @@ pub fn show_measurements(i: &str) -> ParseResult<&str, ShowMeasurementsStatement mod test { use super::*; use crate::assert_expect_error; + use crate::common::MeasurementName; use crate::expression::arithmetic::Expr; use assert_matches::assert_matches; @@ -232,7 +220,7 @@ mod test { ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), measurement_expression: Some(MeasurementExpression::Equals( - MeasurementNameExpression { + QualifiedMeasurementName { database: None, retention_policy: None, name: "bar".into(), @@ -255,7 +243,9 @@ mod test 
{ got, ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), - measurement_expression: Some(MeasurementExpression::Regex(Regex("bar".into()))), + measurement_expression: Some(MeasurementExpression::Regex( + QualifiedMeasurementName::new(MeasurementName::Regex("bar".into())) + )), condition: Some(Expr::Literal(true.into()).into()), limit: None, offset: None @@ -343,33 +333,50 @@ mod test { #[test] fn test_with_measurement_clause() { + use crate::common::MeasurementName::*; + let (_, got) = with_measurement_clause("WITH measurement = foo").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: None, - retention_policy: None, - name: "foo".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new(Name("foo".into()))) ); let (_, got) = with_measurement_clause("WITH measurement =~ /foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Expressions are still valid when whitespace is omitted let (_, got) = with_measurement_clause("WITH measurement=foo..bar").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: Some("foo".into()), - retention_policy: None, - name: "bar".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new_db( + Name("bar".into()), + "foo".into() + )) ); let (_, got) = with_measurement_clause("WITH measurement=~/foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); + + // Quirks of InfluxQL per https://github.com/influxdata/influxdb_iox/issues/5662 + + let (_, got) = with_measurement_clause("WITH measurement =~ foo").unwrap(); + assert_eq!( + got, + 
MeasurementExpression::Regex(QualifiedMeasurementName::new(Name("foo".into()))) + ); + + let (_, got) = with_measurement_clause("WITH measurement = /foo/").unwrap(); + assert_eq!( + got, + MeasurementExpression::Equals(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Fallible cases @@ -379,28 +386,16 @@ mod test { "invalid WITH clause, expected MEASUREMENT" ); - // Must have a regex for equal regex operator - assert_expect_error!( - with_measurement_clause("WITH measurement =~ foo"), - "expected regular expression literal" - ); - // Unsupported regex not equal operator assert_expect_error!( with_measurement_clause("WITH measurement !~ foo"), "expected = or =~" ); - // Must have an identifier for equal operator - assert_expect_error!( - with_measurement_clause("WITH measurement = /foo/"), - "expected measurement name or wildcard" - ); - // Must have an identifier assert_expect_error!( with_measurement_clause("WITH measurement = 1"), - "expected measurement name or wildcard" + "expected measurement name" ); } } diff --git a/influxdb_influxql_parser/src/simple_from_clause.rs b/influxdb_influxql_parser/src/simple_from_clause.rs index f3d7ab0481..07528a9fc2 100644 --- a/influxdb_influxql_parser/src/simple_from_clause.rs +++ b/influxdb_influxql_parser/src/simple_from_clause.rs @@ -1,41 +1,12 @@ -use crate::common::{measurement_name_expression, MeasurementNameExpression, OneOrMore, Parser}; +use crate::common::{ + qualified_measurement_name, MeasurementName, OneOrMore, Parser, QualifiedMeasurementName, +}; use crate::identifier::{identifier, Identifier}; use crate::internal::ParseResult; -use crate::string::{regex, Regex}; -use nom::branch::alt; use nom::bytes::complete::tag_no_case; use nom::character::complete::multispace1; -use nom::combinator::map; use nom::sequence::{pair, preceded}; use std::fmt; -use std::fmt::Formatter; - -/// Represents a single measurement selection found in a `FROM` measurement clause. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum MeasurementSelection { - Name(T), - Regex(Regex), -} - -impl Parser for MeasurementSelection { - fn parse(i: &str) -> ParseResult<&str, Self> { - alt(( - map(T::parse, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), - ))(i) - } -} - -impl fmt::Display for MeasurementSelection { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match self { - Self::Name(ref name) => fmt::Display::fmt(name, f)?, - Self::Regex(ref re) => fmt::Display::fmt(re, f)?, - }; - - Ok(()) - } -} /// Represents a `FROM` clause of a `DELETE` or `SHOW` statement. /// @@ -43,7 +14,7 @@ impl fmt::Display for MeasurementSelection { /// for measurements names. /// /// A `FROM` clause for a number of `SHOW` statements can accept a 3-part measurement name or -pub type FromMeasurementClause = OneOrMore>; +pub type FromMeasurementClause = OneOrMore; fn from_clause(i: &str) -> ParseResult<&str, FromMeasurementClause> { preceded( @@ -54,9 +25,9 @@ fn from_clause(i: &str) -> ParseResult<&str, FromMeasu )(i) } -impl Parser for MeasurementNameExpression { +impl Parser for QualifiedMeasurementName { fn parse(i: &str) -> ParseResult<&str, Self> { - measurement_name_expression(i) + qualified_measurement_name(i) } } @@ -68,10 +39,9 @@ impl Parser for MeasurementNameExpression { /// It is defined by the following EBNF notation: /// /// ```text -/// from_clause ::= "FROM" measurement_selection ("," measurement_selection)* -/// measurement_selection ::= measurement +/// from_clause ::= "FROM" qualified_measurement_name ("," qualified_measurement_name)* /// -/// measurement ::= measurement_name | +/// qualified_measurement_name ::= measurement_name | /// ( policy_name "." measurement_name ) | /// ( db_name "." policy_name? "." 
measurement_name ) /// @@ -92,7 +62,7 @@ impl Parser for MeasurementNameExpression { /// ```text /// FROM foo, /bar/, some_database..foo, some_retention_policy.foobar /// ``` -pub type ShowFromClause = FromMeasurementClause; +pub type ShowFromClause = FromMeasurementClause; /// Parse a `FROM` clause for various `SHOW` statements. pub fn show_from_clause(i: &str) -> ParseResult<&str, ShowFromClause> { @@ -106,7 +76,7 @@ impl Parser for Identifier { } /// Represents a `FROM` clause for a `DELETE` statement. -pub type DeleteFromClause = FromMeasurementClause; +pub type DeleteFromClause = FromMeasurementClause; /// Parse a `FROM` clause for a `DELETE` statement. pub fn delete_from_clause(i: &str) -> ParseResult<&str, DeleteFromClause> { @@ -119,49 +89,52 @@ mod test { #[test] fn test_show_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = show_from_clause("FROM c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new("c".into()))]) + ShowFromClause::new(vec![QualifiedMeasurementName::new(Name("c".into()))]) ); let (_, from) = show_from_clause("FROM a..c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db( + Name("c".into()), "a".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM a.b.c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db_rp( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db_rp( + Name("c".into()), "a".into(), "b".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM /reg/").unwrap(); - assert_eq!(from, ShowFromClause::new(vec![Regex("reg".into())])); + assert_eq!( + from, + ShowFromClause::new(vec![QualifiedMeasurementName::new(Regex("reg".into()))]) + ); let (_, from) = show_from_clause("FROM c, /reg/").unwrap(); assert_eq!( from, 
ShowFromClause::new(vec![ - Name(MeasurementNameExpression::new("c".into())), - Regex("reg".into()) + QualifiedMeasurementName::new(Name("c".into())), + QualifiedMeasurementName::new(Regex("reg".into())) ]) ); } #[test] fn test_delete_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = delete_from_clause("FROM c").unwrap(); assert_eq!(from, DeleteFromClause::new(vec![Name("c".into())])); From 07361f5b4076db1f68067a6efb2534bc02b70626 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 Oct 2022 13:25:26 +0000 Subject: [PATCH 22/40] chore(deps): Bump tracing from 0.1.36 to 0.1.37 (#5811) Bumps [tracing](https://github.com/tokio-rs/tracing) from 0.1.36 to 0.1.37. - [Release notes](https://github.com/tokio-rs/tracing/releases) - [Commits](https://github.com/tokio-rs/tracing/compare/tracing-0.1.36...tracing-0.1.37) --- updated-dependencies: - dependency-name: tracing dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 488aae235c..d94f4d4dae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5418,9 +5418,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", "log", @@ -5431,9 +5431,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -5442,9 +5442,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", From 02e3ab125c657fb40420b9058b51962315899560 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 Oct 2022 13:35:03 +0000 Subject: [PATCH 23/40] chore(deps): Bump syn from 1.0.101 to 1.0.102 (#5813) Bumps [syn](https://github.com/dtolnay/syn) from 1.0.101 to 1.0.102. 
- [Release notes](https://github.com/dtolnay/syn/releases) - [Commits](https://github.com/dtolnay/syn/compare/1.0.101...1.0.102) --- updated-dependencies: - dependency-name: syn dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d94f4d4dae..31157a9858 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3654,9 +3654,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ "unicode-ident", ] @@ -4937,9 +4937,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" dependencies = [ "proc-macro2", "quote", From 8013781ac2e5ea68992f79df5b7d60e1a3a5dc98 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 7 Oct 2022 14:05:54 -0400 Subject: [PATCH 24/40] feat: rewrite missing column references to NULL (#5818) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- predicate/src/lib.rs | 1 - predicate/src/rpc_predicate.rs | 66 ++++++++++++- predicate/src/rpc_predicate/column_rewrite.rs | 99 +++++++++++++++++++ predicate/src/rpc_predicate/field_rewrite.rs | 4 +- predicate/src/{ => 
rpc_predicate}/rewrite.rs | 0 query_tests/src/influxrpc/read_filter.rs | 8 +- service_grpc_influxrpc/src/expr.rs | 1 + service_grpc_influxrpc/src/service.rs | 7 +- 8 files changed, 172 insertions(+), 14 deletions(-) create mode 100644 predicate/src/rpc_predicate/column_rewrite.rs rename predicate/src/{ => rpc_predicate}/rewrite.rs (100%) diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index 03b52e521d..633a345e50 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -12,7 +12,6 @@ pub mod delete_expr; pub mod delete_predicate; -pub mod rewrite; pub mod rpc_predicate; use arrow::{ diff --git a/predicate/src/rpc_predicate.rs b/predicate/src/rpc_predicate.rs index 2836a6e57e..833dfdc063 100644 --- a/predicate/src/rpc_predicate.rs +++ b/predicate/src/rpc_predicate.rs @@ -1,19 +1,23 @@ +mod column_rewrite; mod field_rewrite; mod measurement_rewrite; +mod rewrite; mod value_rewrite; -use crate::{rewrite, Predicate}; +use crate::Predicate; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::lit; use datafusion::logical_plan::{ - Column, Expr, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, + Column, Expr, ExprRewritable, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, }; +use observability_deps::tracing::{debug, trace}; use schema::Schema; use std::collections::BTreeSet; use std::sync::Arc; +use self::column_rewrite::MissingColumnRewriter; use self::field_rewrite::FieldProjectionRewriter; use self::measurement_rewrite::rewrite_measurement_references; use self::value_rewrite::rewrite_field_value_references; @@ -187,6 +191,7 @@ fn normalize_predicate( let mut predicate = predicate.clone(); let mut field_projections = FieldProjectionRewriter::new(Arc::clone(&schema)); + let mut missing_columums = MissingColumnRewriter::new(Arc::clone(&schema)); let mut field_value_exprs = vec![]; @@ -194,24 +199,38 @@ fn normalize_predicate( .exprs 
.into_iter() .map(|e| { - rewrite_measurement_references(table_name, e) + debug!(?e, "rewriting expr"); + + let e = rewrite_measurement_references(table_name, e) + .map(|e| log_rewrite(e, "rewrite_measurement_references")) // Rewrite any references to `_value = some_value` to literal true values. // Keeps track of these expressions, which can then be used to // augment field projections with conditions using `CASE` statements. .and_then(|e| rewrite_field_value_references(&mut field_value_exprs, e)) + .map(|e| log_rewrite(e, "rewrite_field_value_references")) // Rewrite any references to `_field` with a literal // and keep track of referenced field names to add to // the field column projection set. .and_then(|e| field_projections.rewrite_field_exprs(e)) + .map(|e| log_rewrite(e, "field_projections")) + // remove references to columns that don't exist in this schema + .and_then(|e| e.rewrite(&mut missing_columums)) + .map(|e| log_rewrite(e, "missing_columums")) // apply IOx specific rewrites (that unlock other simplifications) .and_then(rewrite::rewrite) - // Call the core DataFusion simplification logic + .map(|e| log_rewrite(e, "rewrite")) + // Call DataFusion simplification logic .and_then(|e| { let adapter = SimplifyAdapter::new(schema.as_ref()); // simplify twice to ensure "full" cleanup e.simplify(&adapter)?.simplify(&adapter) }) + .map(|e| log_rewrite(e, "simplify_expr")) .and_then(rewrite::simplify_predicate) + .map(|e| log_rewrite(e, "simplify_expr")); + + debug!(?e, "rewritten expr"); + e }) // Filter out literal true so is_empty works correctly .filter(|f| match f { @@ -227,6 +246,11 @@ fn normalize_predicate( field_projections.add_to_predicate(predicate) } +fn log_rewrite(expr: Expr, description: &str) -> Expr { + trace!(?expr, %description, "After rewrite"); + expr +} + struct SimplifyAdapter<'a> { schema: &'a Schema, execution_props: ExecutionProps, @@ -290,9 +314,27 @@ mod tests { use super::*; use arrow::datatypes::DataType; - use 
datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + scalar::ScalarValue, + }; use test_helpers::assert_contains; + #[test] + fn test_normalize_predicate_coerced() { + let schema = schema(); + let predicate = normalize_predicate( + "table", + Arc::clone(&schema), + &Predicate::new().with_expr(col("t1").eq(lit("f1"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(col("t1").eq(lit("f1"))); + + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite() { let predicate = normalize_predicate( @@ -336,6 +378,20 @@ mod tests { assert_eq!(predicate, expected); } + #[test] + fn test_normalize_predicate_field_non_tag() { + // should treat + let predicate = normalize_predicate( + "table", + schema(), + &Predicate::new().with_expr(col("not_a_tag").eq(lit("blarg"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(lit(ScalarValue::Boolean(None))); + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite_multi_field_unsupported() { let err = normalize_predicate( diff --git a/predicate/src/rpc_predicate/column_rewrite.rs b/predicate/src/rpc_predicate/column_rewrite.rs new file mode 100644 index 0000000000..7a29331fca --- /dev/null +++ b/predicate/src/rpc_predicate/column_rewrite.rs @@ -0,0 +1,99 @@ +use std::sync::Arc; + +use datafusion::{ + error::Result as DataFusionResult, logical_plan::ExprRewriter, prelude::*, scalar::ScalarValue, +}; +use schema::Schema; + +/// Logic for rewriting expressions from influxrpc that reference non +/// existent columns to NULL +#[derive(Debug)] +pub(crate) struct MissingColumnRewriter { + /// The input schema + schema: Arc, +} + +impl MissingColumnRewriter { + /// Create a new [`MissingColumnRewriter`] targeting the given schema + pub(crate) fn new(schema: Arc) -> Self { + Self { schema } + } + + fn column_exists(&self, col: &Column) -> DataFusionResult { + // todo a real error here (rpc_predicates shouldn't have 
table/relation qualifiers) + assert!(col.relation.is_none()); + + if self.schema.find_index_of(&col.name).is_some() { + Ok(true) + } else { + Ok(false) + } + } +} + +fn lit_null() -> Expr { + lit(ScalarValue::Utf8(None)) +} + +impl ExprRewriter for MissingColumnRewriter { + fn mutate(&mut self, expr: Expr) -> DataFusionResult { + Ok(match expr { + Expr::Column(col) if !self.column_exists(&col)? => lit_null(), + expr => expr, + }) + } +} + +#[cfg(test)] +mod tests { + use datafusion::{arrow::datatypes::DataType, logical_plan::ExprRewritable}; + use schema::SchemaBuilder; + + use super::*; + + #[test] + fn all_columns_defined_no_rewrite() { + // t1 = "foo" + let expr = col("t1").eq(lit("foo")); + assert_eq!(rewrite(expr.clone()), expr); + + // f1 > 1.0 + let expr = col("f1").gt(lit(1.0)); + assert_eq!(rewrite(expr.clone()), expr); + } + + #[test] + fn all_columns_not_defined() { + // non_defined = "foo" --> NULL = "foo" + let expr = col("non_defined").eq(lit("foo")); + let expected = lit_null().eq(lit("foo")); + assert_eq!(rewrite(expr), expected); + + // non_defined = 1.4 --> NULL = 1.4 + let expr = col("non_defined").eq(lit(1.4)); + // No type is inferred so this is a literal null string (even though it maybe should be a literal float) + let expected = lit_null().eq(lit(1.4)); + assert_eq!(rewrite(expr), expected); + } + + #[test] + fn some_columns_not_defined() { + // t1 = "foo" AND non_defined = "bar" --> t1 = "foo" and NULL = "bar" + let expr = col("t1") + .eq(lit("foo")) + .and(col("non_defined").eq(lit("bar"))); + let expected = col("t1").eq(lit("foo")).and(lit_null().eq(lit("bar"))); + assert_eq!(rewrite(expr), expected); + } + + fn rewrite(expr: Expr) -> Expr { + let schema = SchemaBuilder::new() + .tag("t1") + .field("f1", DataType::Int64) + .build() + .unwrap(); + + let mut rewriter = MissingColumnRewriter::new(Arc::new(schema)); + expr.rewrite(&mut rewriter).unwrap() + } +} diff --git a/predicate/src/rpc_predicate/field_rewrite.rs 
b/predicate/src/rpc_predicate/field_rewrite.rs index 3cccfa219a..3f983a28e7 100644 --- a/predicate/src/rpc_predicate/field_rewrite.rs +++ b/predicate/src/rpc_predicate/field_rewrite.rs @@ -55,8 +55,8 @@ impl FieldProjectionRewriter { } } - // Rewrites the predicate. See the description on - // [`FieldProjectionRewriter`] for more details. + /// Rewrites the predicate. See the description on + /// [`FieldProjectionRewriter`] for more details. pub(crate) fn rewrite_field_exprs(&mut self, expr: Expr) -> DataFusionResult { // for predicates like `A AND B AND C` // rewrite `A`, `B` and `C` separately and put them back together diff --git a/predicate/src/rewrite.rs b/predicate/src/rpc_predicate/rewrite.rs similarity index 100% rename from predicate/src/rewrite.rs rename to predicate/src/rpc_predicate/rewrite.rs diff --git a/query_tests/src/influxrpc/read_filter.rs b/query_tests/src/influxrpc/read_filter.rs index c0485f42aa..71cf9495c0 100644 --- a/query_tests/src/influxrpc/read_filter.rs +++ b/query_tests/src/influxrpc/read_filter.rs @@ -205,12 +205,12 @@ async fn test_read_filter_invalid_predicate_case() { #[tokio::test] async fn test_read_filter_unknown_column_in_predicate() { let predicate = Predicate::new() - // mystery_region is not a real column, so this predicate is + // mystery_region and bar are not real columns, so this predicate is // invalid but IOx should be able to handle it (and produce no results) .with_expr( - col("baz") - .eq(lit(4i32)) - .or(col("bar").and(col("mystery_region").gt(lit(5i32)))), + col("baz").eq(lit(4i32)).or(col("bar") + .eq(lit("baz")) + .and(col("mystery_region").gt(lit(5i32)))), ); let predicate = InfluxRpcPredicate::new(None, predicate); diff --git a/service_grpc_influxrpc/src/expr.rs b/service_grpc_influxrpc/src/expr.rs index 58a5806b4e..8da9cebc67 100644 --- a/service_grpc_influxrpc/src/expr.rs +++ b/service_grpc_influxrpc/src/expr.rs @@ -906,6 +906,7 @@ mod tests { let schema = SchemaBuilder::new() .tag("t1") .tag("t2") + 
.tag("host") .field("foo", DataType::Int64) .field("bar", DataType::Int64) .build() diff --git a/service_grpc_influxrpc/src/service.rs b/service_grpc_influxrpc/src/service.rs index 8ad7bbbbfb..734f856b88 100644 --- a/service_grpc_influxrpc/src/service.rs +++ b/service_grpc_influxrpc/src/service.rs @@ -1802,11 +1802,13 @@ mod tests { // Note multiple tables / measureemnts: let chunk0 = TestChunk::new("m1") .with_id(0) + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2"); let chunk1 = TestChunk::new("m2") .with_id(1) + .with_tag_column("state") .with_tag_column("k3") .with_tag_column("k4"); @@ -1826,7 +1828,7 @@ mod tests { }; let actual_tag_keys = fixture.storage_client.tag_keys(request).await.unwrap(); - let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!(actual_tag_keys, expected_tag_keys,); @@ -1898,6 +1900,7 @@ mod tests { .with_tag_column("k0"); let chunk1 = TestChunk::new("m4") + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2") .with_tag_column("k3") @@ -1927,7 +1930,7 @@ mod tests { .measurement_tag_keys(request) .await .unwrap(); - let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!( actual_tag_keys, expected_tag_keys, From d8a318eb57242ed8f0b3f667d457a8cb4136a21d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 7 Oct 2022 16:34:00 -0400 Subject: [PATCH 25/40] docs: Tweak local run guide (#5787) Update the instructions on how to run IOx locally Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- docs/underground_guide.md | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/underground_guide.md b/docs/underground_guide.md index 201dd5e44b..c087bcce88 100644 --- 
a/docs/underground_guide.md +++ b/docs/underground_guide.md @@ -15,17 +15,25 @@ developers. Build IOx for release with pprof: ```shell +cd influxdb_iox cargo build --release --features=pprof ``` -## Step 2: Start redpanda and postgres +You can also install the `influxdb_iox` command locally via -Now, start up redpanda and postgres locally in docker containers: +```shell +cd influxdb_iox +cargo install --path influxdb_iox +``` + +## Step 2: Start kafka and postgres + +Now, start up kafka and postgres locally in docker containers: ```shell # get rskafka from https://github.com/influxdata/rskafka cd rskafka -# Run redpanda on localhost:9010 -docker-compose -f docker-compose-redpanda.yml up & +# Run kafka on localhost:9010 +docker-compose -f docker-compose-kafka.yml up & # now run postgres docker run -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust postgres & ``` @@ -136,8 +144,8 @@ INFLUXDB_IOX_GRPC_BIND_ADDR=localhost:8084 \ INFLUXDB_IOX_WRITE_BUFFER_TYPE=kafka \ INFLUXDB_IOX_WRITE_BUFFER_ADDR=localhost:9010 \ xINFLUXDB_IOX_WRITE_BUFFER_AUTO_CREATE_TOPICS=10 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_START=0 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_START=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_END=0 \ INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES=5000000000 \ INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES=4000000000 \ INFLUXDB_IOX_CATALOG_DSN=postgres://postgres@localhost:5432/postgres \ @@ -151,6 +159,11 @@ LOG_FILTER=info \ # Step 5: Ingest data +You can load data using the influxdb_iox client: +```shell +influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp +``` + Now you can post data to `http://localhost:8080` with your favorite load generating tool My favorite is https://github.com/alamb/low_card @@ -171,3 +184,17 @@ posting fairly large requests (necessitating the # Step 6: Profile See [`profiling.md`](./profiling.md). 
+ + +# Step 7: Clean up local state + +If you find yourself needing to clean up postgres / kafka state use these commands: +```shell +docker ps -a -q | xargs docker stop +docker rm rskafka_proxy_1 +docker rm rskafka_kafka-0_1 +docker rm rskafka_kafka-1_1 +docker rm rskafka_kafka-2_1 +docker rm rskafka_zookeeper_1 +docker volume rm rskafka_kafka_0_data rskafka_kafka_1_data rskafka_kafka_2_data rskafka_zookeeper_data +``` From 2277fcf08ac487c73e5765dbcff7405058061fad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 01:42:37 +0000 Subject: [PATCH 26/40] chore(deps): Bump serde_json from 1.0.85 to 1.0.86 Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.85 to 1.0.86. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/v1.0.85...v1.0.86) --- updated-dependencies: - dependency-name: serde_json dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- clap_blocks/Cargo.toml | 2 +- import/Cargo.toml | 2 +- influxdb2_client/Cargo.toml | 2 +- influxdb_iox/Cargo.toml | 2 +- iox_data_generator/Cargo.toml | 2 +- ioxd_common/Cargo.toml | 2 +- predicate/Cargo.toml | 2 +- service_grpc_flight/Cargo.toml | 2 +- service_grpc_influxrpc/Cargo.toml | 2 +- write_summary/Cargo.toml | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 31157a9858..f91ce69a81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4396,9 +4396,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" dependencies = [ "itoa 1.0.3", "ryu", diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index 679f65a96f..99e429a067 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -14,7 +14,7 @@ metric = { path = "../metric" } object_store = "0.5.0" observability_deps = { path = "../observability_deps" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tempfile = "3.1.0" trace = { path = "../trace" } diff --git a/import/Cargo.toml b/import/Cargo.toml index c773711a23..bdd07e008d 100644 --- a/import/Cargo.toml +++ b/import/Cargo.toml @@ -17,7 +17,7 @@ object_store = { version = "0.5.0", features = ["aws"] } observability_deps = { path = "../observability_deps" } schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.82" +serde_json = "1.0.86" thiserror = "1.0.37" tokio = { version = "1.21" } tonic = { version = "0.8" } diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml index 060445779b..b3858aac87 100644 --- a/influxdb2_client/Cargo.toml +++ 
b/influxdb2_client/Cargo.toml @@ -9,7 +9,7 @@ bytes = "1.2" futures = { version = "0.3", default-features = false } reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" url = "2.3.1" uuid = { version = "1", features = ["v4"] } diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index ec1392882d..a689df6155 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -57,7 +57,7 @@ libc = { version = "0.2" } num_cpus = "1.13.0" once_cell = { version = "1.15.0", features = ["parking_lot"] } rustyline = { version = "10.0", default-features = false } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" thiserror = "1.0.37" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index ec62aad243..adc8b85949 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -22,7 +22,7 @@ rand = { version = "0.8.3", features = ["small_rng"] } regex = "1.6" schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } toml = "0.5.9" diff --git a/ioxd_common/Cargo.toml b/ioxd_common/Cargo.toml index 1a89ba0cd5..b293640285 100644 --- a/ioxd_common/Cargo.toml +++ b/ioxd_common/Cargo.toml @@ -40,7 +40,7 @@ log = "0.4" parking_lot = "0.12" reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" serde_urlencoded = "0.7.0" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } diff --git a/predicate/Cargo.toml 
b/predicate/Cargo.toml index 743cc8301b..e1d423255f 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -13,7 +13,7 @@ itertools = "0.10" observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" sqlparser = "0.25.0" workspace-hack = { path = "../workspace-hack"} diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index 172e89b560..b9999514e1 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -26,7 +26,7 @@ futures = "0.3" pin-project = "1.0" prost = "0.11" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tonic = "0.8" diff --git a/service_grpc_influxrpc/Cargo.toml b/service_grpc_influxrpc/Cargo.toml index ea4169e05c..00c5df645f 100644 --- a/service_grpc_influxrpc/Cargo.toml +++ b/service_grpc_influxrpc/Cargo.toml @@ -26,7 +26,7 @@ pin-project = "1.0" prost = "0.11" regex = "1.6.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tokio-stream = { version = "0.1", features = ["net"] } diff --git a/write_summary/Cargo.toml b/write_summary/Cargo.toml index d3313a19ee..d303ad5b50 100644 --- a/write_summary/Cargo.toml +++ b/write_summary/Cargo.toml @@ -9,7 +9,7 @@ data_types = { path = "../data_types" } dml = { path = "../dml" } generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" workspace-hack = { path = "../workspace-hack"} From ab78f99ab2e3de07d210d115135b10720fdec0d2 Mon Sep 17 00:00:00 2001 
From: Dom Dwyer Date: Mon, 10 Oct 2022 14:32:48 +0200 Subject: [PATCH 27/40] refactor: eager background task abort Changes the get() code path to abort the background load task when the caller will resolve the sort key. Note that an aborted future will leave the DeferredSortKey without a background task to fetch the key, and the next caller will have to query the catalog. Given the rarity of aborted futures, and desire to minimise catalog load, this seems like a decent trade-off. This commit also documents the many-readers eager loading problem. --- ingester/src/data/partition/resolver/sort_key.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ingester/src/data/partition/resolver/sort_key.rs b/ingester/src/data/partition/resolver/sort_key.rs index c0c5555963..61b1977a85 100644 --- a/ingester/src/data/partition/resolver/sort_key.rs +++ b/ingester/src/data/partition/resolver/sort_key.rs @@ -79,7 +79,8 @@ impl DeferredSortKey { tokio::time::sleep(wait_for).await; // Fetch the sort key from the catalog let v = fetch(partition_id, &*catalog, &backoff_config).await; - // And attempt to + // And attempt to update the value container, if it hasn't + // already resolved let mut state = value.lock(); *state = match *state { State::Unresolved => State::Resolved(v), @@ -102,6 +103,17 @@ impl DeferredSortKey { /// If the [`SortKey`] was pre-fetched in the background, it is returned /// immediately. If the [`SortKey`] has not yet been resolved, this call /// blocks while it is read from the [`Catalog`]. + /// + /// # Concurrency + /// + /// If this method requires resolving the [`SortKey`], N concurrent callers + /// will cause N queries against the catalog. + /// + /// # Await Safety + /// + /// Cancelling the future returned by calling [`Self::get()`] before + /// completion will leave [`Self`] without a background task. The next call + /// to [`Self::get()`] will incur a catalog query (see concurrency above). 
pub(crate) async fn get(&self) -> Option { { let state = self.value.lock(); @@ -114,11 +126,11 @@ impl DeferredSortKey { // Otherwise resolve the value immediately, aborting the background // task. + self.handle.abort(); let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await; { let mut state = self.value.lock(); - self.handle.abort(); *state = State::Resolved(sort_key.clone()); } From 4518bd49d1df0ccdda4825ed991bcb0660f10b2e Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 10 Oct 2022 14:37:34 +0200 Subject: [PATCH 28/40] test: constify duration seconds --- ingester/src/data/partition/resolver/sort_key.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ingester/src/data/partition/resolver/sort_key.rs b/ingester/src/data/partition/resolver/sort_key.rs index 61b1977a85..36e3ee5f1a 100644 --- a/ingester/src/data/partition/resolver/sort_key.rs +++ b/ingester/src/data/partition/resolver/sort_key.rs @@ -189,13 +189,15 @@ mod tests { // A test that (most likely) exercises the "read on demand" code path. // // The background task is configured to run some time between now, and - // 10,000 hours in the future - it most likely doesn't get to complete + // 10,000,000 seconds in the future - it most likely doesn't get to complete // before the get() call is issued. // // If this test flakes, it is POSSIBLE but UNLIKELY that the background task // has completed and the get() call reads a pre-fetched value. 
#[tokio::test] async fn test_read_demand() { + const LONG_LONG_TIME: Duration = Duration::from_secs(10_000_000); + let metrics = Arc::new(metric::Registry::default()); let backoff_config = BackoffConfig::default(); let catalog: Arc = @@ -237,7 +239,7 @@ mod tests { // Read the updated sort key let fetched = DeferredSortKey::new( partition_id, - Duration::from_secs(10_000), + LONG_LONG_TIME, Arc::clone(&catalog), backoff_config, ) From f8bc4d8881659bb7ca618b4002c971ceb4b5586c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:06:59 +0000 Subject: [PATCH 29/40] chore(deps): Bump libc from 0.2.134 to 0.2.135 (#5822) Bumps [libc](https://github.com/rust-lang/libc) from 0.2.134 to 0.2.135. - [Release notes](https://github.com/rust-lang/libc/releases) - [Commits](https://github.com/rust-lang/libc/compare/0.2.134...0.2.135) --- updated-dependencies: - dependency-name: libc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f91ce69a81..4fa9164ab1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2665,9 +2665,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.134" +version = "0.2.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" [[package]] name = "libloading" From 0eac3812c810553b8ef746fbd9e477d13a85e7b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:15:29 +0000 Subject: [PATCH 30/40] chore(deps): Bump snafu from 0.7.1 to 0.7.2 (#5821) Bumps [snafu](https://github.com/shepmaster/snafu) from 0.7.1 to 0.7.2. - [Release notes](https://github.com/shepmaster/snafu/releases) - [Changelog](https://github.com/shepmaster/snafu/blob/main/CHANGELOG.md) - [Commits](https://github.com/shepmaster/snafu/compare/0.7.1...0.7.2) --- updated-dependencies: - dependency-name: snafu dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- influxrpc_parser/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4fa9164ab1..1d9a3bbb1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4659,9 +4659,9 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2" +checksum = "dd726aec4ebad65756394ff89a9b9598793d4e30121cd71690244c1e497b3aee" dependencies = [ "doc-comment", "snafu-derive", @@ -4669,9 +4669,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5" +checksum = "712529e9b0b014eabaa345b38e06032767e3dc393e8b017e853b1d7247094e74" dependencies = [ "heck", "proc-macro2", diff --git a/influxrpc_parser/Cargo.toml b/influxrpc_parser/Cargo.toml index 7a886cf4e7..80a8496db6 100644 --- a/influxrpc_parser/Cargo.toml +++ b/influxrpc_parser/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] sqlparser = "0.25.0" -snafu = "0.7.1" +snafu = "0.7.2" generated_types = { path = "../generated_types" } workspace-hack = { path = "../workspace-hack"} \ No newline at end of file From 97c6e0f8ceac3816dfa1cb0fbe690c911a7ba463 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Tue, 4 Oct 2022 17:34:43 +0200 Subject: [PATCH 31/40] refactor: use TableName, not Arc Adds a type wrapper TableName, internally an Arc to leverage the type system instead of passing around untyped strings. 
--- ingester/src/data.rs | 24 ++++++----- ingester/src/data/namespace.rs | 41 +++++++++++------- ingester/src/data/partition.rs | 6 ++- ingester/src/data/partition/buffer.rs | 8 ++-- ingester/src/data/partition/resolver/cache.rs | 7 ++- .../src/data/partition/resolver/catalog.rs | 11 +++-- ingester/src/data/partition/resolver/mock.rs | 6 +-- ingester/src/data/partition/resolver/trait.rs | 18 +++----- ingester/src/data/table.rs | 43 ++++++++++++++++--- ingester/src/handler.rs | 10 +++-- ingester/src/querier_handler.rs | 7 +-- ingester/src/query.rs | 6 +-- ingester/src/test_util.rs | 8 ++-- 13 files changed, 122 insertions(+), 73 deletions(-) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 5bbd422ca7..4d89b8f976 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -34,6 +34,7 @@ pub mod table; use self::{ partition::{resolver::PartitionProvider, PartitionStatus}, shard::ShardData, + table::TableName, }; #[cfg(test)] @@ -449,16 +450,17 @@ impl Persister for IngesterData { .record(file_size as u64); // and remove the persisted data from memory + let table_name = TableName::from(&partition_info.table_name); namespace .mark_persisted( - &partition_info.table_name, + &table_name, &partition_info.partition.partition_key, iox_metadata.max_sequence_number, ) .await; debug!( ?partition_id, - table_name=%partition_info.table_name, + %table_name, partition_key=%partition_info.partition.partition_key, max_sequence_number=%iox_metadata.max_sequence_number.get(), "marked partition as persisted" @@ -816,8 +818,8 @@ mod tests { let (table_id, partition_id) = { let sd = data.shards.get(&shard1.id).unwrap(); let n = sd.namespace(&"foo".into()).unwrap(); - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("mem").is_some()); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"mem".into()).is_some()); let mem_table = mem_table.write().await; let p = mem_table .get_partition_by_key(&"1970-01-01".into()) @@ -961,8 
+963,8 @@ mod tests { let partition_id; let table_id; { - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("cpu").is_some()); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"cpu".into()).is_some()); let mem_table = mem_table.write().await; table_id = mem_table.table_id(); @@ -1077,7 +1079,7 @@ mod tests { .unwrap(); assert_eq!(partition_info.partition.sort_key, vec!["time"]); - let mem_table = n.table_data("mem").unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); let mem_table = mem_table.read().await; // verify that the parquet_max_sequence_number got updated @@ -1372,7 +1374,7 @@ mod tests { .await .unwrap(); { - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( @@ -1388,7 +1390,7 @@ mod tests { .await .unwrap(); - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( @@ -1481,7 +1483,7 @@ mod tests { .unwrap() .namespace(&namespace.name.clone().into()) .unwrap() - .table_data("mem") + .table_data(&"mem".into()) .unwrap() .read() .await @@ -1513,7 +1515,7 @@ mod tests { .unwrap() .namespace(&namespace.name.into()) .unwrap() - .table_data("mem") + .table_data(&"mem".into()) .unwrap() .read() .await diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 4b67e9642c..94013b36c8 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -13,20 +13,23 @@ use write_summary::ShardProgress; #[cfg(test)] use super::triggers::TestTriggers; -use super::{partition::resolver::PartitionProvider, table::TableData}; +use super::{ + partition::resolver::PartitionProvider, + table::{TableData, TableName}, +}; use 
crate::lifecycle::LifecycleHandle; /// A double-referenced map where [`TableData`] can be looked up by name, or ID. #[derive(Debug, Default)] struct DoubleRef { // TODO(4880): this can be removed when IDs are sent over the wire. - by_name: HashMap, Arc>>, + by_name: HashMap>>, by_id: HashMap>>, } impl DoubleRef { fn insert(&mut self, t: TableData) -> Arc> { - let name = Arc::clone(t.table_name()); + let name = t.table_name().clone(); let id = t.table_id(); let t = Arc::new(tokio::sync::RwLock::new(t)); @@ -35,7 +38,7 @@ impl DoubleRef { t } - fn by_name(&self, name: &str) -> Option>> { + fn by_name(&self, name: &TableName) -> Option>> { self.by_name.get(name).map(Arc::clone) } @@ -196,6 +199,7 @@ impl NamespaceData { .clone(); for (t, b) in write.into_tables() { + let t = TableName::from(t); let table_data = match self.table_data(&t) { Some(t) => t, None => self.insert_table(&t, catalog).await?, @@ -221,10 +225,13 @@ impl NamespaceData { Ok(pause_writes) } DmlOperation::Delete(delete) => { - let table_name = delete.table_name().context(super::TableNotPresentSnafu)?; - let table_data = match self.table_data(table_name) { + let table_name = delete + .table_name() + .context(super::TableNotPresentSnafu)? 
+ .into(); + let table_data = match self.table_data(&table_name) { Some(t) => t, - None => self.insert_table(table_name, catalog).await?, + None => self.insert_table(&table_name, catalog).await?, }; let mut table_data = table_data.write().await; @@ -244,7 +251,7 @@ impl NamespaceData { #[cfg(test)] // Only used in tests pub(crate) async fn snapshot( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, ) -> Option<( Vec>, @@ -270,7 +277,7 @@ impl NamespaceData { #[cfg(test)] // Only used in tests pub(crate) async fn snapshot_to_persisting( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, ) -> Option> { if let Some(table_data) = self.table_data(table_name) { @@ -287,7 +294,7 @@ impl NamespaceData { /// Gets the buffered table data pub(crate) fn table_data( &self, - table_name: &str, + table_name: &TableName, ) -> Option>> { let t = self.tables.read(); t.by_name(table_name) @@ -305,7 +312,7 @@ impl NamespaceData { /// Inserts the table or returns it if it happens to be inserted by some other thread async fn insert_table( &self, - table_name: &str, + table_name: &TableName, catalog: &Arc, ) -> Result>, super::Error> { let mut repos = catalog.repositories().await; @@ -314,7 +321,9 @@ impl NamespaceData { .get_table_persist_info(self.shard_id, self.namespace_id, table_name) .await .context(super::CatalogSnafu)? - .context(super::TableNotFoundSnafu { table_name })?; + .ok_or_else(|| super::Error::TableNotFound { + table_name: table_name.to_string(), + })?; let mut t = self.tables.write(); @@ -326,7 +335,7 @@ impl NamespaceData { // Insert the table and then return a ref to it. t.insert(TableData::new( info.table_id, - table_name, + table_name.clone(), self.shard_id, self.namespace_id, info.tombstone_max_sequence_number, @@ -341,7 +350,7 @@ impl NamespaceData { /// data buffer. 
pub(super) async fn mark_persisted( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, sequence_number: SequenceNumber, ) { @@ -479,7 +488,7 @@ mod tests { assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME); // Assert the namespace does not contain the test data - assert!(ns.table_data(TABLE_NAME).is_none()); + assert!(ns.table_data(&TABLE_NAME.into()).is_none()); assert!(ns.table_id(table_id).is_none()); // Write some test data @@ -499,7 +508,7 @@ mod tests { .expect("buffer op should succeed"); // Both forms of referencing the table should succeed - assert!(ns.table_data(TABLE_NAME).is_some()); + assert!(ns.table_data(&TABLE_NAME.into()).is_some()); assert!(ns.table_id(table_id).is_some()); // And the table counter metric should increase diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 7707f8301f..b35a2a6d31 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -19,6 +19,8 @@ use self::{ }; use crate::{data::query_dedup::query, query::QueryableBatch}; +use super::table::TableName; + mod buffer; pub mod resolver; @@ -180,7 +182,7 @@ pub struct PartitionData { namespace_id: NamespaceId, table_id: TableId, /// The name of the table this partition is part of. 
- table_name: Arc, + table_name: TableName, pub(super) data: DataBuffer, @@ -198,7 +200,7 @@ impl PartitionData { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, sort_key: SortKeyState, max_persisted_sequence_number: Option, ) -> Self { diff --git a/ingester/src/data/partition/buffer.rs b/ingester/src/data/partition/buffer.rs index 739da735fa..3195b9c74d 100644 --- a/ingester/src/data/partition/buffer.rs +++ b/ingester/src/data/partition/buffer.rs @@ -9,6 +9,8 @@ use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; +use crate::data::table::TableName; + use super::{PersistingBatch, QueryableBatch, SnapshotBatch}; /// Data of an IOx partition split into batches @@ -109,7 +111,7 @@ impl DataBuffer { /// Both buffer and snapshots will be empty after this pub(super) fn snapshot_to_queryable_batch( &mut self, - table_name: &Arc, + table_name: &TableName, partition_id: PartitionId, tombstone: Option, ) -> Option { @@ -129,7 +131,7 @@ impl DataBuffer { None } else { Some(QueryableBatch::new( - Arc::clone(table_name), + table_name.clone(), partition_id, data, tombstones, @@ -164,7 +166,7 @@ impl DataBuffer { shard_id: ShardId, table_id: TableId, partition_id: PartitionId, - table_name: &Arc, + table_name: &TableName, ) -> Option> { if self.persisting.is_some() { panic!("Unable to snapshot while persisting. 
This is an unexpected state.") diff --git a/ingester/src/data/partition/resolver/cache.rs b/ingester/src/data/partition/resolver/cache.rs index a9dd897444..7f282ae38c 100644 --- a/ingester/src/data/partition/resolver/cache.rs +++ b/ingester/src/data/partition/resolver/cache.rs @@ -9,7 +9,10 @@ use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; use parking_lot::Mutex; -use crate::data::partition::{resolver::DeferredSortKey, PartitionData, SortKeyState}; +use crate::data::{ + partition::{resolver::DeferredSortKey, PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -189,7 +192,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { // Use the cached PartitionKey instead of the caller's partition_key, // instead preferring to reuse the already-shared Arc in the cache. diff --git a/ingester/src/data/partition/resolver/catalog.rs b/ingester/src/data/partition/resolver/catalog.rs index 128b9a5614..e42c4876c4 100644 --- a/ingester/src/data/partition/resolver/catalog.rs +++ b/ingester/src/data/partition/resolver/catalog.rs @@ -9,7 +9,10 @@ use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId}; use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; -use crate::data::partition::{PartitionData, SortKeyState}; +use crate::data::{ + partition::{PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -55,7 +58,7 @@ impl PartitionProvider for CatalogPartitionResolver { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { debug!( %partition_key, @@ -132,7 +135,7 @@ mod tests { }; let callers_partition_key = PartitionKey::from(PARTITION_KEY); - let table_name = TABLE_NAME.into(); + let table_name = TableName::from(TABLE_NAME); let resolver = 
CatalogPartitionResolver::new(Arc::clone(&catalog)); let got = resolver .get_partition( @@ -140,7 +143,7 @@ mod tests { shard_id, namespace_id, table_id, - Arc::clone(&table_name), + table_name.clone(), ) .await; assert_eq!(got.namespace_id(), namespace_id); diff --git a/ingester/src/data/partition/resolver/mock.rs b/ingester/src/data/partition/resolver/mock.rs index e65f127ef4..80f859c43e 100644 --- a/ingester/src/data/partition/resolver/mock.rs +++ b/ingester/src/data/partition/resolver/mock.rs @@ -1,12 +1,12 @@ //! A mock [`PartitionProvider`] to inject [`PartitionData`] for tests. -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; use super::r#trait::PartitionProvider; @@ -58,7 +58,7 @@ impl PartitionProvider for MockPartitionProvider { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { let p = self .partitions diff --git a/ingester/src/data/partition/resolver/trait.rs b/ingester/src/data/partition/resolver/trait.rs index a8bf3134e4..4ca50ec949 100644 --- a/ingester/src/data/partition/resolver/trait.rs +++ b/ingester/src/data/partition/resolver/trait.rs @@ -3,7 +3,7 @@ use std::{fmt::Debug, sync::Arc}; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; /// An infallible resolver of [`PartitionData`] for the specified shard, table, /// and partition key, returning an initialised [`PartitionData`] buffer for it. 
@@ -20,7 +20,7 @@ pub trait PartitionProvider: Send + Sync + Debug { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData; } @@ -35,7 +35,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { (**self) .get_partition(partition_key, shard_id, namespace_id, table_id, table_name) @@ -59,7 +59,7 @@ mod tests { let shard_id = ShardId::new(42); let namespace_id = NamespaceId::new(1234); let table_id = TableId::new(24); - let table_name = "platanos".into(); + let table_name = TableName::from("platanos"); let partition = PartitionId::new(4242); let data = PartitionData::new( partition, @@ -67,7 +67,7 @@ mod tests { shard_id, namespace_id, table_id, - Arc::clone(&table_name), + table_name.clone(), SortKeyState::Provided(None), None, ); @@ -75,13 +75,7 @@ mod tests { let mock = Arc::new(MockPartitionProvider::default().with_partition(data)); let got = mock - .get_partition( - key, - shard_id, - namespace_id, - table_id, - Arc::clone(&table_name), - ) + .get_partition(key, shard_id, namespace_id, table_id, table_name.clone()) .await; assert_eq!(got.partition_id(), partition); assert_eq!(got.namespace_id(), namespace_id); diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 9a49b5f291..008f74d149 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -50,11 +50,40 @@ impl DoubleRef { } } +/// The string name / identifier of a Table. +/// +/// A reference-counted, cheap clone-able string. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TableName(Arc); + +impl From for TableName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::fmt::Display for TableName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl std::ops::Deref for TableName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// Data of a Table in a given Namesapce that belongs to a given Shard #[derive(Debug)] pub(crate) struct TableData { table_id: TableId, - table_name: Arc, + table_name: TableName, /// The catalog ID of the shard & namespace this table is being populated /// from. @@ -85,7 +114,7 @@ impl TableData { /// for the first time. pub(super) fn new( table_id: TableId, - table_name: &str, + table_name: TableName, shard_id: ShardId, namespace_id: NamespaceId, tombstone_max_sequence_number: Option, @@ -93,7 +122,7 @@ impl TableData { ) -> Self { Self { table_id, - table_name: table_name.into(), + table_name, shard_id, namespace_id, tombstone_max_sequence_number, @@ -137,7 +166,7 @@ impl TableData { self.shard_id, self.namespace_id, self.table_id, - Arc::clone(&self.table_name), + self.table_name.clone(), ) .await; // Add the double-referenced partition to the map. @@ -276,7 +305,7 @@ impl TableData { } /// Returns the name of this table. 
- pub(crate) fn table_name(&self) -> &Arc { + pub(crate) fn table_name(&self) -> &TableName { &self.table_name } } @@ -335,7 +364,7 @@ mod tests { let mut table = TableData::new( table_id, - TABLE_NAME, + TABLE_NAME.into(), shard_id, ns_id, None, @@ -395,7 +424,7 @@ mod tests { let mut table = TableData::new( table_id, - TABLE_NAME, + TABLE_NAME.into(), shard_id, ns_id, None, diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index 1f51ce194d..67b34342dd 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -445,7 +445,7 @@ mod tests { use write_buffer::mock::{MockBufferForReading, MockBufferSharedState}; use super::*; - use crate::data::partition::SnapshotBatch; + use crate::data::{partition::SnapshotBatch, table::TableName}; #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { @@ -513,13 +513,15 @@ mod tests { // data in there from both writes. tokio::time::timeout(Duration::from_secs(2), async { let ns_name = ingester.namespace.name.into(); + let table_name = TableName::from("a"); loop { let mut has_measurement = false; if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) { if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + { if let Some(b) = b.first() { if b.data.num_rows() > 0 { has_measurement = true; @@ -755,13 +757,15 @@ mod tests { // data in there tokio::time::timeout(Duration::from_secs(1), async move { let ns_name = namespace.name.into(); + let table_name = TableName::from("cpu"); loop { let mut has_measurement = false; if let Some(data) = ingester.data.shard(shard.id) { if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + 
{ if let Some(b) = b.first() { custom_batch_verification(b); diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index cf58daab0c..7ff7494af0 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -12,8 +12,8 @@ use snafu::{ensure, Snafu}; use crate::{ data::{ - namespace::NamespaceName, partition::UnpersistedPartitionData, IngesterData, - IngesterQueryPartition, IngesterQueryResponse, + namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName, + IngesterData, IngesterQueryPartition, IngesterQueryResponse, }, query::QueryableBatch, }; @@ -69,7 +69,8 @@ pub async fn prepare_data_to_querier( } }; - let table_data = match namespace_data.table_data(&request.table) { + let table_name = TableName::from(&request.table); + let table_data = match namespace_data.table_data(&table_name) { Some(table_data) => { debug!(table_name=%request.table, "found table"); table_data diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 1829ecf4ae..219dcbcf6e 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -28,7 +28,7 @@ use predicate::{ use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema}; use snafu::{ResultExt, Snafu}; -use crate::data::partition::SnapshotBatch; +use crate::data::{partition::SnapshotBatch, table::TableName}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -60,7 +60,7 @@ pub(crate) struct QueryableBatch { pub(crate) delete_predicates: Vec>, /// This is needed to return a reference for a trait function - pub(crate) table_name: Arc, + pub(crate) table_name: TableName, /// Partition ID pub(crate) partition_id: PartitionId, @@ -69,7 +69,7 @@ pub(crate) struct QueryableBatch { impl QueryableBatch { /// Initilaize a QueryableBatch pub(crate) fn new( - table_name: Arc, + table_name: TableName, partition_id: PartitionId, data: Vec>, deletes: Vec, diff --git a/ingester/src/test_util.rs b/ingester/src/test_util.rs 
index ed3f8b6348..cde40ac9c2 100644 --- a/ingester/src/test_util.rs +++ b/ingester/src/test_util.rs @@ -657,7 +657,7 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) .unwrap() .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } else if loc.contains(DataLocation::SNAPSHOT) { // move partition 1 data to snapshot @@ -666,7 +666,7 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) .unwrap() .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } @@ -826,7 +826,7 @@ async fn make_one_partition_with_tombstones( .unwrap() .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } else if loc.contains(DataLocation::SNAPSHOT) { // move partition 1 data to snapshot @@ -835,7 +835,7 @@ async fn make_one_partition_with_tombstones( .unwrap() .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } From 933493fab3b07ada4005f4783d820bd5952cca94 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Oct 2022 01:19:10 +0000 Subject: [PATCH 32/40] chore(deps): Bump object_store from 0.5.0 to 0.5.1 Bumps [object_store](https://github.com/apache/arrow-rs) from 0.5.0 to 0.5.1. 
- [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/master/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) --- updated-dependencies: - dependency-name: object_store dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 10 +++++----- clap_blocks/Cargo.toml | 2 +- compactor/Cargo.toml | 2 +- garbage_collector/Cargo.toml | 2 +- import/Cargo.toml | 2 +- influxdb_iox/Cargo.toml | 2 +- ingester/Cargo.toml | 2 +- iox_tests/Cargo.toml | 2 +- ioxd_compactor/Cargo.toml | 2 +- ioxd_ingester/Cargo.toml | 2 +- ioxd_querier/Cargo.toml | 2 +- ioxd_router/Cargo.toml | 2 +- object_store_metrics/Cargo.toml | 2 +- parquet_file/Cargo.toml | 2 +- parquet_to_line_protocol/Cargo.toml | 2 +- querier/Cargo.toml | 2 +- router/Cargo.toml | 2 +- service_grpc_object_store/Cargo.toml | 2 +- 18 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23bbac25d8..d78dcedd8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3115,9 +3115,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2168fee79ee3e7695905bc3a48777d807f82d956f821186fa7a2601c1295a73e" +checksum = "56ce10a205d9f610ae3532943039c34c145930065ce0c4284134c897fe6073b1" dependencies = [ "async-trait", "base64", @@ -3127,7 +3127,7 @@ dependencies = [ "itertools", "parking_lot 0.12.1", "percent-encoding", - "quick-xml 0.24.1", + "quick-xml 0.25.0", "rand", "reqwest", "ring", @@ -3927,9 +3927,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.24.1" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37dddbbe9df96afafcb8027fcf263971b726530e12f0787f620a7ba5b4846081" +checksum = 
"58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9" dependencies = [ "memchr", "serde", diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index 99e429a067..cbd888fd9d 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -11,7 +11,7 @@ humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.86" diff --git a/compactor/Cargo.toml b/compactor/Cargo.toml index 8a366ab903..7cb6a78574 100644 --- a/compactor/Cargo.toml +++ b/compactor/Cargo.toml @@ -14,7 +14,7 @@ datafusion = { path = "../datafusion" } futures = "0.3" iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } predicate = { path = "../predicate" } diff --git a/garbage_collector/Cargo.toml b/garbage_collector/Cargo.toml index 0784f0b07a..6fbd3478cb 100644 --- a/garbage_collector/Cargo.toml +++ b/garbage_collector/Cargo.toml @@ -11,7 +11,7 @@ data_types = { path = "../data_types" } futures = "0.3" humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } observability_deps = { path = "../observability_deps" } snafu = "0.7" tokio = { version = "1", features = ["macros", "rt", "sync"] } diff --git a/import/Cargo.toml b/import/Cargo.toml index bdd07e008d..20d0a3cdc3 100644 --- a/import/Cargo.toml +++ b/import/Cargo.toml @@ -13,7 +13,7 @@ futures = "0.3" generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0", features = ["aws"] } +object_store = { version = 
"0.5.1", features = ["aws"] } observability_deps = { path = "../observability_deps" } schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index a689df6155..c863492b29 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -25,7 +25,7 @@ ioxd_querier = { path = "../ioxd_querier"} ioxd_router = { path = "../ioxd_router"} ioxd_test = { path = "../ioxd_test"} metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" object_store_metrics = { path = "../object_store_metrics" } observability_deps = { path = "../observability_deps" } panic_logging = { path = "../panic_logging" } diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index 51e01c3def..2537f95edb 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -24,7 +24,7 @@ iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch"} mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } diff --git a/iox_tests/Cargo.toml b/iox_tests/Cargo.toml index 8760728d4e..514bfb5754 100644 --- a/iox_tests/Cargo.toml +++ b/iox_tests/Cargo.toml @@ -14,7 +14,7 @@ iox_catalog = { path = "../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } once_cell = { version = "1.15.0", features = ["parking_lot"] } parquet_file = { path = "../parquet_file" } diff --git a/ioxd_compactor/Cargo.toml b/ioxd_compactor/Cargo.toml index 3fae827159..6cbe04119c 100644 --- a/ioxd_compactor/Cargo.toml +++ b/ioxd_compactor/Cargo.toml @@ -15,7 +15,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = 
"../ioxd_common" } metric = { path = "../metric" } iox_query = { path = "../iox_query" } -object_store = "0.5.0" +object_store = "0.5.1" iox_time = { path = "../iox_time" } trace = { path = "../trace" } diff --git a/ioxd_ingester/Cargo.toml b/ioxd_ingester/Cargo.toml index db8f65e202..11e3118c2d 100644 --- a/ioxd_ingester/Cargo.toml +++ b/ioxd_ingester/Cargo.toml @@ -11,7 +11,7 @@ ingester = { path = "../ingester" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" iox_query = { path = "../iox_query" } trace = { path = "../trace" } write_buffer = { path = "../write_buffer" } diff --git a/ioxd_querier/Cargo.toml b/ioxd_querier/Cargo.toml index e90a4a68df..60574ed73d 100644 --- a/ioxd_querier/Cargo.toml +++ b/ioxd_querier/Cargo.toml @@ -11,7 +11,7 @@ generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" querier = { path = "../querier" } iox_query = { path = "../iox_query" } router = { path = "../router" } diff --git a/ioxd_router/Cargo.toml b/ioxd_router/Cargo.toml index 5797a9cf01..1ae3d3ab2a 100644 --- a/ioxd_router/Cargo.toml +++ b/ioxd_router/Cargo.toml @@ -11,7 +11,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } router = { path = "../router" } sharder = { path = "../sharder" } diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 60838a8e28..f04cb909ef 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -10,7 +10,7 @@ bytes = "1.2" futures = "0.3" iox_time = { version = "0.1.0", path = "../iox_time" } metric = 
{ version = "0.1.0", path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" pin-project = "1.0.12" tokio = { version = "1.21", features = ["io-util"] } workspace-hack = { path = "../workspace-hack" } diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 6fd9bafa4f..783b1ddca4 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -14,7 +14,7 @@ datafusion_util = { path = "../datafusion_util" } futures = "0.3" generated_types = { path = "../generated_types" } iox_time = { path = "../iox_time" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet = {version = "23.0.0", features = ["experimental"]} diff --git a/parquet_to_line_protocol/Cargo.toml b/parquet_to_line_protocol/Cargo.toml index 9b4cc08004..5273a01dd3 100644 --- a/parquet_to_line_protocol/Cargo.toml +++ b/parquet_to_line_protocol/Cargo.toml @@ -10,7 +10,7 @@ datafusion = { path = "../datafusion" } influxdb_line_protocol = { path = "../influxdb_line_protocol" } futures = {version = "0.3"} num_cpus = "1.13.1" -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } parquet_file = { path = "../parquet_file" } schema = { path = "../schema" } tokio = "1.0" diff --git a/querier/Cargo.toml b/querier/Cargo.toml index 9d55643c4b..02fe680c07 100644 --- a/querier/Cargo.toml +++ b/querier/Cargo.toml @@ -18,7 +18,7 @@ generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } diff --git a/router/Cargo.toml b/router/Cargo.toml index 7b655f9f91..fcebd5f360 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -20,7 +20,7 @@ metric = { path = "../metric" } mutable_batch = { path = 
"../mutable_batch" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" predicate = { path = "../predicate" } diff --git a/service_grpc_object_store/Cargo.toml b/service_grpc_object_store/Cargo.toml index d25393c791..6a2bcac921 100644 --- a/service_grpc_object_store/Cargo.toml +++ b/service_grpc_object_store/Cargo.toml @@ -8,7 +8,7 @@ data_types = { path = "../data_types" } futures = "0.3" generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } tokio = { version = "1", features = ["rt-multi-thread", "macros"] } From 33391af973d73dd8ce0a295f31b479cdc03ce28e Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Tue, 11 Oct 2022 05:14:45 -0400 Subject: [PATCH 33/40] feat: Swap Kafka Producer implementation back to rdkafka as diagnosis of latency problem (#5800) * feat: Add back rdkafka dependency * feat: Remove RSKafkaProducer * feat: Remove write buffer RecordAggregator * feat: Add back rdkafka producer Using code from 58a2a0b9c8311303c796495db4f167c99a2ea3aa then getting it to compile with the latest * feat: Add a metric around enqueue * fix: Remove unused imports * fix: Increase Kafka timeout to 20s * docs: Clarify that Kafka topics should only be created in test/dev envs * fix: Remove metrics that aren't needed for this experiment Co-authored-by: Dom Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 86 +++++ write_buffer/Cargo.toml | 1 + write_buffer/src/config.rs | 39 +- write_buffer/src/core.rs | 9 + write_buffer/src/kafka/instrumentation.rs | 1 + write_buffer/src/kafka/mod.rs | 144 
+------ write_buffer/src/kafka/rdkafka.rs | 404 ++++++++++++++++++++ write_buffer/src/kafka/record_aggregator.rs | 324 ---------------- 8 files changed, 542 insertions(+), 466 deletions(-) create mode 100644 write_buffer/src/kafka/rdkafka.rs delete mode 100644 write_buffer/src/kafka/record_aggregator.rs diff --git a/Cargo.lock b/Cargo.lock index d78dcedd8c..dc3c3c3be8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2686,6 +2686,18 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.0.46" @@ -3104,6 +3116,27 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5395665662ef45796a4ff5486c5d41d29e0c09640af4c5f17fd94ee2c119c9" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0498641e53dd6ac1a4f22547548caa6864cc4933784319cd1775271c5a46ce" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "object" version = "0.29.0" @@ -3520,6 +3553,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + [[package]] name = "pprof" version = "0.10.1" @@ -3623,6 +3662,17 @@ 
dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9" +dependencies = [ + "once_cell", + "thiserror", + "toml", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -4017,6 +4067,35 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "rdkafka" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de127f294f2dba488ed46760b129d5ecbeabbd337ccbf3739cb29d50db2161c" +dependencies = [ + "futures", + "libc", + "log", + "rdkafka-sys", + "serde", + "serde_derive", + "serde_json", + "slab", + "tokio", +] + +[[package]] +name = "rdkafka-sys" +version = "4.2.0+1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e542c6863b04ce0fa0c5719bc6b7b348cf8dd21af1bb03c9db5f9805b2a6473" +dependencies = [ + "libc", + "libz-sys", + "num_enum", + "pkg-config", +] + [[package]] name = "read_buffer" version = "0.1.0" @@ -5632,6 +5711,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -5981,6 +6066,7 @@ dependencies = [ "parking_lot 0.12.1", "pin-project", "prost 0.11.0", + "rdkafka", "rskafka", "schema", "tempfile", diff --git a/write_buffer/Cargo.toml b/write_buffer/Cargo.toml index 5bab0fe55b..bfc9fcc7b1 100644 --- a/write_buffer/Cargo.toml +++ b/write_buffer/Cargo.toml @@ -22,6 +22,7 @@ observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.0" prost = "0.11" +rdkafka = "0.28.0" rskafka = { git = 
"https://github.com/influxdata/rskafka.git", rev="3208e4742f08048bbab4e8fc4e0a775507fe3e66", default-features = false, features = ["compression-snappy", "transport-socks5"] } schema = { path = "../schema" } tokio = { version = "1.21", features = ["fs", "macros", "parking_lot", "rt", "sync", "time"] } diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index 1d02e60cc9..a716a56ed5 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -1,7 +1,7 @@ use crate::{ core::{WriteBufferError, WriteBufferReading, WriteBufferWriting}, file::{FileBufferConsumer, FileBufferProducer}, - kafka::{RSKafkaConsumer, RSKafkaProducer}, + kafka::RSKafkaConsumer, mock::{ MockBufferForReading, MockBufferForReadingThatAlwaysErrors, MockBufferForWriting, MockBufferForWritingThatAlwaysErrors, MockBufferSharedState, @@ -152,8 +152,8 @@ impl WriteBufferConfigFactory { pub async fn new_config_write( &self, db_name: &str, - partitions: Option>, - trace_collector: Option<&Arc>, + _partitions: Option>, + _trace_collector: Option<&Arc>, cfg: &WriteBufferConnection, ) -> Result, WriteBufferError> { let writer = match &cfg.type_[..] { @@ -168,20 +168,7 @@ impl WriteBufferConfigFactory { .await?; Arc::new(file_buffer) as _ } - "kafka" => { - let rskafa_buffer = RSKafkaProducer::new( - cfg.connection.clone(), - db_name.to_owned(), - &cfg.connection_config, - Arc::clone(&self.time_provider), - cfg.creation_config.as_ref(), - partitions, - trace_collector.map(Arc::clone), - &*self.metric_registry, - ) - .await?; - Arc::new(rskafa_buffer) as _ - } + "kafka" => self.kafka_buffer_producer(db_name, cfg).await?, "mock" => match self.get_mock(&cfg.connection)? 
{ Mock::Normal(state) => { let mock_buffer = MockBufferForWriting::new( @@ -204,6 +191,24 @@ impl WriteBufferConfigFactory { Ok(writer) } + async fn kafka_buffer_producer( + &self, + db_name: &str, + cfg: &WriteBufferConnection, + ) -> Result, WriteBufferError> { + let kafka_buffer = crate::kafka::rdkafka::KafkaBufferProducer::new( + &cfg.connection, + db_name, + &cfg.connection_config, + cfg.creation_config.as_ref(), + Arc::clone(&self.time_provider), + &self.metric_registry, + ) + .await?; + + Ok(Arc::new(kafka_buffer) as _) + } + /// Returns a new [`WriteBufferReading`] for the provided [`WriteBufferConnection`] pub async fn new_config_read( &self, diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index 04d08faa13..afbe16d82e 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -80,6 +80,15 @@ impl From for WriteBufferError { } } +impl From for WriteBufferError { + fn from(e: rdkafka::error::KafkaError) -> Self { + Self { + inner: Box::new(e), + kind: WriteBufferErrorKind::IO, + } + } +} + impl From for WriteBufferError { fn from(e: rskafka::client::error::Error) -> Self { Self { diff --git a/write_buffer/src/kafka/instrumentation.rs b/write_buffer/src/kafka/instrumentation.rs index 283dc281fd..4c29cc8ab7 100644 --- a/write_buffer/src/kafka/instrumentation.rs +++ b/write_buffer/src/kafka/instrumentation.rs @@ -36,6 +36,7 @@ pub struct KafkaProducerMetrics

{ impl KafkaProducerMetrics { /// Decorate the specified [`ProducerClient`] implementation with an /// instrumentation layer. + #[allow(dead_code)] pub fn new( client: Box, kafka_topic_name: String, diff --git a/write_buffer/src/kafka/mod.rs b/write_buffer/src/kafka/mod.rs index 99aad39d1f..c33f65c541 100644 --- a/write_buffer/src/kafka/mod.rs +++ b/write_buffer/src/kafka/mod.rs @@ -1,24 +1,17 @@ -use self::{ - config::{ClientConfig, ConsumerConfig, ProducerConfig, TopicCreationConfig}, - instrumentation::KafkaProducerMetrics, - record_aggregator::RecordAggregator, -}; +use self::config::{ClientConfig, ConsumerConfig, TopicCreationConfig}; use crate::{ codec::IoxHeaders, config::WriteBufferCreationConfig, - core::{ - WriteBufferError, WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler, - WriteBufferWriting, - }, + core::{WriteBufferError, WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler}, }; use async_trait::async_trait; use data_types::{Sequence, SequenceNumber, ShardIndex}; -use dml::{DmlMeta, DmlOperation}; +use dml::DmlOperation; use futures::{ stream::{self, BoxStream}, StreamExt, TryStreamExt, }; -use iox_time::{Time, TimeProvider}; +use iox_time::Time; use observability_deps::tracing::warn; use parking_lot::Mutex; use rskafka::{ @@ -26,7 +19,6 @@ use rskafka::{ consumer::{StartOffset, StreamConsumerBuilder}, error::{Error as RSKafkaError, ProtocolError}, partition::{OffsetAt, PartitionClient, UnknownTopicHandling}, - producer::{BatchProducer, BatchProducerBuilder}, ClientBuilder, }, record::RecordAndOffset, @@ -43,112 +35,13 @@ use trace::TraceCollector; mod config; mod instrumentation; -mod record_aggregator; +pub(crate) mod rdkafka; /// Maximum number of jobs buffered and decoded concurrently. 
const CONCURRENT_DECODE_JOBS: usize = 10; type Result = std::result::Result; -#[derive(Debug)] -pub struct RSKafkaProducer { - producers: BTreeMap>, -} - -impl RSKafkaProducer { - #[allow(clippy::too_many_arguments)] - pub async fn new<'a>( - conn: String, - topic_name: String, - connection_config: &'a BTreeMap, - time_provider: Arc, - creation_config: Option<&'a WriteBufferCreationConfig>, - partitions: Option>, - _trace_collector: Option>, - metric_registry: &'a metric::Registry, - ) -> Result { - let partition_clients = setup_topic( - conn, - topic_name.clone(), - connection_config, - creation_config, - partitions, - ) - .await?; - - let producer_config = ProducerConfig::try_from(connection_config)?; - - let producers = partition_clients - .into_iter() - .map(|(shard_index, partition_client)| { - // Instrument this kafka partition client. - let partition_client = KafkaProducerMetrics::new( - Box::new(partition_client), - topic_name.clone(), - shard_index, - metric_registry, - ); - - let mut producer_builder = - BatchProducerBuilder::new_with_client(Arc::new(partition_client)); - if let Some(linger) = producer_config.linger { - producer_builder = producer_builder.with_linger(linger); - } - let producer = producer_builder.build(RecordAggregator::new( - shard_index, - producer_config.max_batch_size, - Arc::clone(&time_provider), - )); - - (shard_index, producer) - }) - .collect(); - - Ok(Self { producers }) - } -} - -#[async_trait] -impl WriteBufferWriting for RSKafkaProducer { - fn shard_indexes(&self) -> BTreeSet { - self.producers.keys().copied().collect() - } - - async fn store_operation( - &self, - shard_index: ShardIndex, - operation: DmlOperation, - ) -> Result { - // Sanity check to ensure only partitioned writes are pushed into Kafka. 
- if let DmlOperation::Write(w) = &operation { - assert!( - w.partition_key().is_some(), - "enqueuing unpartitioned write into kafka" - ) - } - - let producer = self - .producers - .get(&shard_index) - .ok_or_else::(|| { - format!("Unknown shard index: {}", shard_index).into() - })?; - - Ok(producer.produce(operation).await?) - } - - async fn flush(&self) -> Result<(), WriteBufferError> { - for producer in self.producers.values() { - producer.flush().await?; - } - Ok(()) - } - - fn type_name(&self) -> &'static str { - "kafka" - } -} - #[derive(Debug)] pub struct RSKafkaStreamHandler { partition_client: Arc, @@ -525,14 +418,17 @@ async fn setup_topic( mod tests { use super::*; use crate::{ - core::test_utils::{ - assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name, - set_pop_first, TestAdapter, TestContext, + core::{ + test_utils::{ + assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name, + set_pop_first, TestAdapter, TestContext, + }, + WriteBufferWriting, }, maybe_skip_kafka_integration, }; use data_types::{DeletePredicate, PartitionKey, TimestampRange}; - use dml::{test_util::assert_write_op_eq, DmlDelete, DmlWrite}; + use dml::{test_util::assert_write_op_eq, DmlDelete, DmlMeta, DmlWrite}; use futures::{stream::FuturesUnordered, TryStreamExt}; use iox_time::TimeProvider; use rskafka::{client::partition::Compression, record::Record}; @@ -595,19 +491,17 @@ mod tests { #[async_trait] impl TestContext for RSKafkaTestContext { - type Writing = RSKafkaProducer; + type Writing = rdkafka::KafkaBufferProducer; type Reading = RSKafkaConsumer; async fn writing(&self, creation_config: bool) -> Result { - RSKafkaProducer::new( + rdkafka::KafkaBufferProducer::new( self.conn.clone(), self.topic_name.clone(), &BTreeMap::default(), - Arc::clone(&self.time_provider), self.creation_config(creation_config).as_ref(), - None, - Some(self.trace_collector() as Arc<_>), + Arc::clone(&self.time_provider), &self.metrics, ) .await @@ -850,9 +744,9 
@@ mod tests { .unwrap(); } - async fn write( + async fn write( namespace: &str, - producer: &RSKafkaProducer, + producer: &T, trace_collector: &Arc, shard_index: ShardIndex, partition_key: impl Into + Send, @@ -869,9 +763,9 @@ mod tests { producer.store_operation(shard_index, op).await.unwrap() } - async fn delete( + async fn delete( namespace: &str, - producer: &RSKafkaProducer, + producer: &T, trace_collector: &Arc, shard_index: ShardIndex, ) -> DmlMeta { diff --git a/write_buffer/src/kafka/rdkafka.rs b/write_buffer/src/kafka/rdkafka.rs new file mode 100644 index 0000000000..33ca8a74f7 --- /dev/null +++ b/write_buffer/src/kafka/rdkafka.rs @@ -0,0 +1,404 @@ +use crate::{ + codec::{ContentType, IoxHeaders}, + core::{WriteBufferError, WriteBufferWriting}, + kafka::WriteBufferCreationConfig, +}; +use async_trait::async_trait; +use data_types::{Sequence, SequenceNumber, ShardIndex}; +use dml::{DmlMeta, DmlOperation}; +use iox_time::{Time, TimeProvider}; +use metric::{Attributes, DurationHistogram, Metric}; +use observability_deps::tracing::{debug, info}; +use rdkafka::{ + admin::{AdminClient, AdminOptions, NewTopic, TopicReplication}, + client::DefaultClientContext, + consumer::{BaseConsumer, Consumer}, + error::KafkaError, + message::{Headers, OwnedHeaders}, + producer::{FutureProducer, FutureRecord, Producer}, + types::RDKafkaErrorCode, + util::Timeout, + ClientConfig, +}; +use std::{ + collections::{BTreeMap, BTreeSet}, + num::NonZeroU32, + sync::Arc, + time::Duration, +}; + +/// Default timeout supplied to rdkafka client for kafka operations. +/// +/// Chosen to be a value less than the default gRPC timeout (30 +/// seconds) so we can detect kafka errors and return them prior to +/// the gRPC requests to IOx timing out. 
+/// +/// More context in +/// +const KAFKA_OPERATION_TIMEOUT_MS: u64 = 20_000; + +impl From<&IoxHeaders> for OwnedHeaders { + fn from(iox_headers: &IoxHeaders) -> Self { + let mut res = Self::new(); + + for (header, value) in iox_headers.headers() { + res = res.add(header, value.as_ref()); + } + + res + } +} + +pub struct KafkaBufferProducer { + conn: String, + database_name: String, + time_provider: Arc, + producer: Arc>, + partitions: BTreeSet, + enqueue: Metric, +} + +// Needed because rdkafka's FutureProducer doesn't impl Debug +impl std::fmt::Debug for KafkaBufferProducer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KafkaBufferProducer") + .field("conn", &self.conn) + .field("database_name", &self.database_name) + .finish() + } +} + +#[async_trait] +impl WriteBufferWriting for KafkaBufferProducer { + fn shard_indexes(&self) -> BTreeSet { + self.partitions.clone() + } + + /// Send a [`DmlOperation`] to the write buffer using the specified shard index. + async fn store_operation( + &self, + shard_index: ShardIndex, + operation: DmlOperation, + ) -> Result { + // Sanity check to ensure only partitioned writes are pushed into Kafka. + if let DmlOperation::Write(w) = &operation { + assert!( + w.partition_key().is_some(), + "enqueuing unpartitioned write into kafka" + ) + } + + // Only send writes with known shard indexes to Kafka. 
+ if !self.partitions.contains(&shard_index) { + return Err(format!("Unknown shard index: {}", shard_index).into()); + } + + let kafka_partition_id = shard_index.get(); + + let enqueue_start = self.time_provider.now(); + + // truncate milliseconds from timestamps because that's what Kafka supports + let now = operation + .meta() + .producer_ts() + .unwrap_or_else(|| self.time_provider.now()); + + let timestamp_millis = now.date_time().timestamp_millis(); + let timestamp = Time::from_timestamp_millis(timestamp_millis); + + let headers = IoxHeaders::new( + ContentType::Protobuf, + operation.meta().span_context().cloned(), + operation.namespace().to_string(), + ); + + let mut buf = Vec::new(); + crate::codec::encode_operation(&self.database_name, &operation, &mut buf)?; + + // This type annotation is necessary because `FutureRecord` is generic over key type, but + // key is optional and we're not setting a key. `String` is arbitrary. + let record: FutureRecord<'_, String, _> = FutureRecord::to(&self.database_name) + .payload(&buf) + .partition(kafka_partition_id) + .timestamp(timestamp_millis) + .headers((&headers).into()); + let kafka_write_size = estimate_message_size( + record.payload.map(|v| v.as_ref()), + record.key.map(|s| s.as_bytes()), + record.headers.as_ref(), + ); + + debug!(db_name=%self.database_name, kafka_partition_id, size=buf.len(), "writing to kafka"); + + let res = self.producer.send(record, Timeout::Never).await; + + if let Some(delta) = self + .time_provider + .now() + .checked_duration_since(enqueue_start) + { + let result_attr = match &res { + Ok(_) => "success", + Err(_) => "error", + }; + + let attr = Attributes::from([ + ("kafka_partition", shard_index.to_string().into()), + ("kafka_topic", self.database_name.clone().into()), + ("result", result_attr.into()), + ]); + + let recorder = self.enqueue.recorder(attr); + recorder.record(delta); + } + + let (partition, offset) = res.map_err(|(e, _owned_message)| e)?; + + 
debug!(db_name=%self.database_name, %offset, %partition, size=buf.len(), "wrote to kafka"); + + Ok(DmlMeta::sequenced( + Sequence::new(shard_index, SequenceNumber::new(offset)), + timestamp, + operation.meta().span_context().cloned(), + kafka_write_size, + )) + } + + async fn flush(&self) -> Result<(), WriteBufferError> { + let producer = Arc::clone(&self.producer); + + tokio::task::spawn_blocking(move || { + producer.flush(Timeout::Never); + }) + .await + .expect("subtask failed"); + + Ok(()) + } + + fn type_name(&self) -> &'static str { + "kafka" + } +} + +impl KafkaBufferProducer { + pub async fn new( + conn: impl Into + Send, + database_name: impl Into + Send, + connection_config: &BTreeMap, + creation_config: Option<&WriteBufferCreationConfig>, + time_provider: Arc, + metric_registry: &metric::Registry, + ) -> Result { + let conn = conn.into(); + let database_name = database_name.into(); + + let mut cfg = ClientConfig::new(); + + // these configs can be overwritten + cfg.set("message.timeout.ms", "5000"); + cfg.set("message.max.bytes", "31457280"); + cfg.set("message.send.max.retries", "10"); + cfg.set("queue.buffering.max.kbytes", "31457280"); + cfg.set("request.required.acks", "all"); // equivalent to acks=-1 + cfg.set("compression.type", "snappy"); + cfg.set("statistics.interval.ms", "15000"); + + // user overrides + for (k, v) in connection_config { + cfg.set(k, v); + } + + // these configs are set in stone + cfg.set("bootstrap.servers", &conn); + cfg.set("allow.auto.create.topics", "false"); + + // handle auto-creation + let partitions = + maybe_auto_create_topics(&conn, &database_name, creation_config, &cfg).await?; + + let producer = cfg.create()?; + + let enqueue = metric_registry.register_metric::( + "write_buffer_client_produce_duration", + "duration of time taken to push a set of records to kafka \ + - includes codec, protocol, and network overhead", + ); + + Ok(Self { + conn, + database_name, + time_provider, + producer: Arc::new(producer), + 
partitions, + enqueue, + }) + } +} + +/// Iterate over the kafka messages +fn header_iter(headers: Option<&H>) -> impl Iterator +where + H: Headers, +{ + headers + .into_iter() + .flat_map(|headers| (0..headers.count()).map(|idx| headers.get(idx).unwrap())) +} + +/// Estimate size of data read from kafka as payload len + key len + headers +fn estimate_message_size( + payload: Option<&[u8]>, + key: Option<&[u8]>, + headers: Option<&H>, +) -> usize +where + H: Headers, +{ + payload.map(|payload| payload.len()).unwrap_or_default() + + key.map(|key| key.len()).unwrap_or_default() + + header_iter(headers) + .map(|(key, value)| key.len() + value.len()) + .sum::() +} + +/// Get Kafka partition IDs (IOx ShardIndexes) for the database-specific Kafka topic. +/// +/// Will return `None` if the topic is unknown and has to be created. +/// +/// This will check that the partition is is non-empty. +async fn get_partitions( + database_name: &str, + cfg: &ClientConfig, +) -> Result>, WriteBufferError> { + let database_name = database_name.to_string(); + let cfg = cfg.clone(); + + let metadata = tokio::task::spawn_blocking(move || { + let probe_consumer: BaseConsumer = cfg.create()?; + + probe_consumer.fetch_metadata( + Some(&database_name), + Duration::from_millis(KAFKA_OPERATION_TIMEOUT_MS), + ) + }) + .await + .expect("subtask failed")?; + + let topic_metadata = metadata.topics().get(0).expect("requested a single topic"); + + match topic_metadata.error() { + None => { + let partitions: BTreeSet<_> = topic_metadata + .partitions() + .iter() + .map(|partition_metdata| ShardIndex::new(partition_metdata.id())) + .collect(); + + if partitions.is_empty() { + Err("Topic exists but has no partitions".to_string().into()) + } else { + Ok(Some(partitions)) + } + } + Some(error_code) => { + let error_code: RDKafkaErrorCode = error_code.into(); + match error_code { + RDKafkaErrorCode::UnknownTopic | RDKafkaErrorCode::UnknownTopicOrPartition => { + // The caller is responsible for creating the 
topic, so this is somewhat OK. + Ok(None) + } + _ => Err(KafkaError::MetadataFetch(error_code).into()), + } + } + } +} + +fn admin_client(kafka_connection: &str) -> Result, KafkaError> { + let mut cfg = ClientConfig::new(); + cfg.set("bootstrap.servers", kafka_connection); + cfg.set("message.timeout.ms", "5000"); + cfg.create() +} + +/// Create Kafka topic based on the provided configs. +/// +/// This will create a topic with `n_sequencers` Kafka partitions. +/// +/// This will NOT fail if the topic already exists! `maybe_auto_create_topics` will only call this +/// if there are no partitions. Production should always have partitions already created, so +/// `create_kafka_topic` shouldn't run in production and is only for test/dev environments. +async fn create_kafka_topic( + kafka_connection: &str, + database_name: &str, + n_sequencers: NonZeroU32, + cfg: &BTreeMap, +) -> Result<(), WriteBufferError> { + let admin = admin_client(kafka_connection)?; + + let mut topic = NewTopic::new( + database_name, + n_sequencers.get() as i32, + TopicReplication::Fixed(1), + ); + for (k, v) in cfg { + topic = topic.set(k, v); + } + + let opts = AdminOptions::default(); + let mut results = admin.create_topics([&topic], &opts).await?; + assert_eq!(results.len(), 1, "created exactly one topic"); + let result = results.pop().expect("just checked the vector length"); + match result { + Ok(topic) | Err((topic, RDKafkaErrorCode::TopicAlreadyExists)) => { + assert_eq!(topic, database_name); + Ok(()) + } + Err((topic, code)) => { + assert_eq!(topic, database_name); + Err(format!("Cannot create topic '{}': {}", topic, code).into()) + } + } +} + +/// If there are no Kafka partitions, then create a topic. Production should have Kafka partitions +/// created already, so this should only create a topic in test/dev environments. 
+async fn maybe_auto_create_topics( + kafka_connection: &str, + database_name: &str, + creation_config: Option<&WriteBufferCreationConfig>, + cfg: &ClientConfig, +) -> Result, WriteBufferError> { + const N_TRIES: usize = 10; + + for i in 0..N_TRIES { + if let Some(partitions) = get_partitions(database_name, cfg).await? { + return Ok(partitions); + } + + // debounce after first round + if i > 0 { + info!( + topic=%database_name, + "Topic does not have partitions after creating it, wait a bit and try again." + ); + tokio::time::sleep(Duration::from_millis(250)).await; + } + + if let Some(creation_config) = creation_config { + create_kafka_topic( + kafka_connection, + database_name, + creation_config.n_shards, + &creation_config.options, + ) + .await?; + } else { + return Err("no partitions found and auto-creation not requested" + .to_string() + .into()); + } + } + + Err(format!("Could not auto-create topic after {} tries.", N_TRIES).into()) +} diff --git a/write_buffer/src/kafka/record_aggregator.rs b/write_buffer/src/kafka/record_aggregator.rs deleted file mode 100644 index 23cc68aefd..0000000000 --- a/write_buffer/src/kafka/record_aggregator.rs +++ /dev/null @@ -1,324 +0,0 @@ -use std::sync::Arc; - -use data_types::{Sequence, SequenceNumber, ShardIndex}; -use dml::{DmlMeta, DmlOperation}; -use iox_time::{Time, TimeProvider}; -use observability_deps::tracing::warn; -use rskafka::{ - client::producer::aggregator::{ - Aggregator, Error, RecordAggregator as RecordAggregatorDelegate, - RecordAggregatorStatusDeaggregator, StatusDeaggregator, TryPush, - }, - record::Record, -}; -use trace::ctx::SpanContext; - -use crate::codec::{ContentType, IoxHeaders}; - -/// The [`Tag`] is a data-carrying token identifier used to de-aggregate -/// responses from a batch aggregated of requests using the -/// [`DmlMetaDeaggregator`]. -#[derive(Debug)] -pub struct Tag { - /// The tag into the batch returned by the - /// [`RecordAggregatorDelegate::try_push()`] call. 
- idx: usize, - - /// The timestamp assigned to the resulting Kafka [`Record`]. - timestamp: Time, - /// A span extracted from the original [`DmlOperation`]. - span_ctx: Option, - /// The approximate byte size of the serialised [`Record`], as calculated by - /// [`Record::approximate_size()`]. - approx_kafka_write_size: usize, -} - -/// A [`RecordAggregator`] implements [rskafka]'s abstract [`Aggregator`] -/// behaviour to provide batching of requests for a single Kafka partition. -/// -/// Specifically the [`RecordAggregator`] maps [`DmlOperation`] instances to -/// Kafka [`Record`] instances, and delegates the batching to the -/// [`RecordAggregatorDelegate`] implementation maintained within [rskafka] -/// itself. -/// -/// [rskafka]: https://github.com/influxdata/rskafka -#[derive(Debug)] -pub struct RecordAggregator { - time_provider: Arc, - - /// The shard index (Kafka partition number) this aggregator batches ops for (from Kafka, - /// not the catalog). - shard_index: ShardIndex, - - /// The underlying record aggregator the non-IOx-specific batching is - /// delegated to. - aggregator: RecordAggregatorDelegate, -} - -impl RecordAggregator { - /// Initialise a new [`RecordAggregator`] to aggregate up to - /// `max_batch_size` number of bytes per message. - pub fn new( - shard_index: ShardIndex, - max_batch_size: usize, - time_provider: Arc, - ) -> Self { - Self { - shard_index, - aggregator: RecordAggregatorDelegate::new(max_batch_size), - time_provider, - } - } -} - -impl RecordAggregator { - /// Serialise the [`DmlOperation`] destined for the specified `db_name` into a - /// [`Record`], returning the producer timestamp assigned to it. 
- fn to_record(&self, op: &DmlOperation) -> Result<(Record, Time), Error> { - let now = op - .meta() - .producer_ts() - .unwrap_or_else(|| self.time_provider.now()); - - let headers = IoxHeaders::new( - ContentType::Protobuf, - op.meta().span_context().cloned(), - op.namespace().to_owned(), - ); - - let mut buf = Vec::new(); - crate::codec::encode_operation(op.namespace(), op, &mut buf)?; - buf.shrink_to_fit(); - - let record = Record { - key: None, - value: Some(buf), - headers: headers - .headers() - .map(|(k, v)| (k.to_owned(), v.as_bytes().to_vec())) - .collect(), - timestamp: now.date_time(), - }; - - Ok((record, now)) - } -} - -impl Aggregator for RecordAggregator { - type Input = DmlOperation; - type Tag = ::Tag; - type StatusDeaggregator = DmlMetaDeaggregator; - - /// Callers should retain the returned [`Tag`] in order to de-aggregate the - /// [`DmlMeta`] from the request response. - fn try_push(&mut self, op: Self::Input) -> Result, Error> { - // Encode the DML op to a Record - let (record, timestamp) = self.to_record(&op)?; - - // Capture various metadata necessary to construct the Tag/DmlMeta for - // the caller once a batch has been flushed. - let span_ctx = op.meta().span_context().cloned(); - let approx_kafka_write_size = record.approximate_size(); - - // And delegate batching to rskafka's RecordAggregator implementation - Ok(match self.aggregator.try_push(record)? { - // NoCapacity returns the original input to the caller when the - // batching fails. - // - // The RecordBatcher delegate is returning the Record encoded from - // op above, but the caller of this fn is expecting the original op. - // - // Map to the original input op this fn was called with, discarding - // the encoded Record. - TryPush::NoCapacity(_) => { - // Log a warning if this occurs - this allows an operator to - // increase the maximum Kafka message size, or lower the linger - // time to minimise latency while still producing large enough - // batches for it to be worth while. 
- warn!("aggregated batch reached maximum capacity"); - TryPush::NoCapacity(op) - } - - // A successful delegate aggregation returns the tag for offset - // de-aggregation later. For simplicity, the tag this layer returns - // also carries the various (small) metadata elements needed to - // construct the DmlMeta at the point of de-aggregation. - TryPush::Aggregated(idx) => TryPush::Aggregated(Tag { - idx, - timestamp, - span_ctx, - approx_kafka_write_size, - }), - }) - } - - fn flush(&mut self) -> Result<(Vec, Self::StatusDeaggregator), Error> { - let records = self.aggregator.flush()?.0; - Ok((records, DmlMetaDeaggregator::new(self.shard_index))) - } -} - -/// The de-aggregation half of the [`RecordAggregator`], this type consumes the -/// caller's [`Tag`] obtained from the aggregator to return the corresponding -/// [`DmlMeta`] from the batched response. -/// -/// The [`DmlMetaDeaggregator`] is a stateless wrapper over the (also stateless) -/// [`RecordAggregatorStatusDeaggregator`] delegate, with most of the metadata -/// elements carried in the [`Tag`] itself. -#[derive(Debug)] -pub struct DmlMetaDeaggregator { - shard_index: ShardIndex, -} - -impl DmlMetaDeaggregator { - pub fn new(shard_index: ShardIndex) -> Self { - Self { shard_index } - } -} - -impl StatusDeaggregator for DmlMetaDeaggregator { - type Status = DmlMeta; - type Tag = Tag; - - fn deaggregate(&self, input: &[i64], tag: Self::Tag) -> Result { - // Delegate de-aggregation to the (stateless) record batch - // de-aggregator for forwards compatibility. 
- let offset = RecordAggregatorStatusDeaggregator::default() - .deaggregate(input, tag.idx) - .expect("invalid de-aggregation index"); - - Ok(DmlMeta::sequenced( - Sequence::new(self.shard_index, SequenceNumber::new(offset)), - tag.timestamp, - tag.span_ctx, - tag.approx_kafka_write_size, - )) - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use dml::DmlWrite; - use hashbrown::HashMap; - use iox_time::MockProvider; - use mutable_batch::{writer::Writer, MutableBatch}; - use trace::LogTraceCollector; - - use crate::codec::{ - CONTENT_TYPE_PROTOBUF, HEADER_CONTENT_TYPE, HEADER_NAMESPACE, HEADER_TRACE_CONTEXT, - }; - - use super::*; - - const NAMESPACE: &str = "bananas"; - const SHARD_INDEX: ShardIndex = ShardIndex::new(42); - const TIMESTAMP_MILLIS: i64 = 1659990497000; - - fn test_op() -> DmlOperation { - let mut batch = MutableBatch::new(); - let mut writer = Writer::new(&mut batch, 1); - writer - // Date: "1970-01-01" - .write_time("time", [42].into_iter()) - .unwrap(); - writer - .write_i64("A", Some(&[0b00000001]), [1].into_iter()) - .unwrap(); - writer.commit(); - - let mut m = HashMap::default(); - m.insert("table".to_string(), batch); - - let span = SpanContext::new(Arc::new(LogTraceCollector::new())); - - DmlOperation::Write(DmlWrite::new( - NAMESPACE.to_string(), - m, - Some("1970-01-01".into()), - DmlMeta::unsequenced(Some(span)), - )) - } - - #[test] - fn test_record_aggregate() { - let clock = Arc::new(MockProvider::new(Time::from_timestamp_millis( - TIMESTAMP_MILLIS, - ))); - let mut agg = RecordAggregator::new(SHARD_INDEX, usize::MAX, clock); - let write = test_op(); - - let res = agg.try_push(write).expect("aggregate call should succeed"); - let tag = match res { - TryPush::NoCapacity(_) => panic!("unexpected no capacity"), - TryPush::Aggregated(tag) => tag, - }; - - // Flush the aggregator to acquire the records - let (records, deagg) = agg.flush().expect("should flush"); - assert_eq!(records.len(), 1); - - // Another flush should not 
yield the same records - let (records2, _) = agg.flush().expect("should flush"); - assert!(records2.is_empty()); - - // Assert properties of the resulting record - let record = records[0].clone(); - assert_eq!(record.key, None); - assert!(record.value.is_some()); - assert_eq!( - *record - .headers - .get(HEADER_CONTENT_TYPE) - .expect("no content type"), - Vec::::from(CONTENT_TYPE_PROTOBUF), - ); - assert_eq!( - *record - .headers - .get(HEADER_NAMESPACE) - .expect("no namespace header"), - Vec::::from(NAMESPACE), - ); - assert!(record.headers.get(HEADER_TRACE_CONTEXT).is_some()); - assert_eq!(record.timestamp.timestamp(), 1659990497); - - // Extract the DmlMeta from the de-aggregator - let got = deagg - .deaggregate(&[4242], tag) - .expect("de-aggregate should succeed"); - - // Assert the metadata properties - assert!(got.span_context().is_some()); - assert_eq!( - *got.sequence().expect("should be sequenced"), - Sequence::new(SHARD_INDEX, SequenceNumber::new(4242)) - ); - assert_eq!( - got.producer_ts().expect("no producer timestamp"), - Time::from_timestamp_millis(TIMESTAMP_MILLIS) - ); - assert_eq!( - got.bytes_read().expect("no approx size"), - record.approximate_size() - ); - } - - #[test] - fn test_record_aggregate_no_capacity() { - let clock = Arc::new(MockProvider::new(Time::from_timestamp_millis( - TIMESTAMP_MILLIS, - ))); - let mut agg = RecordAggregator::new(SHARD_INDEX, usize::MIN, clock); - let write = test_op(); - - let res = agg - .try_push(write.clone()) - .expect("aggregate call should succeed"); - match res { - TryPush::NoCapacity(res) => assert_eq!(res.namespace(), write.namespace()), - TryPush::Aggregated(_) => panic!("expected no capacity"), - }; - } -} From fda1479db0e4dd622486c5494d4c030c4f0121e0 Mon Sep 17 00:00:00 2001 From: Luke Bond Date: Tue, 11 Oct 2022 11:33:42 +0100 Subject: [PATCH 34/40] chore: add trace log to ingester to aid debugging (#5829) --- ingester/src/stream_handler/handler.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff 
--git a/ingester/src/stream_handler/handler.rs b/ingester/src/stream_handler/handler.rs index 3fa563b188..1b163ea325 100644 --- a/ingester/src/stream_handler/handler.rs +++ b/ingester/src/stream_handler/handler.rs @@ -396,6 +396,12 @@ something clever.", if let Some(delta) = duration_since_production { // Update the TTBR metric before potentially sleeping. self.time_to_be_readable.set(delta); + trace!( + kafka_topic=%self.topic_name, + shard_index=%self.shard_index, + delta=%delta.as_millis(), + "reporting TTBR for shard (ms)" + ); } if should_pause { From b77c3540e1a8cd266eb042d0374bfcc5570d7f29 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Tue, 11 Oct 2022 13:01:10 +0200 Subject: [PATCH 35/40] revert: rdkafka/rskafka swapping (#5800) This reverts commit 33391af973d73dd8ce0a295f31b479cdc03ce28e. --- Cargo.lock | 86 ----- write_buffer/Cargo.toml | 1 - write_buffer/src/config.rs | 39 +- write_buffer/src/core.rs | 9 - write_buffer/src/kafka/instrumentation.rs | 1 - write_buffer/src/kafka/mod.rs | 144 ++++++- write_buffer/src/kafka/rdkafka.rs | 404 -------------------- write_buffer/src/kafka/record_aggregator.rs | 324 ++++++++++++++++ 8 files changed, 466 insertions(+), 542 deletions(-) delete mode 100644 write_buffer/src/kafka/rdkafka.rs create mode 100644 write_buffer/src/kafka/record_aggregator.rs diff --git a/Cargo.lock b/Cargo.lock index dc3c3c3be8..d78dcedd8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2686,18 +2686,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" -[[package]] -name = "libz-sys" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "linux-raw-sys" version = "0.0.46" @@ -3116,27 +3104,6 @@ dependencies = [ "libc", ] -[[package]] -name = 
"num_enum" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf5395665662ef45796a4ff5486c5d41d29e0c09640af4c5f17fd94ee2c119c9" -dependencies = [ - "num_enum_derive", -] - -[[package]] -name = "num_enum_derive" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b0498641e53dd6ac1a4f22547548caa6864cc4933784319cd1775271c5a46ce" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "object" version = "0.29.0" @@ -3553,12 +3520,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" - [[package]] name = "pprof" version = "0.10.1" @@ -3662,17 +3623,6 @@ dependencies = [ "syn", ] -[[package]] -name = "proc-macro-crate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9" -dependencies = [ - "once_cell", - "thiserror", - "toml", -] - [[package]] name = "proc-macro-error" version = "1.0.4" @@ -4067,35 +4017,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "rdkafka" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1de127f294f2dba488ed46760b129d5ecbeabbd337ccbf3739cb29d50db2161c" -dependencies = [ - "futures", - "libc", - "log", - "rdkafka-sys", - "serde", - "serde_derive", - "serde_json", - "slab", - "tokio", -] - -[[package]] -name = "rdkafka-sys" -version = "4.2.0+1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e542c6863b04ce0fa0c5719bc6b7b348cf8dd21af1bb03c9db5f9805b2a6473" -dependencies = [ - "libc", - "libz-sys", - 
"num_enum", - "pkg-config", -] - [[package]] name = "read_buffer" version = "0.1.0" @@ -5711,12 +5632,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -6066,7 +5981,6 @@ dependencies = [ "parking_lot 0.12.1", "pin-project", "prost 0.11.0", - "rdkafka", "rskafka", "schema", "tempfile", diff --git a/write_buffer/Cargo.toml b/write_buffer/Cargo.toml index bfc9fcc7b1..5bab0fe55b 100644 --- a/write_buffer/Cargo.toml +++ b/write_buffer/Cargo.toml @@ -22,7 +22,6 @@ observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.0" prost = "0.11" -rdkafka = "0.28.0" rskafka = { git = "https://github.com/influxdata/rskafka.git", rev="3208e4742f08048bbab4e8fc4e0a775507fe3e66", default-features = false, features = ["compression-snappy", "transport-socks5"] } schema = { path = "../schema" } tokio = { version = "1.21", features = ["fs", "macros", "parking_lot", "rt", "sync", "time"] } diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index a716a56ed5..1d02e60cc9 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -1,7 +1,7 @@ use crate::{ core::{WriteBufferError, WriteBufferReading, WriteBufferWriting}, file::{FileBufferConsumer, FileBufferProducer}, - kafka::RSKafkaConsumer, + kafka::{RSKafkaConsumer, RSKafkaProducer}, mock::{ MockBufferForReading, MockBufferForReadingThatAlwaysErrors, MockBufferForWriting, MockBufferForWritingThatAlwaysErrors, MockBufferSharedState, @@ -152,8 +152,8 @@ impl WriteBufferConfigFactory { pub async fn new_config_write( &self, db_name: &str, - _partitions: Option>, - _trace_collector: Option<&Arc>, + partitions: 
Option>, + trace_collector: Option<&Arc>, cfg: &WriteBufferConnection, ) -> Result, WriteBufferError> { let writer = match &cfg.type_[..] { @@ -168,7 +168,20 @@ impl WriteBufferConfigFactory { .await?; Arc::new(file_buffer) as _ } - "kafka" => self.kafka_buffer_producer(db_name, cfg).await?, + "kafka" => { + let rskafa_buffer = RSKafkaProducer::new( + cfg.connection.clone(), + db_name.to_owned(), + &cfg.connection_config, + Arc::clone(&self.time_provider), + cfg.creation_config.as_ref(), + partitions, + trace_collector.map(Arc::clone), + &*self.metric_registry, + ) + .await?; + Arc::new(rskafa_buffer) as _ + } "mock" => match self.get_mock(&cfg.connection)? { Mock::Normal(state) => { let mock_buffer = MockBufferForWriting::new( @@ -191,24 +204,6 @@ impl WriteBufferConfigFactory { Ok(writer) } - async fn kafka_buffer_producer( - &self, - db_name: &str, - cfg: &WriteBufferConnection, - ) -> Result, WriteBufferError> { - let kafka_buffer = crate::kafka::rdkafka::KafkaBufferProducer::new( - &cfg.connection, - db_name, - &cfg.connection_config, - cfg.creation_config.as_ref(), - Arc::clone(&self.time_provider), - &self.metric_registry, - ) - .await?; - - Ok(Arc::new(kafka_buffer) as _) - } - /// Returns a new [`WriteBufferReading`] for the provided [`WriteBufferConnection`] pub async fn new_config_read( &self, diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index afbe16d82e..04d08faa13 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -80,15 +80,6 @@ impl From for WriteBufferError { } } -impl From for WriteBufferError { - fn from(e: rdkafka::error::KafkaError) -> Self { - Self { - inner: Box::new(e), - kind: WriteBufferErrorKind::IO, - } - } -} - impl From for WriteBufferError { fn from(e: rskafka::client::error::Error) -> Self { Self { diff --git a/write_buffer/src/kafka/instrumentation.rs b/write_buffer/src/kafka/instrumentation.rs index 4c29cc8ab7..283dc281fd 100644 --- a/write_buffer/src/kafka/instrumentation.rs +++ 
b/write_buffer/src/kafka/instrumentation.rs @@ -36,7 +36,6 @@ pub struct KafkaProducerMetrics

{ impl KafkaProducerMetrics { /// Decorate the specified [`ProducerClient`] implementation with an /// instrumentation layer. - #[allow(dead_code)] pub fn new( client: Box, kafka_topic_name: String, diff --git a/write_buffer/src/kafka/mod.rs b/write_buffer/src/kafka/mod.rs index c33f65c541..99aad39d1f 100644 --- a/write_buffer/src/kafka/mod.rs +++ b/write_buffer/src/kafka/mod.rs @@ -1,17 +1,24 @@ -use self::config::{ClientConfig, ConsumerConfig, TopicCreationConfig}; +use self::{ + config::{ClientConfig, ConsumerConfig, ProducerConfig, TopicCreationConfig}, + instrumentation::KafkaProducerMetrics, + record_aggregator::RecordAggregator, +}; use crate::{ codec::IoxHeaders, config::WriteBufferCreationConfig, - core::{WriteBufferError, WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler}, + core::{ + WriteBufferError, WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler, + WriteBufferWriting, + }, }; use async_trait::async_trait; use data_types::{Sequence, SequenceNumber, ShardIndex}; -use dml::DmlOperation; +use dml::{DmlMeta, DmlOperation}; use futures::{ stream::{self, BoxStream}, StreamExt, TryStreamExt, }; -use iox_time::Time; +use iox_time::{Time, TimeProvider}; use observability_deps::tracing::warn; use parking_lot::Mutex; use rskafka::{ @@ -19,6 +26,7 @@ use rskafka::{ consumer::{StartOffset, StreamConsumerBuilder}, error::{Error as RSKafkaError, ProtocolError}, partition::{OffsetAt, PartitionClient, UnknownTopicHandling}, + producer::{BatchProducer, BatchProducerBuilder}, ClientBuilder, }, record::RecordAndOffset, @@ -35,13 +43,112 @@ use trace::TraceCollector; mod config; mod instrumentation; -pub(crate) mod rdkafka; +mod record_aggregator; /// Maximum number of jobs buffered and decoded concurrently. 
const CONCURRENT_DECODE_JOBS: usize = 10; type Result = std::result::Result; +#[derive(Debug)] +pub struct RSKafkaProducer { + producers: BTreeMap>, +} + +impl RSKafkaProducer { + #[allow(clippy::too_many_arguments)] + pub async fn new<'a>( + conn: String, + topic_name: String, + connection_config: &'a BTreeMap, + time_provider: Arc, + creation_config: Option<&'a WriteBufferCreationConfig>, + partitions: Option>, + _trace_collector: Option>, + metric_registry: &'a metric::Registry, + ) -> Result { + let partition_clients = setup_topic( + conn, + topic_name.clone(), + connection_config, + creation_config, + partitions, + ) + .await?; + + let producer_config = ProducerConfig::try_from(connection_config)?; + + let producers = partition_clients + .into_iter() + .map(|(shard_index, partition_client)| { + // Instrument this kafka partition client. + let partition_client = KafkaProducerMetrics::new( + Box::new(partition_client), + topic_name.clone(), + shard_index, + metric_registry, + ); + + let mut producer_builder = + BatchProducerBuilder::new_with_client(Arc::new(partition_client)); + if let Some(linger) = producer_config.linger { + producer_builder = producer_builder.with_linger(linger); + } + let producer = producer_builder.build(RecordAggregator::new( + shard_index, + producer_config.max_batch_size, + Arc::clone(&time_provider), + )); + + (shard_index, producer) + }) + .collect(); + + Ok(Self { producers }) + } +} + +#[async_trait] +impl WriteBufferWriting for RSKafkaProducer { + fn shard_indexes(&self) -> BTreeSet { + self.producers.keys().copied().collect() + } + + async fn store_operation( + &self, + shard_index: ShardIndex, + operation: DmlOperation, + ) -> Result { + // Sanity check to ensure only partitioned writes are pushed into Kafka. 
+ if let DmlOperation::Write(w) = &operation { + assert!( + w.partition_key().is_some(), + "enqueuing unpartitioned write into kafka" + ) + } + + let producer = self + .producers + .get(&shard_index) + .ok_or_else::(|| { + format!("Unknown shard index: {}", shard_index).into() + })?; + + Ok(producer.produce(operation).await?) + } + + async fn flush(&self) -> Result<(), WriteBufferError> { + for producer in self.producers.values() { + producer.flush().await?; + } + Ok(()) + } + + fn type_name(&self) -> &'static str { + "kafka" + } +} + #[derive(Debug)] pub struct RSKafkaStreamHandler { partition_client: Arc, @@ -418,17 +525,14 @@ async fn setup_topic( mod tests { use super::*; use crate::{ - core::{ - test_utils::{ - assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name, - set_pop_first, TestAdapter, TestContext, - }, - WriteBufferWriting, + core::test_utils::{ + assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name, + set_pop_first, TestAdapter, TestContext, }, maybe_skip_kafka_integration, }; use data_types::{DeletePredicate, PartitionKey, TimestampRange}; - use dml::{test_util::assert_write_op_eq, DmlDelete, DmlMeta, DmlWrite}; + use dml::{test_util::assert_write_op_eq, DmlDelete, DmlWrite}; use futures::{stream::FuturesUnordered, TryStreamExt}; use iox_time::TimeProvider; use rskafka::{client::partition::Compression, record::Record}; @@ -491,17 +595,19 @@ mod tests { #[async_trait] impl TestContext for RSKafkaTestContext { - type Writing = rdkafka::KafkaBufferProducer; + type Writing = RSKafkaProducer; type Reading = RSKafkaConsumer; async fn writing(&self, creation_config: bool) -> Result { - rdkafka::KafkaBufferProducer::new( + RSKafkaProducer::new( self.conn.clone(), self.topic_name.clone(), &BTreeMap::default(), - self.creation_config(creation_config).as_ref(), Arc::clone(&self.time_provider), + self.creation_config(creation_config).as_ref(), + None, + Some(self.trace_collector() as Arc<_>), &self.metrics, ) .await 
@@ -744,9 +850,9 @@ mod tests { .unwrap(); } - async fn write( + async fn write( namespace: &str, - producer: &T, + producer: &RSKafkaProducer, trace_collector: &Arc, shard_index: ShardIndex, partition_key: impl Into + Send, @@ -763,9 +869,9 @@ mod tests { producer.store_operation(shard_index, op).await.unwrap() } - async fn delete( + async fn delete( namespace: &str, - producer: &T, + producer: &RSKafkaProducer, trace_collector: &Arc, shard_index: ShardIndex, ) -> DmlMeta { diff --git a/write_buffer/src/kafka/rdkafka.rs b/write_buffer/src/kafka/rdkafka.rs deleted file mode 100644 index 33ca8a74f7..0000000000 --- a/write_buffer/src/kafka/rdkafka.rs +++ /dev/null @@ -1,404 +0,0 @@ -use crate::{ - codec::{ContentType, IoxHeaders}, - core::{WriteBufferError, WriteBufferWriting}, - kafka::WriteBufferCreationConfig, -}; -use async_trait::async_trait; -use data_types::{Sequence, SequenceNumber, ShardIndex}; -use dml::{DmlMeta, DmlOperation}; -use iox_time::{Time, TimeProvider}; -use metric::{Attributes, DurationHistogram, Metric}; -use observability_deps::tracing::{debug, info}; -use rdkafka::{ - admin::{AdminClient, AdminOptions, NewTopic, TopicReplication}, - client::DefaultClientContext, - consumer::{BaseConsumer, Consumer}, - error::KafkaError, - message::{Headers, OwnedHeaders}, - producer::{FutureProducer, FutureRecord, Producer}, - types::RDKafkaErrorCode, - util::Timeout, - ClientConfig, -}; -use std::{ - collections::{BTreeMap, BTreeSet}, - num::NonZeroU32, - sync::Arc, - time::Duration, -}; - -/// Default timeout supplied to rdkafka client for kafka operations. -/// -/// Chosen to be a value less than the default gRPC timeout (30 -/// seconds) so we can detect kafka errors and return them prior to -/// the gRPC requests to IOx timing out. 
-/// -/// More context in -/// -const KAFKA_OPERATION_TIMEOUT_MS: u64 = 20_000; - -impl From<&IoxHeaders> for OwnedHeaders { - fn from(iox_headers: &IoxHeaders) -> Self { - let mut res = Self::new(); - - for (header, value) in iox_headers.headers() { - res = res.add(header, value.as_ref()); - } - - res - } -} - -pub struct KafkaBufferProducer { - conn: String, - database_name: String, - time_provider: Arc, - producer: Arc>, - partitions: BTreeSet, - enqueue: Metric, -} - -// Needed because rdkafka's FutureProducer doesn't impl Debug -impl std::fmt::Debug for KafkaBufferProducer { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("KafkaBufferProducer") - .field("conn", &self.conn) - .field("database_name", &self.database_name) - .finish() - } -} - -#[async_trait] -impl WriteBufferWriting for KafkaBufferProducer { - fn shard_indexes(&self) -> BTreeSet { - self.partitions.clone() - } - - /// Send a [`DmlOperation`] to the write buffer using the specified shard index. - async fn store_operation( - &self, - shard_index: ShardIndex, - operation: DmlOperation, - ) -> Result { - // Sanity check to ensure only partitioned writes are pushed into Kafka. - if let DmlOperation::Write(w) = &operation { - assert!( - w.partition_key().is_some(), - "enqueuing unpartitioned write into kafka" - ) - } - - // Only send writes with known shard indexes to Kafka. 
- if !self.partitions.contains(&shard_index) { - return Err(format!("Unknown shard index: {}", shard_index).into()); - } - - let kafka_partition_id = shard_index.get(); - - let enqueue_start = self.time_provider.now(); - - // truncate milliseconds from timestamps because that's what Kafka supports - let now = operation - .meta() - .producer_ts() - .unwrap_or_else(|| self.time_provider.now()); - - let timestamp_millis = now.date_time().timestamp_millis(); - let timestamp = Time::from_timestamp_millis(timestamp_millis); - - let headers = IoxHeaders::new( - ContentType::Protobuf, - operation.meta().span_context().cloned(), - operation.namespace().to_string(), - ); - - let mut buf = Vec::new(); - crate::codec::encode_operation(&self.database_name, &operation, &mut buf)?; - - // This type annotation is necessary because `FutureRecord` is generic over key type, but - // key is optional and we're not setting a key. `String` is arbitrary. - let record: FutureRecord<'_, String, _> = FutureRecord::to(&self.database_name) - .payload(&buf) - .partition(kafka_partition_id) - .timestamp(timestamp_millis) - .headers((&headers).into()); - let kafka_write_size = estimate_message_size( - record.payload.map(|v| v.as_ref()), - record.key.map(|s| s.as_bytes()), - record.headers.as_ref(), - ); - - debug!(db_name=%self.database_name, kafka_partition_id, size=buf.len(), "writing to kafka"); - - let res = self.producer.send(record, Timeout::Never).await; - - if let Some(delta) = self - .time_provider - .now() - .checked_duration_since(enqueue_start) - { - let result_attr = match &res { - Ok(_) => "success", - Err(_) => "error", - }; - - let attr = Attributes::from([ - ("kafka_partition", shard_index.to_string().into()), - ("kafka_topic", self.database_name.clone().into()), - ("result", result_attr.into()), - ]); - - let recorder = self.enqueue.recorder(attr); - recorder.record(delta); - } - - let (partition, offset) = res.map_err(|(e, _owned_message)| e)?; - - 
debug!(db_name=%self.database_name, %offset, %partition, size=buf.len(), "wrote to kafka"); - - Ok(DmlMeta::sequenced( - Sequence::new(shard_index, SequenceNumber::new(offset)), - timestamp, - operation.meta().span_context().cloned(), - kafka_write_size, - )) - } - - async fn flush(&self) -> Result<(), WriteBufferError> { - let producer = Arc::clone(&self.producer); - - tokio::task::spawn_blocking(move || { - producer.flush(Timeout::Never); - }) - .await - .expect("subtask failed"); - - Ok(()) - } - - fn type_name(&self) -> &'static str { - "kafka" - } -} - -impl KafkaBufferProducer { - pub async fn new( - conn: impl Into + Send, - database_name: impl Into + Send, - connection_config: &BTreeMap, - creation_config: Option<&WriteBufferCreationConfig>, - time_provider: Arc, - metric_registry: &metric::Registry, - ) -> Result { - let conn = conn.into(); - let database_name = database_name.into(); - - let mut cfg = ClientConfig::new(); - - // these configs can be overwritten - cfg.set("message.timeout.ms", "5000"); - cfg.set("message.max.bytes", "31457280"); - cfg.set("message.send.max.retries", "10"); - cfg.set("queue.buffering.max.kbytes", "31457280"); - cfg.set("request.required.acks", "all"); // equivalent to acks=-1 - cfg.set("compression.type", "snappy"); - cfg.set("statistics.interval.ms", "15000"); - - // user overrides - for (k, v) in connection_config { - cfg.set(k, v); - } - - // these configs are set in stone - cfg.set("bootstrap.servers", &conn); - cfg.set("allow.auto.create.topics", "false"); - - // handle auto-creation - let partitions = - maybe_auto_create_topics(&conn, &database_name, creation_config, &cfg).await?; - - let producer = cfg.create()?; - - let enqueue = metric_registry.register_metric::( - "write_buffer_client_produce_duration", - "duration of time taken to push a set of records to kafka \ - - includes codec, protocol, and network overhead", - ); - - Ok(Self { - conn, - database_name, - time_provider, - producer: Arc::new(producer), - 
partitions, - enqueue, - }) - } -} - -/// Iterate over the kafka messages -fn header_iter(headers: Option<&H>) -> impl Iterator -where - H: Headers, -{ - headers - .into_iter() - .flat_map(|headers| (0..headers.count()).map(|idx| headers.get(idx).unwrap())) -} - -/// Estimate size of data read from kafka as payload len + key len + headers -fn estimate_message_size( - payload: Option<&[u8]>, - key: Option<&[u8]>, - headers: Option<&H>, -) -> usize -where - H: Headers, -{ - payload.map(|payload| payload.len()).unwrap_or_default() - + key.map(|key| key.len()).unwrap_or_default() - + header_iter(headers) - .map(|(key, value)| key.len() + value.len()) - .sum::() -} - -/// Get Kafka partition IDs (IOx ShardIndexes) for the database-specific Kafka topic. -/// -/// Will return `None` if the topic is unknown and has to be created. -/// -/// This will check that the partition is is non-empty. -async fn get_partitions( - database_name: &str, - cfg: &ClientConfig, -) -> Result>, WriteBufferError> { - let database_name = database_name.to_string(); - let cfg = cfg.clone(); - - let metadata = tokio::task::spawn_blocking(move || { - let probe_consumer: BaseConsumer = cfg.create()?; - - probe_consumer.fetch_metadata( - Some(&database_name), - Duration::from_millis(KAFKA_OPERATION_TIMEOUT_MS), - ) - }) - .await - .expect("subtask failed")?; - - let topic_metadata = metadata.topics().get(0).expect("requested a single topic"); - - match topic_metadata.error() { - None => { - let partitions: BTreeSet<_> = topic_metadata - .partitions() - .iter() - .map(|partition_metdata| ShardIndex::new(partition_metdata.id())) - .collect(); - - if partitions.is_empty() { - Err("Topic exists but has no partitions".to_string().into()) - } else { - Ok(Some(partitions)) - } - } - Some(error_code) => { - let error_code: RDKafkaErrorCode = error_code.into(); - match error_code { - RDKafkaErrorCode::UnknownTopic | RDKafkaErrorCode::UnknownTopicOrPartition => { - // The caller is responsible for creating the 
topic, so this is somewhat OK. - Ok(None) - } - _ => Err(KafkaError::MetadataFetch(error_code).into()), - } - } - } -} - -fn admin_client(kafka_connection: &str) -> Result, KafkaError> { - let mut cfg = ClientConfig::new(); - cfg.set("bootstrap.servers", kafka_connection); - cfg.set("message.timeout.ms", "5000"); - cfg.create() -} - -/// Create Kafka topic based on the provided configs. -/// -/// This will create a topic with `n_sequencers` Kafka partitions. -/// -/// This will NOT fail if the topic already exists! `maybe_auto_create_topics` will only call this -/// if there are no partitions. Production should always have partitions already created, so -/// `create_kafka_topic` shouldn't run in production and is only for test/dev environments. -async fn create_kafka_topic( - kafka_connection: &str, - database_name: &str, - n_sequencers: NonZeroU32, - cfg: &BTreeMap, -) -> Result<(), WriteBufferError> { - let admin = admin_client(kafka_connection)?; - - let mut topic = NewTopic::new( - database_name, - n_sequencers.get() as i32, - TopicReplication::Fixed(1), - ); - for (k, v) in cfg { - topic = topic.set(k, v); - } - - let opts = AdminOptions::default(); - let mut results = admin.create_topics([&topic], &opts).await?; - assert_eq!(results.len(), 1, "created exactly one topic"); - let result = results.pop().expect("just checked the vector length"); - match result { - Ok(topic) | Err((topic, RDKafkaErrorCode::TopicAlreadyExists)) => { - assert_eq!(topic, database_name); - Ok(()) - } - Err((topic, code)) => { - assert_eq!(topic, database_name); - Err(format!("Cannot create topic '{}': {}", topic, code).into()) - } - } -} - -/// If there are no Kafka partitions, then create a topic. Production should have Kafka partitions -/// created already, so this should only create a topic in test/dev environments. 
-async fn maybe_auto_create_topics( - kafka_connection: &str, - database_name: &str, - creation_config: Option<&WriteBufferCreationConfig>, - cfg: &ClientConfig, -) -> Result, WriteBufferError> { - const N_TRIES: usize = 10; - - for i in 0..N_TRIES { - if let Some(partitions) = get_partitions(database_name, cfg).await? { - return Ok(partitions); - } - - // debounce after first round - if i > 0 { - info!( - topic=%database_name, - "Topic does not have partitions after creating it, wait a bit and try again." - ); - tokio::time::sleep(Duration::from_millis(250)).await; - } - - if let Some(creation_config) = creation_config { - create_kafka_topic( - kafka_connection, - database_name, - creation_config.n_shards, - &creation_config.options, - ) - .await?; - } else { - return Err("no partitions found and auto-creation not requested" - .to_string() - .into()); - } - } - - Err(format!("Could not auto-create topic after {} tries.", N_TRIES).into()) -} diff --git a/write_buffer/src/kafka/record_aggregator.rs b/write_buffer/src/kafka/record_aggregator.rs new file mode 100644 index 0000000000..23cc68aefd --- /dev/null +++ b/write_buffer/src/kafka/record_aggregator.rs @@ -0,0 +1,324 @@ +use std::sync::Arc; + +use data_types::{Sequence, SequenceNumber, ShardIndex}; +use dml::{DmlMeta, DmlOperation}; +use iox_time::{Time, TimeProvider}; +use observability_deps::tracing::warn; +use rskafka::{ + client::producer::aggregator::{ + Aggregator, Error, RecordAggregator as RecordAggregatorDelegate, + RecordAggregatorStatusDeaggregator, StatusDeaggregator, TryPush, + }, + record::Record, +}; +use trace::ctx::SpanContext; + +use crate::codec::{ContentType, IoxHeaders}; + +/// The [`Tag`] is a data-carrying token identifier used to de-aggregate +/// responses from a batch aggregated of requests using the +/// [`DmlMetaDeaggregator`]. +#[derive(Debug)] +pub struct Tag { + /// The tag into the batch returned by the + /// [`RecordAggregatorDelegate::try_push()`] call. 
+ idx: usize, + + /// The timestamp assigned to the resulting Kafka [`Record`]. + timestamp: Time, + /// A span extracted from the original [`DmlOperation`]. + span_ctx: Option, + /// The approximate byte size of the serialised [`Record`], as calculated by + /// [`Record::approximate_size()`]. + approx_kafka_write_size: usize, +} + +/// A [`RecordAggregator`] implements [rskafka]'s abstract [`Aggregator`] +/// behaviour to provide batching of requests for a single Kafka partition. +/// +/// Specifically the [`RecordAggregator`] maps [`DmlOperation`] instances to +/// Kafka [`Record`] instances, and delegates the batching to the +/// [`RecordAggregatorDelegate`] implementation maintained within [rskafka] +/// itself. +/// +/// [rskafka]: https://github.com/influxdata/rskafka +#[derive(Debug)] +pub struct RecordAggregator { + time_provider: Arc, + + /// The shard index (Kafka partition number) this aggregator batches ops for (from Kafka, + /// not the catalog). + shard_index: ShardIndex, + + /// The underlying record aggregator the non-IOx-specific batching is + /// delegated to. + aggregator: RecordAggregatorDelegate, +} + +impl RecordAggregator { + /// Initialise a new [`RecordAggregator`] to aggregate up to + /// `max_batch_size` number of bytes per message. + pub fn new( + shard_index: ShardIndex, + max_batch_size: usize, + time_provider: Arc, + ) -> Self { + Self { + shard_index, + aggregator: RecordAggregatorDelegate::new(max_batch_size), + time_provider, + } + } +} + +impl RecordAggregator { + /// Serialise the [`DmlOperation`] destined for the specified `db_name` into a + /// [`Record`], returning the producer timestamp assigned to it. 
+ fn to_record(&self, op: &DmlOperation) -> Result<(Record, Time), Error> { + let now = op + .meta() + .producer_ts() + .unwrap_or_else(|| self.time_provider.now()); + + let headers = IoxHeaders::new( + ContentType::Protobuf, + op.meta().span_context().cloned(), + op.namespace().to_owned(), + ); + + let mut buf = Vec::new(); + crate::codec::encode_operation(op.namespace(), op, &mut buf)?; + buf.shrink_to_fit(); + + let record = Record { + key: None, + value: Some(buf), + headers: headers + .headers() + .map(|(k, v)| (k.to_owned(), v.as_bytes().to_vec())) + .collect(), + timestamp: now.date_time(), + }; + + Ok((record, now)) + } +} + +impl Aggregator for RecordAggregator { + type Input = DmlOperation; + type Tag = ::Tag; + type StatusDeaggregator = DmlMetaDeaggregator; + + /// Callers should retain the returned [`Tag`] in order to de-aggregate the + /// [`DmlMeta`] from the request response. + fn try_push(&mut self, op: Self::Input) -> Result, Error> { + // Encode the DML op to a Record + let (record, timestamp) = self.to_record(&op)?; + + // Capture various metadata necessary to construct the Tag/DmlMeta for + // the caller once a batch has been flushed. + let span_ctx = op.meta().span_context().cloned(); + let approx_kafka_write_size = record.approximate_size(); + + // And delegate batching to rskafka's RecordAggregator implementation + Ok(match self.aggregator.try_push(record)? { + // NoCapacity returns the original input to the caller when the + // batching fails. + // + // The RecordBatcher delegate is returning the Record encoded from + // op above, but the caller of this fn is expecting the original op. + // + // Map to the original input op this fn was called with, discarding + // the encoded Record. + TryPush::NoCapacity(_) => { + // Log a warning if this occurs - this allows an operator to + // increase the maximum Kafka message size, or lower the linger + // time to minimise latency while still producing large enough + // batches for it to be worth while. 
+ warn!("aggregated batch reached maximum capacity"); + TryPush::NoCapacity(op) + } + + // A successful delegate aggregation returns the tag for offset + // de-aggregation later. For simplicity, the tag this layer returns + // also carries the various (small) metadata elements needed to + // construct the DmlMeta at the point of de-aggregation. + TryPush::Aggregated(idx) => TryPush::Aggregated(Tag { + idx, + timestamp, + span_ctx, + approx_kafka_write_size, + }), + }) + } + + fn flush(&mut self) -> Result<(Vec, Self::StatusDeaggregator), Error> { + let records = self.aggregator.flush()?.0; + Ok((records, DmlMetaDeaggregator::new(self.shard_index))) + } +} + +/// The de-aggregation half of the [`RecordAggregator`], this type consumes the +/// caller's [`Tag`] obtained from the aggregator to return the corresponding +/// [`DmlMeta`] from the batched response. +/// +/// The [`DmlMetaDeaggregator`] is a stateless wrapper over the (also stateless) +/// [`RecordAggregatorStatusDeaggregator`] delegate, with most of the metadata +/// elements carried in the [`Tag`] itself. +#[derive(Debug)] +pub struct DmlMetaDeaggregator { + shard_index: ShardIndex, +} + +impl DmlMetaDeaggregator { + pub fn new(shard_index: ShardIndex) -> Self { + Self { shard_index } + } +} + +impl StatusDeaggregator for DmlMetaDeaggregator { + type Status = DmlMeta; + type Tag = Tag; + + fn deaggregate(&self, input: &[i64], tag: Self::Tag) -> Result { + // Delegate de-aggregation to the (stateless) record batch + // de-aggregator for forwards compatibility. 
+ let offset = RecordAggregatorStatusDeaggregator::default() + .deaggregate(input, tag.idx) + .expect("invalid de-aggregation index"); + + Ok(DmlMeta::sequenced( + Sequence::new(self.shard_index, SequenceNumber::new(offset)), + tag.timestamp, + tag.span_ctx, + tag.approx_kafka_write_size, + )) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use dml::DmlWrite; + use hashbrown::HashMap; + use iox_time::MockProvider; + use mutable_batch::{writer::Writer, MutableBatch}; + use trace::LogTraceCollector; + + use crate::codec::{ + CONTENT_TYPE_PROTOBUF, HEADER_CONTENT_TYPE, HEADER_NAMESPACE, HEADER_TRACE_CONTEXT, + }; + + use super::*; + + const NAMESPACE: &str = "bananas"; + const SHARD_INDEX: ShardIndex = ShardIndex::new(42); + const TIMESTAMP_MILLIS: i64 = 1659990497000; + + fn test_op() -> DmlOperation { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + writer + // Date: "1970-01-01" + .write_time("time", [42].into_iter()) + .unwrap(); + writer + .write_i64("A", Some(&[0b00000001]), [1].into_iter()) + .unwrap(); + writer.commit(); + + let mut m = HashMap::default(); + m.insert("table".to_string(), batch); + + let span = SpanContext::new(Arc::new(LogTraceCollector::new())); + + DmlOperation::Write(DmlWrite::new( + NAMESPACE.to_string(), + m, + Some("1970-01-01".into()), + DmlMeta::unsequenced(Some(span)), + )) + } + + #[test] + fn test_record_aggregate() { + let clock = Arc::new(MockProvider::new(Time::from_timestamp_millis( + TIMESTAMP_MILLIS, + ))); + let mut agg = RecordAggregator::new(SHARD_INDEX, usize::MAX, clock); + let write = test_op(); + + let res = agg.try_push(write).expect("aggregate call should succeed"); + let tag = match res { + TryPush::NoCapacity(_) => panic!("unexpected no capacity"), + TryPush::Aggregated(tag) => tag, + }; + + // Flush the aggregator to acquire the records + let (records, deagg) = agg.flush().expect("should flush"); + assert_eq!(records.len(), 1); + + // Another flush should not 
yield the same records + let (records2, _) = agg.flush().expect("should flush"); + assert!(records2.is_empty()); + + // Assert properties of the resulting record + let record = records[0].clone(); + assert_eq!(record.key, None); + assert!(record.value.is_some()); + assert_eq!( + *record + .headers + .get(HEADER_CONTENT_TYPE) + .expect("no content type"), + Vec::::from(CONTENT_TYPE_PROTOBUF), + ); + assert_eq!( + *record + .headers + .get(HEADER_NAMESPACE) + .expect("no namespace header"), + Vec::::from(NAMESPACE), + ); + assert!(record.headers.get(HEADER_TRACE_CONTEXT).is_some()); + assert_eq!(record.timestamp.timestamp(), 1659990497); + + // Extract the DmlMeta from the de-aggregator + let got = deagg + .deaggregate(&[4242], tag) + .expect("de-aggregate should succeed"); + + // Assert the metadata properties + assert!(got.span_context().is_some()); + assert_eq!( + *got.sequence().expect("should be sequenced"), + Sequence::new(SHARD_INDEX, SequenceNumber::new(4242)) + ); + assert_eq!( + got.producer_ts().expect("no producer timestamp"), + Time::from_timestamp_millis(TIMESTAMP_MILLIS) + ); + assert_eq!( + got.bytes_read().expect("no approx size"), + record.approximate_size() + ); + } + + #[test] + fn test_record_aggregate_no_capacity() { + let clock = Arc::new(MockProvider::new(Time::from_timestamp_millis( + TIMESTAMP_MILLIS, + ))); + let mut agg = RecordAggregator::new(SHARD_INDEX, usize::MIN, clock); + let write = test_op(); + + let res = agg + .try_push(write.clone()) + .expect("aggregate call should succeed"); + match res { + TryPush::NoCapacity(res) => assert_eq!(res.namespace(), write.namespace()), + TryPush::Aggregated(_) => panic!("expected no capacity"), + }; + } +} From c4f542bbe26c1823b9b063dc53c9b73f4e2ead13 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Mon, 10 Oct 2022 15:49:23 +0200 Subject: [PATCH 36/40] refactor(ingester): remove tombstone support This commit removes tombstone support from the ingester, and deletes associated code/helpers/tests. 
This commit does NOT remove tombstone support from any other service, but MAY include removing overlapping test coverage. This also removes the tombstone support from the Ingester -> Querier RPC response message. This has the nice side effect of removing a whole lot of thread spawning in the ingester tests for the Executor, speeding everything up! --- .../influxdata/iox/ingester/v1/query.proto | 5 +- .../tests/end_to_end_cases/ingester.rs | 1 - ingester/src/compact.rs | 188 +------ ingester/src/data.rs | 47 +- ingester/src/data/namespace.rs | 44 +- ingester/src/data/partition.rs | 224 +-------- ingester/src/data/partition/buffer.rs | 48 +- ingester/src/data/query_dedup.rs | 159 ------ ingester/src/data/shard.rs | 6 +- ingester/src/data/table.rs | 58 +-- ingester/src/querier_handler.rs | 182 +------ ingester/src/query.rs | 194 +------- ingester/src/server/grpc.rs | 8 - ingester/src/stream_handler/handler.rs | 2 +- ingester/src/stream_handler/mod.rs | 10 +- .../periodic_watermark_fetcher.rs | 2 +- ingester/src/stream_handler/sink.rs | 2 +- .../stream_handler/sink_instrumentation.rs | 6 +- ingester/src/test_util.rs | 218 +------- iox_catalog/src/postgres.rs | 2 +- querier/src/ingester/mod.rs | 32 +- query_tests/cases/in/delete_all.expected | 25 - query_tests/cases/in/delete_all.sql | 17 - .../in/delete_multi_expr_one_chunk.expected | 207 -------- .../cases/in/delete_multi_expr_one_chunk.sql | 61 --- .../in/delete_simple_pred_one_chunk.expected | 91 ---- .../cases/in/delete_simple_pred_one_chunk.sql | 37 -- .../cases/in/delete_three_chunks_1.expected | 85 ---- .../cases/in/delete_three_chunks_1.sql | 23 - .../cases/in/delete_three_chunks_2.expected | 77 --- .../cases/in/delete_three_chunks_2.sql | 19 - .../cases/in/delete_three_chunks_3.expected | 76 --- .../cases/in/delete_three_chunks_3.sql | 27 - .../cases/in/delete_three_chunks_4.expected | 49 -- .../cases/in/delete_three_chunks_4.sql | 13 - ...lete_two_del_multi_expr_one_chunk.expected | 34 -- 
.../delete_two_del_multi_expr_one_chunk.sql | 15 - query_tests/src/cases.rs | 248 ++------- query_tests/src/influxrpc/field_columns.rs | 82 --- query_tests/src/influxrpc/read_filter.rs | 256 +--------- query_tests/src/influxrpc/read_group.rs | 70 --- .../src/influxrpc/read_window_aggregate.rs | 85 ---- query_tests/src/influxrpc/table_names.rs | 75 --- query_tests/src/influxrpc/tag_keys.rs | 18 - query_tests/src/influxrpc/tag_values.rs | 26 - query_tests/src/scenarios/library.rs | 470 ------------------ query_tests/src/scenarios/util.rs | 3 - 47 files changed, 131 insertions(+), 3496 deletions(-) delete mode 100644 ingester/src/data/query_dedup.rs delete mode 100644 query_tests/cases/in/delete_all.expected delete mode 100644 query_tests/cases/in/delete_all.sql delete mode 100644 query_tests/cases/in/delete_multi_expr_one_chunk.expected delete mode 100644 query_tests/cases/in/delete_multi_expr_one_chunk.sql delete mode 100644 query_tests/cases/in/delete_simple_pred_one_chunk.expected delete mode 100644 query_tests/cases/in/delete_simple_pred_one_chunk.sql delete mode 100644 query_tests/cases/in/delete_three_chunks_1.expected delete mode 100644 query_tests/cases/in/delete_three_chunks_1.sql delete mode 100644 query_tests/cases/in/delete_three_chunks_2.expected delete mode 100644 query_tests/cases/in/delete_three_chunks_2.sql delete mode 100644 query_tests/cases/in/delete_three_chunks_3.expected delete mode 100644 query_tests/cases/in/delete_three_chunks_3.sql delete mode 100644 query_tests/cases/in/delete_three_chunks_4.expected delete mode 100644 query_tests/cases/in/delete_three_chunks_4.sql delete mode 100644 query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected delete mode 100644 query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql diff --git a/generated_types/protos/influxdata/iox/ingester/v1/query.proto b/generated_types/protos/influxdata/iox/ingester/v1/query.proto index ff7cc66209..fc0ca483f2 100644 --- 
a/generated_types/protos/influxdata/iox/ingester/v1/query.proto +++ b/generated_types/protos/influxdata/iox/ingester/v1/query.proto @@ -82,8 +82,9 @@ message PartitionStatus { // Max sequence number persisted optional int64 parquet_max_sequence_number = 1; - // Max sequence number for a tombstone associated - optional int64 tombstone_max_sequence_number = 2; + // Deprecated tombstone support in ingester (#5825). + reserved "tombstone_max_sequence_number"; + reserved 2; } // Serialization of `predicate::predicate::Predicate` that contains DataFusion `Expr`s diff --git a/influxdb_iox/tests/end_to_end_cases/ingester.rs b/influxdb_iox/tests/end_to_end_cases/ingester.rs index 07ecd8fbbe..edf93bb305 100644 --- a/influxdb_iox/tests/end_to_end_cases/ingester.rs +++ b/influxdb_iox/tests/end_to_end_cases/ingester.rs @@ -52,7 +52,6 @@ async fn ingester_flight_api() { partition_id, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None }) }, ); diff --git a/ingester/src/compact.rs b/ingester/src/compact.rs index ce516ffe85..8a280cc751 100644 --- a/ingester/src/compact.rs +++ b/ingester/src/compact.rs @@ -18,7 +18,7 @@ use crate::{data::partition::PersistingBatch, query::QueryableBatch}; #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] -pub enum Error { +pub(crate) enum Error { #[snafu(display("Error while building logical plan for Ingester's compaction"))] LogicalPlan { source: iox_query::frontend::reorg::Error, @@ -189,8 +189,8 @@ mod tests { create_batches_with_influxtype_same_columns_different_type, create_one_record_batch_with_influxtype_duplicates, create_one_record_batch_with_influxtype_no_duplicates, - create_one_row_record_batch_with_influxtype, create_tombstone, make_meta, - make_persisting_batch, make_queryable_batch, make_queryable_batch_with_deletes, + create_one_row_record_batch_with_influxtype, make_meta, make_persisting_batch, + make_queryable_batch, }; // this test was added to guard 
against https://github.com/influxdata/influxdb_iox/issues/3782 @@ -223,7 +223,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -293,7 +292,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -389,7 +387,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -488,7 +485,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -588,7 +584,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -692,7 +687,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -816,54 +810,6 @@ mod tests { assert_batches_eq!(&expected, &output_batches); } - #[tokio::test] - async fn test_compact_one_batch_no_dupilcates_with_deletes() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - // verify no empty record batches - bug #3782 - assert_eq!(output_batches.len(), 2); - assert_eq!(output_batches[0].num_rows(), 1); - assert_eq!(output_batches[1].num_rows(), 1); - - // verify compacted data - // row with "tag1=UT" no longer available - let expected = vec![ - 
"+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - } - #[tokio::test] async fn test_compact_one_batch_with_duplicates() { // create input data @@ -1010,23 +956,12 @@ mod tests { } #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_deletes( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates() { // create many-batches input data let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - )]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1049,7 +984,6 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") are also removed // CORRECT RESULT let expected = vec![ "+-----------+------+------+--------------------------------+", @@ -1058,73 +992,15 @@ mod tests { "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 
1970-01-01T00:00:00.000000500Z |", "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", - "+-----------+------+------+--------------------------------+", - ]; - - assert_batches_eq!(&expected, &output_batches); - } - - #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes( - ) { - // create many-batches input data - let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1!=MT", // delete predicate - ), - ]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "tag2", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify compacted data - // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1!=MT") are also removed - let expected = vec![ - "+-----------+------+------+--------------------------------+", - "| field_int | tag1 | tag2 | time |", - 
"+-----------+------+------+--------------------------------+", - "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", - "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; @@ -1133,31 +1009,12 @@ mod tests { // BUG #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes_2( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates2() { // create many-batches input data let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1=MT", // delete predicate - ), - ]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1180,29 +1037,22 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1=MT") are also removed - // CORRECT RESULT - // let expected = vec![ - // "+-----------+------+------+--------------------------------+", - // "| field_int | tag1 | tag2 | time |", - // "+-----------+------+------+--------------------------------+", - // "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", - // "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", - // "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", 
- // "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", - // "+-----------+------+------+--------------------------------+", - // ]; - // current WRONMG result: "tag1 is null" is also eliminated let expected = vec![ "+-----------+------+------+--------------------------------+", "| field_int | tag1 | tag2 | time |", "+-----------+------+------+--------------------------------+", + "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", + "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", + "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", + "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", + "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 4d89b8f976..66f71159bb 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -25,11 +25,10 @@ use crate::{ lifecycle::LifecycleHandle, }; -pub mod namespace; +pub(crate) mod namespace; pub mod partition; -mod query_dedup; -pub mod shard; -pub mod table; +pub(crate) mod shard; +pub(crate) mod table; use self::{ partition::{resolver::PartitionProvider, PartitionStatus}, @@ -52,9 +51,6 @@ pub enum Error { #[snafu(display("Table {} not found in buffer", table_name))] TableNotFound { table_name: String }, - #[snafu(display("Table must be specified in delete"))] - TableNotPresent, - #[snafu(display("Error accessing catalog: {}", source))] Catalog { source: 
iox_catalog::interface::Error, @@ -187,7 +183,7 @@ impl IngesterData { .get(&shard_id) .context(ShardNotFoundSnafu { shard_id })?; shard_data - .buffer_operation(dml_operation, &self.catalog, lifecycle_handle, &self.exec) + .buffer_operation(dml_operation, &self.catalog, lifecycle_handle) .await } @@ -1354,7 +1350,6 @@ mod tests { Arc::clone(&metrics), Arc::new(SystemProvider::new()), ); - let exec = Executor::new(1); let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))); @@ -1370,7 +1365,7 @@ mod tests { // to 1 already, so it shouldn't be buffered and the buffer should // remain empty. let should_pause = data - .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle(), &exec) + .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle()) .await .unwrap(); { @@ -1386,7 +1381,7 @@ mod tests { assert!(!should_pause); // w2 should be in the buffer - data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle(), &exec) + data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle()) .await .unwrap(); @@ -1478,19 +1473,6 @@ mod tests { .await .unwrap(); - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name.clone().into()) - .unwrap() - .table_data(&"mem".into()) - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - None, - ); - let predicate = DeletePredicate { range: TimestampRange::new(1, 2), exprs: vec![], @@ -1509,19 +1491,6 @@ mod tests { data.buffer_operation(shard1.id, DmlOperation::Delete(d1), &manager.handle()) .await .unwrap(); - - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name.into()) - .unwrap() - .table_data(&"mem".into()) - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - Some(SequenceNumber::new(2)), - ); } /// Verifies that the progress in data is the same as expected_progress @@ -1570,7 +1539,6 @@ mod tests { PartitionId::new(2), PartitionStatus { parquet_max_sequence_number: None, - 
tombstone_max_sequence_number: Some(SequenceNumber::new(1)), }, )), Err(ArrowError::IoError("some io error".into())), @@ -1579,7 +1547,6 @@ mod tests { PartitionId::new(1), PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, )), ]))); @@ -1590,7 +1557,6 @@ mod tests { partition_id: PartitionId::new(2), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: Some(SequenceNumber::new(1)), }, }), Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), @@ -1606,7 +1572,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), ]; diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 94013b36c8..9aa414a535 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -5,10 +5,10 @@ use std::{collections::HashMap, sync::Arc}; use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; +use observability_deps::tracing::warn; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use write_summary::ShardProgress; #[cfg(test)] @@ -70,12 +70,16 @@ impl std::ops::Deref for NamespaceName { } } +impl std::fmt::Display for NamespaceName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Data of a Namespace that belongs to a given Shard #[derive(Debug)] pub(crate) struct NamespaceData { namespace_id: NamespaceId, - - #[allow(dead_code)] namespace_name: NamespaceName, /// The catalog ID of the shard this namespace is being populated from. 
@@ -138,7 +142,7 @@ pub(crate) struct NamespaceData { impl NamespaceData { /// Initialize new tables with default partition template of daily - pub fn new( + pub(super) fn new( namespace_id: NamespaceId, namespace_name: NamespaceName, shard_id: ShardId, @@ -173,7 +177,6 @@ impl NamespaceData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { let sequence_number = dml_operation .meta() @@ -225,22 +228,17 @@ impl NamespaceData { Ok(pause_writes) } DmlOperation::Delete(delete) => { - let table_name = delete - .table_name() - .context(super::TableNotPresentSnafu)? - .into(); - let table_data = match self.table_data(&table_name) { - Some(t) => t, - None => self.insert_table(&table_name, catalog).await?, - }; + // Deprecated delete support: + // https://github.com/influxdata/influxdb_iox/issues/5825 + warn!( + shard_id=%self.shard_id, + namespace_name=%self.namespace_name, + namespace_id=%self.namespace_id, + table_name=?delete.table_name(), + sequence_number=?delete.meta().sequence(), + "discarding unsupported delete op" + ); - let mut table_data = table_data.write().await; - - table_data - .buffer_delete(delete.predicate(), sequence_number, &**catalog, executor) - .await?; - - // don't pause writes since deletes don't count towards memory limits Ok(false) } } @@ -316,6 +314,7 @@ impl NamespaceData { catalog: &Arc, ) -> Result>, super::Error> { let mut repos = catalog.repositories().await; + let info = repos .tables() .get_table_persist_info(self.shard_id, self.namespace_id, table_name) @@ -338,7 +337,6 @@ impl NamespaceData { table_name.clone(), self.shard_id, self.namespace_id, - info.tombstone_max_sequence_number, Arc::clone(&self.partition_provider), )) } @@ -455,7 +453,6 @@ mod tests { let metrics = Arc::new(metric::Registry::default()); let catalog: Arc = Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); - let exec = Executor::new(1); // Populate the catalog with the shard / 
namespace / table let (shard_id, ns_id, table_id) = @@ -502,7 +499,6 @@ mod tests { )), &catalog, &MockLifecycleHandle::default(), - &exec, ) .await .expect("buffer op should succeed"); diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index b35a2a6d31..2adfa2582c 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -3,10 +3,7 @@ use std::sync::Arc; use arrow::record_batch::RecordBatch; -use data_types::{ - NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, Tombstone, -}; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; use schema::{selection::Selection, sort::SortKey}; use snafu::ResultExt; @@ -17,7 +14,7 @@ use self::{ buffer::{BufferBatch, DataBuffer}, resolver::DeferredSortKey, }; -use crate::{data::query_dedup::query, query::QueryableBatch}; +use crate::query::QueryableBatch; use super::table::TableName; @@ -42,9 +39,6 @@ pub(crate) struct UnpersistedPartitionData { pub struct PartitionStatus { /// Max sequence number persisted pub parquet_max_sequence_number: Option, - - /// Max sequence number for a tombstone - pub tombstone_max_sequence_number: Option, } /// PersistingBatch contains all needed info and data for creating @@ -266,77 +260,6 @@ impl PartitionData { Ok(()) } - /// Buffers a new tombstone: - /// . All the data in the `buffer` and `snapshots` will be replaced with one - /// tombstone-applied snapshot - /// . 
The tombstone is only added in the `deletes_during_persisting` if the `persisting` - /// exists - pub(super) async fn buffer_tombstone(&mut self, executor: &Executor, tombstone: Tombstone) { - self.data.add_tombstone(tombstone.clone()); - - // ---------------------------------------------------------- - // First apply the tombstone on all in-memory & non-persisting data - // Make a QueryableBatch for all buffer + snapshots + the given tombstone - let max_sequence_number = tombstone.sequence_number; - let query_batch = match self.data.snapshot_to_queryable_batch( - &self.table_name, - self.id, - Some(tombstone.clone()), - ) { - Some(query_batch) if !query_batch.is_empty() => query_batch, - _ => { - // No need to proceed further - return; - } - }; - - let (min_sequence_number, _) = query_batch.min_max_sequence_numbers(); - assert!(min_sequence_number <= max_sequence_number); - - // Run query on the QueryableBatch to apply the tombstone. - let stream = match query(executor, Arc::new(query_batch)).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to apply tombstones on snapshots: {:?}", e); - } - Ok(stream) => stream, - }; - let record_batches = match datafusion::physical_plan::common::collect(stream).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. 
- panic!("unable to collect record batches: {:?}", e); - } - Ok(batches) => batches, - }; - - // Merge all result record batches into one record batch - // and make a snapshot for it - let snapshot = if !record_batches.is_empty() { - let record_batch = - arrow::compute::concat_batches(&record_batches[0].schema(), &record_batches) - .unwrap_or_else(|e| { - panic!("unable to concat record batches: {:?}", e); - }); - let snapshot = SnapshotBatch { - min_sequence_number, - max_sequence_number, - data: Arc::new(record_batch), - }; - - Some(Arc::new(snapshot)) - } else { - None - }; - - // ---------------------------------------------------------- - // Add the tombstone-applied data back in as one snapshot - if let Some(snapshot) = snapshot { - self.data.snapshots.push(snapshot); - } - } - /// Return the progress from this Partition pub(super) fn progress(&self) -> ShardProgress { self.data.progress() @@ -402,7 +325,6 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use super::*; - use crate::test_util::create_tombstone; #[test] fn snapshot_buffer_different_but_compatible_schemas() { @@ -449,7 +371,7 @@ mod tests { // Test deletes mixed with writes on a single parittion #[tokio::test] - async fn writes_and_deletes() { + async fn writes() { // Make a partition with empty DataBuffer let s_id = 1; let t_id = 1; @@ -464,7 +386,6 @@ mod tests { SortKeyState::Provided(None), None, ); - let exec = Executor::new(1); // ------------------------------------------ // Fill `buffer` @@ -487,42 +408,8 @@ mod tests { SequenceNumber::new(2) ); assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 0); assert_eq!(p.data.persisting, None); - // ------------------------------------------ - // Delete - // --- seq_num: 3 - let ts = create_tombstone( - 1, // tombstone id - t_id, // table id - s_id, // shard id - 3, // delete's seq_number - 0, // min time of data to get deleted - 20, // max time of data to get deleted - "day=thu", 
// delete predicate - ); - // one row will get deleted, the other is moved to snapshot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has one row since the other one got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+--------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+--------+-----+------+--------------------------------+", - "| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |", - "+--------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 3); - // ------------------------------------------ // Fill `buffer` // --- seq_num: 4 @@ -542,50 +429,15 @@ mod tests { // verify data assert_eq!( p.data.buffer.as_ref().unwrap().min_sequence_number, - SequenceNumber::new(4) + SequenceNumber::new(1) ); assert_eq!( p.data.buffer.as_ref().unwrap().max_sequence_number, SequenceNumber::new(5) ); - assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot - assert_eq!(p.data.deletes_during_persisting().len(), 0); + assert_eq!(p.data.snapshots.len(), 0); assert_eq!(p.data.persisting, None); - - // ------------------------------------------ - // Delete - // --- seq_num: 6 - let ts = create_tombstone( - 2, // tombstone id - t_id, // table id - s_id, // shard id - 6, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "city=Boston", // delete predicate - ); - // two rows will get deleted, one from existing snapshot, one from the buffer being moved - // to snpashot - p.buffer_tombstone(&exec, ts).await; - - // 
verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has two rows since the other 2 rows with city=Boston have got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+---------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+---------+-----+------+--------------------------------+", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "+---------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 6); + assert!(p.data.buffer.is_some()); // ------------------------------------------ // Persisting @@ -594,32 +446,12 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // always empty after issuing persit assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit - assert_eq!(p.data.deletes_during_persisting().len(), 0); // deletes not happen yet assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // ------------------------------------------ - // Delete - // --- seq_num: 7 - let ts = create_tombstone( - 3, // tombstone id - t_id, // table id - s_id, // shard id - 7, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=55", // delete predicate - ); - // if a query come while persisting, the row with temp=55 will be deleted before - // data is sent back to Querier - p.buffer_tombstone(&exec, ts).await; - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - // no snpashots becasue buffer has not data yet and the - // snapshot was empty too 
- assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 1); // tombstone added since data is - // persisting + assert!(p.data.buffer.is_none()); + assert_eq!(p.data.snapshots.len(), 0); // no snpashots becasue buffer has not data yet and the + // snapshot was empty too assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -640,7 +472,6 @@ mod tests { SequenceNumber::new(8) ); // 1 newly added mutable batch of 3 rows of data assert_eq!(p.data.snapshots.len(), 0); // still empty - assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -649,7 +480,6 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // empty after snapshot assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer - assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // snapshot has three rows moved from buffer let data = (*p.data.snapshots[0].data).clone(); @@ -665,41 +495,5 @@ mod tests { assert_batches_sorted_eq!(&expected, &[data]); assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 8); - - // ------------------------------------------ - // Delete - // --- seq_num: 9 - let ts = create_tombstone( - 4, // tombstone id - t_id, // table id - s_id, // shard id - 9, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=60", // delete predicate - ); - // the row with temp=60 will be removed from the sanphot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied - assert_eq!(p.data.deletes_during_persisting().len(), 2); // one more 
tombstone added make it 2 - assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // snapshot has only 2 rows because the row with tem=60 was removed - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |", - "| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |", - "+------------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 9); - - exec.join().await; } } diff --git a/ingester/src/data/partition/buffer.rs b/ingester/src/data/partition/buffer.rs index 3195b9c74d..866e7a966c 100644 --- a/ingester/src/data/partition/buffer.rs +++ b/ingester/src/data/partition/buffer.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use data_types::{PartitionId, SequenceNumber, ShardId, TableId, Tombstone}; +use data_types::{PartitionId, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; use schema::selection::Selection; use snafu::ResultExt; @@ -40,14 +40,6 @@ pub(crate) struct DataBuffer { /// Buffer of incoming writes pub(crate) buffer: Option, - /// Buffer of tombstones whose time range may overlap with this partition. - /// All tombstones were already applied to corresponding snapshots. This list - /// only keep the ones that come during persisting. The reason - /// we keep them becasue if a query comes, we need to apply these tombstones - /// on the persiting data before sending it to the Querier - /// When the `persiting` is done and removed, this list will get empty, too - deletes_during_persisting: Vec, - /// Data in `buffer` will be moved to a `snapshot` when one of these happens: /// . A background persist is called /// . 
A read request from Querier @@ -72,14 +64,6 @@ pub(crate) struct DataBuffer { } impl DataBuffer { - /// Add a new tombstones into the [`DataBuffer`]. - pub(super) fn add_tombstone(&mut self, tombstone: Tombstone) { - // Only keep this tombstone if some data is being persisted - if self.persisting.is_some() { - self.deletes_during_persisting.push(tombstone); - } - } - /// If a [`BufferBatch`] exists, convert it to a [`SnapshotBatch`] and add /// it to the list of snapshots. /// @@ -113,7 +97,6 @@ impl DataBuffer { &mut self, table_name: &TableName, partition_id: PartitionId, - tombstone: Option, ) -> Option { self.generate_snapshot() .expect("This mutable batch snapshot error should be impossible."); @@ -121,21 +104,11 @@ impl DataBuffer { let mut data = vec![]; std::mem::swap(&mut data, &mut self.snapshots); - let mut tombstones = vec![]; - if let Some(tombstone) = tombstone { - tombstones.push(tombstone); - } - // only produce batch if there is any data if data.is_empty() { None } else { - Some(QueryableBatch::new( - table_name.clone(), - partition_id, - data, - tombstones, - )) + Some(QueryableBatch::new(table_name.clone(), partition_id, data)) } } @@ -172,9 +145,7 @@ impl DataBuffer { panic!("Unable to snapshot while persisting. 
This is an unexpected state.") } - if let Some(queryable_batch) = - self.snapshot_to_queryable_batch(table_name, partition_id, None) - { + if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, partition_id) { let persisting_batch = Arc::new(PersistingBatch { shard_id, table_id, @@ -199,12 +170,7 @@ impl DataBuffer { }; // persisting data - let mut queryable_batch = (*persisting.data).clone(); - - // Add new tombstones if any - queryable_batch.add_tombstones(&self.deletes_during_persisting); - - Some(queryable_batch) + Some((*persisting.data).clone()) } /// Return the progress in this DataBuffer @@ -241,12 +207,6 @@ impl DataBuffer { pub(crate) fn mark_persisted(&mut self) { self.persisting = None; - self.deletes_during_persisting.clear() - } - - #[cfg(test)] - pub(super) fn deletes_during_persisting(&self) -> &[Tombstone] { - self.deletes_during_persisting.as_ref() } } diff --git a/ingester/src/data/query_dedup.rs b/ingester/src/data/query_dedup.rs deleted file mode 100644 index 199e3ae14e..0000000000 --- a/ingester/src/data/query_dedup.rs +++ /dev/null @@ -1,159 +0,0 @@ -use std::sync::Arc; - -use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; -use iox_query::{ - exec::{Executor, ExecutorType}, - QueryChunk, QueryChunkMeta, ScanPlanBuilder, -}; -use observability_deps::tracing::debug; -use snafu::{ResultExt, Snafu}; - -use crate::query::QueryableBatch; - -#[derive(Debug, Snafu)] -#[allow(missing_copy_implementations, missing_docs)] -pub enum Error { - #[snafu(display("Error creating plan for querying Ingester data to send to Querier"))] - Frontend { - source: iox_query::frontend::common::Error, - }, - - #[snafu(display("Error building logical plan for querying Ingester data to send to Querier"))] - LogicalPlan { source: DataFusionError }, - - #[snafu(display( - "Error building physical plan for querying Ingester data to send to Querier: {}", - source - ))] - PhysicalPlan { source: DataFusionError }, - - 
#[snafu(display( - "Error executing the query for getting Ingester data to send to Querier: {}", - source - ))] - ExecutePlan { source: DataFusionError }, -} - -/// A specialized `Error` for Ingester's Query errors -pub type Result = std::result::Result; - -/// Query a given Queryable Batch, applying selection and filters as appropriate -/// Return stream of record batches -pub(crate) async fn query( - executor: &Executor, - data: Arc, -) -> Result { - // Build logical plan for filtering data - // Note that this query will also apply the delete predicates that go with the QueryableBatch - - // TODO: Since we have different type of servers (router, - // ingester, compactor, and querier), we may want to add more - // types into the ExecutorType to have better log and resource - // managment - let ctx = executor.new_context(ExecutorType::Query); - - // Creates an execution plan for a scan and filter data of a single chunk - let schema = data.schema(); - let table_name = data.table_name().to_string(); - - debug!(%table_name, "Creating single chunk scan plan"); - - let logical_plan = ScanPlanBuilder::new(schema, ctx.child_ctx("scan_and_filter planning")) - .with_chunks([data as _]) - .build() - .context(FrontendSnafu)? 
- .plan_builder - .build() - .context(LogicalPlanSnafu)?; - - debug!(%table_name, plan=%logical_plan.display_indent_schema(), - "created single chunk scan plan"); - - // Build physical plan - let physical_plan = ctx - .create_physical_plan(&logical_plan) - .await - .context(PhysicalPlanSnafu {})?; - - // Execute the plan and return the filtered stream - let output_stream = ctx - .execute_stream(physical_plan) - .await - .context(ExecutePlanSnafu {})?; - - Ok(output_stream) -} - -#[cfg(test)] -mod tests { - use arrow_util::assert_batches_eq; - - use super::*; - use crate::test_util::{ - create_one_record_batch_with_influxtype_no_duplicates, create_tombstone, - make_queryable_batch, make_queryable_batch_with_deletes, - }; - - #[tokio::test] - async fn test_query() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - - // build queryable batch from the input batches - let batch = make_queryable_batch("test_table", 0, 1, batches); - - // query without filters - let exc = Executor::new(1); - let stream = query(&exc, batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: all rows and columns should be returned - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 70 | UT | 1970-01-01T00:00:00.000020Z |", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } - - #[tokio::test] - async fn test_query_with_delete() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 
1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let batch = make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - let exc = Executor::new(1); - let stream = query(&exc, batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } -} diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 041001126b..b01504085f 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -5,7 +5,6 @@ use std::{collections::HashMap, sync::Arc}; use data_types::{NamespaceId, ShardId, ShardIndex}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; @@ -100,7 +99,6 @@ impl ShardData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) { Some(d) => d, @@ -111,7 +109,7 @@ impl ShardData { }; namespace_data - .buffer_operation(dml_operation, catalog, lifecycle_handle, executor) + .buffer_operation(dml_operation, catalog, lifecycle_handle) .await } @@ -218,7 +216,6 @@ mod tests { let metrics = Arc::new(metric::Registry::default()); let catalog: Arc = Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); - let exec = Executor::new(1); // Populate the catalog with the shard / namespace / table let (shard_id, ns_id, table_id) = @@ -262,7 
+259,6 @@ mod tests { )), &catalog, &MockLifecycleHandle::default(), - &exec, ) .await .expect("buffer op should succeed"); diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 008f74d149..357c3edd6c 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -2,14 +2,8 @@ use std::{collections::HashMap, sync::Arc}; -use data_types::{ - DeletePredicate, NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, - Timestamp, -}; -use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; -use snafu::ResultExt; use write_summary::ShardProgress; use super::partition::{ @@ -90,9 +84,6 @@ pub(crate) struct TableData { shard_id: ShardId, namespace_id: NamespaceId, - // the max sequence number for a tombstone associated with this table - tombstone_max_sequence_number: Option, - /// An abstract constructor of [`PartitionData`] instances for a given /// `(key, shard, table)` triplet. partition_provider: Arc, @@ -117,7 +108,6 @@ impl TableData { table_name: TableName, shard_id: ShardId, namespace_id: NamespaceId, - tombstone_max_sequence_number: Option, partition_provider: Arc, ) -> Self { Self { @@ -125,7 +115,6 @@ impl TableData { table_name, shard_id, namespace_id, - tombstone_max_sequence_number, partition_data: Default::default(), partition_provider, } @@ -141,12 +130,6 @@ impl TableData { .flatten() } - /// Return tombstone_max_sequence_number - #[allow(dead_code)] // Used in tests - pub(super) fn tombstone_max_sequence_number(&self) -> Option { - self.tombstone_max_sequence_number - } - // buffers the table write and returns true if the lifecycle manager indicates that // ingest should be paused. 
pub(super) async fn buffer_table_write( @@ -204,41 +187,6 @@ impl TableData { Ok(should_pause) } - pub(super) async fn buffer_delete( - &mut self, - predicate: &DeletePredicate, - sequence_number: SequenceNumber, - catalog: &dyn Catalog, - executor: &Executor, - ) -> Result<(), super::Error> { - let min_time = Timestamp::new(predicate.range.start()); - let max_time = Timestamp::new(predicate.range.end()); - - let mut repos = catalog.repositories().await; - let tombstone = repos - .tombstones() - .create_or_get( - self.table_id, - self.shard_id, - sequence_number, - min_time, - max_time, - &predicate.expr_sql_string(), - ) - .await - .context(super::CatalogSnafu)?; - - // remember "persisted" state - self.tombstone_max_sequence_number = Some(sequence_number); - - // modify one partition at a time - for data in self.partition_data.by_key.values_mut() { - data.buffer_tombstone(executor, tombstone.clone()).await; - } - - Ok(()) - } - /// Return the [`PartitionData`] for the specified ID. #[allow(unused)] pub(crate) fn get_partition( @@ -277,7 +225,6 @@ impl TableData { persisting: p.get_persisting_data(), partition_status: PartitionStatus { parquet_max_sequence_number: p.max_persisted_sequence_number(), - tombstone_max_sequence_number: self.tombstone_max_sequence_number, }, }) .collect() @@ -316,6 +263,7 @@ mod tests { use assert_matches::assert_matches; use data_types::{PartitionId, ShardIndex}; + use iox_catalog::interface::Catalog; use mutable_batch::writer; use mutable_batch_lp::lines_to_batches; use schema::{InfluxColumnType, InfluxFieldType}; @@ -367,7 +315,6 @@ mod tests { TABLE_NAME.into(), shard_id, ns_id, - None, partition_provider, ); @@ -427,7 +374,6 @@ mod tests { TABLE_NAME.into(), shard_id, ns_id, - None, partition_provider, ); diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index 7ff7494af0..88371e2c40 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -155,7 +155,6 @@ fn 
prepare_data_to_querier_for_partition( request.table.clone().into(), unpersisted_partition_data.partition_id, vec![], - vec![], ) }) .with_data(unpersisted_partition_data.non_persisted); @@ -201,10 +200,7 @@ mod tests { use super::*; use crate::{ data::FlatIngesterQueryResponse, - test_util::{ - make_ingester_data, make_ingester_data_with_tombstones, DataLocation, TEST_NAMESPACE, - TEST_TABLE, - }, + test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE}, }; #[tokio::test] @@ -362,182 +358,6 @@ mod tests { } } - #[tokio::test] - async fn test_prepare_data_to_querier_with_tombstones() { - test_helpers::maybe_start_logging(); - - // make 7 scenarios for ingester data with tombstones - let mut scenarios = vec![]; - for loc in &[ - DataLocation::BUFFER, - DataLocation::BUFFER_SNAPSHOT, - DataLocation::BUFFER_PERSISTING, - DataLocation::BUFFER_SNAPSHOT_PERSISTING, - DataLocation::SNAPSHOT, - DataLocation::SNAPSHOT_PERSISTING, - DataLocation::PERSISTING, - ] { - let scenario = Arc::new(make_ingester_data_with_tombstones(*loc).await); - scenarios.push((loc, scenario)); - } - - // read data from all scenarios without any filters - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec![], - None, - )); - let expected_not_persisting = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied 
because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); - } - - // read data from all scenarios and filter out column day - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - None, - )); - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| 
Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); - } - - // read data from all scenarios, filter out column day, city Medford, time outside range [0, 42) - let expr = col("city").not_eq(lit("Medford")); - let pred = Predicate::default().with_expr(expr).with_range(0, 42); - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - Some(pred), - )); - // predicates and de-dup are NOT applied!, otherwise this would look like this: - // let expected = vec![ - // "+------------+------+--------------------------------+", - // "| city | temp | 
time |", - // "+------------+------+--------------------------------+", - // "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - // "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - // "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - // "+------------+------+--------------------------------+", - // ]; - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. 
- let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); - } - } - /// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es. 
/// /// If the response contains multiple snapshots, this will merge the schemas into a single one and create diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 219dcbcf6e..dc38001e4f 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -6,7 +6,7 @@ use arrow::record_batch::RecordBatch; use arrow_util::util::ensure_schema; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary, - TimestampMinMax, Tombstone, + TimestampMinMax, }; use datafusion::{ error::DataFusionError, @@ -21,10 +21,7 @@ use iox_query::{ QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::trace; -use predicate::{ - delete_predicate::{tombstones_to_delete_predicates, tombstones_to_delete_predicates_iter}, - Predicate, -}; +use predicate::Predicate; use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema}; use snafu::{ResultExt, Snafu}; @@ -56,9 +53,6 @@ pub(crate) struct QueryableBatch { /// data pub(crate) data: Vec>, - /// Delete predicates of the tombstones - pub(crate) delete_predicates: Vec>, - /// This is needed to return a reference for a trait function pub(crate) table_name: TableName, @@ -72,12 +66,9 @@ impl QueryableBatch { table_name: TableName, partition_id: PartitionId, data: Vec>, - deletes: Vec, ) -> Self { - let delete_predicates = tombstones_to_delete_predicates(&deletes); Self { data, - delete_predicates, table_name, partition_id, } @@ -89,12 +80,6 @@ impl QueryableBatch { self } - /// Add more tombstones - pub(crate) fn add_tombstones(&mut self, deletes: &[Tombstone]) { - let delete_predicates = tombstones_to_delete_predicates_iter(deletes); - self.delete_predicates.extend(delete_predicates); - } - /// return min and max of all the snapshots pub(crate) fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) { let min = self @@ -113,11 +98,6 @@ impl QueryableBatch { (min, max) } - - /// return true if it has no data - pub(crate) fn is_empty(&self) -> 
bool { - self.data.is_empty() - } } impl QueryChunkMeta for QueryableBatch { @@ -147,16 +127,16 @@ impl QueryChunkMeta for QueryableBatch { None // Ingester data is not sorted } - fn delete_predicates(&self) -> &[Arc] { - self.delete_predicates.as_ref() - } - fn timestamp_min_max(&self) -> Option { // Note: we need to consider which option we want to go with // . Return None here and avoid taking time to compute time's min max of RecordBacthes (current choice) // . Compute time's min max here and avoid compacting non-overlapped QueryableBatches in the Ingester None } + + fn delete_predicates(&self) -> &[Arc] { + &[] + } } impl QueryChunk for QueryableBatch { @@ -265,165 +245,3 @@ impl QueryChunk for QueryableBatch { self } } - -#[cfg(test)] -mod tests { - use arrow::{ - array::{ - ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray, - TimestampNanosecondArray, UInt64Array, - }, - datatypes::{DataType, Int32Type, TimeUnit}, - }; - use data_types::{DeleteExpr, Op, Scalar, TimestampRange}; - - use super::*; - use crate::test_util::create_tombstone; - - #[tokio::test] - async fn test_merge_batch_schema() { - // Merge schema of the batches - // The fields in the schema are sorted by column name - let batches = create_batches(); - let merged_schema = (*merge_record_batch_schemas(&batches)).clone(); - - // Expected Arrow schema - let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![ - arrow::datatypes::Field::new( - "dict", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - arrow::datatypes::Field::new("int64", DataType::Int64, true), - arrow::datatypes::Field::new("string", DataType::Utf8, true), - arrow::datatypes::Field::new("bool", DataType::Boolean, true), - arrow::datatypes::Field::new( - "time", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - arrow::datatypes::Field::new("uint64", DataType::UInt64, false), - arrow::datatypes::Field::new("float64", DataType::Float64, 
true), - ])); - let expected_schema = Schema::try_from(arrow_schema) - .unwrap() - .sort_fields_by_name(); - - assert_eq!( - expected_schema, merged_schema, - "\nExpected:\n{:#?}\nActual:\n{:#?}", - expected_schema, merged_schema - ); - } - - #[tokio::test] - async fn test_tombstones_to_delete_predicates() { - // create tombstones - let tombstones = vec![ - create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"), - create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"), - ]; - - // This new queryable batch will convert tombstone to delete predicates - let query_batch = - QueryableBatch::new("test_table".into(), PartitionId::new(0), vec![], tombstones); - let predicates = query_batch.delete_predicates(); - let expected = vec![ - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 200), - exprs: vec![DeleteExpr { - column: String::from("temp"), - op: Op::Eq, - scalar: Scalar::I64(10), - }], - }), - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 350), - exprs: vec![ - DeleteExpr { - column: String::from("temp"), - op: Op::Ne, - scalar: Scalar::I64(10), - }, - DeleteExpr { - column: String::from("city"), - op: Op::Eq, - scalar: Scalar::String(String::from(r#"Boston"#)), - }, - ], - }), - ]; - - assert_eq!(expected, predicates); - } - - // ---------------------------------------------------------------------------------------------- - // Data for testing - - // Create pure RecordBatches without knowledge of Influx datatype - fn create_batches() -> Vec> { - // Batch 1: & 3 rows - let dict_array: ArrayRef = Arc::new( - vec![Some("a"), None, Some("b")] - .into_iter() - .collect::>(), - ); - let int64_array: ArrayRef = - Arc::new([Some(-1), None, Some(2)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("and"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new( - [Some(true), None, Some(false)] - .iter() - .collect::(), - ); - let ts_array: ArrayRef = Arc::new( - 
[Some(150), Some(200), Some(1526823730000000000)] - .iter() - .collect::(), - ); - let batch1 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("int64", int64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - ]) - .unwrap(); - - // Batch 2: & 2 rows - let dict_array: ArrayRef = Arc::new( - vec![None, Some("d")] - .into_iter() - .collect::>(), - ); - let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::()); // not null - let float64_array: ArrayRef = - Arc::new([Some(1.0), Some(2.0)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::()); - let ts_array: ArrayRef = Arc::new( - [Some(100), Some(1626823730000000000)] // not null - .iter() - .collect::(), - ); - let batch2 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("uint64", uint64_array, false), // not null - ("float64", float64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - ]) - .unwrap(); - - vec![Arc::new(batch1), Arc::new(batch2)] - } -} diff --git a/ingester/src/server/grpc.rs b/ingester/src/server/grpc.rs index 4f06a93a46..8cbd26afe1 100644 --- a/ingester/src/server/grpc.rs +++ b/ingester/src/server/grpc.rs @@ -410,9 +410,6 @@ impl Stream for GetStream { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }; prost::Message::encode(&app_metadata, &mut bytes) @@ -489,7 +486,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Ok(FlatIngesterQueryResponse::StartSnapshot { schema }), @@ -502,7 +498,6 @@ 
mod tests { partition_id: 1, status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), @@ -527,7 +522,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Err(ArrowError::IoError("foo".into())), @@ -535,7 +529,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), ], @@ -546,7 +539,6 @@ mod tests { partition_id: 1, status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), diff --git a/ingester/src/stream_handler/handler.rs b/ingester/src/stream_handler/handler.rs index 1b163ea325..9a52b10505 100644 --- a/ingester/src/stream_handler/handler.rs +++ b/ingester/src/stream_handler/handler.rs @@ -945,7 +945,7 @@ mod tests { Ok(DmlOperation::Write(make_write("good_op", 2))) ]], sink_rets = [ - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound{namespace: "bananas".to_string() }), Ok(true), ], want_ttbr = 2, diff --git a/ingester/src/stream_handler/mod.rs b/ingester/src/stream_handler/mod.rs index 296f158e1a..5e9a351fe4 100644 --- a/ingester/src/stream_handler/mod.rs +++ b/ingester/src/stream_handler/mod.rs @@ -17,7 +17,7 @@ //! [`LifecycleManager`]: crate::lifecycle::LifecycleManager //! 
[`LifecycleHandle::can_resume_ingest()`]: crate::lifecycle::LifecycleHandle::can_resume_ingest() -pub mod handler; +pub(crate) mod handler; mod periodic_watermark_fetcher; mod sink; @@ -25,8 +25,8 @@ mod sink; pub mod mock_sink; #[cfg(test)] pub mod mock_watermark_fetcher; -pub mod sink_adaptor; -pub mod sink_instrumentation; +pub(crate) mod sink_adaptor; +pub(crate) mod sink_instrumentation; -pub use periodic_watermark_fetcher::*; -pub use sink::*; +pub(crate) use periodic_watermark_fetcher::*; +pub(crate) use sink::*; diff --git a/ingester/src/stream_handler/periodic_watermark_fetcher.rs b/ingester/src/stream_handler/periodic_watermark_fetcher.rs index 43c8cf52c9..37f99663cc 100644 --- a/ingester/src/stream_handler/periodic_watermark_fetcher.rs +++ b/ingester/src/stream_handler/periodic_watermark_fetcher.rs @@ -24,7 +24,7 @@ use super::sink_instrumentation::WatermarkFetcher; /// Emits an error metric named `write_buffer_watermark_fetch_errors` that /// increments once per fetch error. #[derive(Debug)] -pub struct PeriodicWatermarkFetcher { +pub(crate) struct PeriodicWatermarkFetcher { last_watermark: Arc, poll_handle: JoinHandle<()>, } diff --git a/ingester/src/stream_handler/sink.rs b/ingester/src/stream_handler/sink.rs index 5f8220a942..825b012ce9 100644 --- a/ingester/src/stream_handler/sink.rs +++ b/ingester/src/stream_handler/sink.rs @@ -5,7 +5,7 @@ use dml::DmlOperation; /// A [`DmlSink`] handles [`DmlOperation`] instances read from a shard. #[async_trait] -pub trait DmlSink: Debug + Send + Sync { +pub(crate) trait DmlSink: Debug + Send + Sync { /// Apply `op` read from a shard, returning `Ok(true)` if ingest should /// be paused. 
async fn apply(&self, op: DmlOperation) -> Result; diff --git a/ingester/src/stream_handler/sink_instrumentation.rs b/ingester/src/stream_handler/sink_instrumentation.rs index 24b05cbf21..998e14bb48 100644 --- a/ingester/src/stream_handler/sink_instrumentation.rs +++ b/ingester/src/stream_handler/sink_instrumentation.rs @@ -414,11 +414,13 @@ mod tests { let got = test( op, &metrics, - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound { + namespace: "bananas".to_string(), + }), Some(12345), ) .await; - assert_matches!(got, Err(crate::data::Error::TableNotPresent)); + assert_matches!(got, Err(crate::data::Error::NamespaceNotFound { .. })); // Validate the various write buffer metrics assert_matches!( diff --git a/ingester/src/test_util.rs b/ingester/src/test_util.rs index cde40ac9c2..05dc226f90 100644 --- a/ingester/src/test_util.rs +++ b/ingester/src/test_util.rs @@ -9,17 +9,16 @@ use arrow::record_batch::RecordBatch; use arrow_util::assert_batches_eq; use bitflags::bitflags; use data_types::{ - CompactionLevel, NamespaceId, NonEmptyString, PartitionId, PartitionKey, Sequence, - SequenceNumber, ShardId, ShardIndex, TableId, Timestamp, Tombstone, TombstoneId, + CompactionLevel, NamespaceId, PartitionId, PartitionKey, Sequence, SequenceNumber, ShardId, + ShardIndex, TableId, }; -use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite}; +use dml::{DmlMeta, DmlOperation, DmlWrite}; use iox_catalog::{interface::Catalog, mem::MemCatalog}; use iox_query::test::{raw_data, TestChunk}; use iox_time::{SystemProvider, Time}; use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; use parquet_file::metadata::IoxMetadata; -use predicate::delete_predicate::parse_delete_predicate; use schema::sort::SortKey; use uuid::Uuid; @@ -28,31 +27,10 @@ use crate::{ partition::{resolver::CatalogPartitionResolver, PersistingBatch, SnapshotBatch}, IngesterData, }, - lifecycle::{LifecycleConfig, LifecycleHandle, LifecycleManager}, + 
lifecycle::{LifecycleConfig, LifecycleManager}, query::QueryableBatch, }; -/// Create tombstone for testing -pub(crate) fn create_tombstone( - id: i64, - table_id: i64, - shard_id: i64, - seq_num: i64, - min_time: i64, - max_time: i64, - predicate: &str, -) -> Tombstone { - Tombstone { - id: TombstoneId::new(id), - table_id: TableId::new(table_id), - shard_id: ShardId::new(shard_id), - sequence_number: SequenceNumber::new(seq_num), - min_time: Timestamp::new(min_time), - max_time: Timestamp::new(max_time), - serialized_predicate: predicate.to_string(), - } -} - #[allow(clippy::too_many_arguments)] pub(crate) fn make_meta( object_store_id: Uuid, @@ -93,15 +71,8 @@ pub(crate) fn make_persisting_batch( partition_id: i64, object_store_id: Uuid, batches: Vec>, - tombstones: Vec, ) -> Arc { - let queryable_batch = make_queryable_batch_with_deletes( - table_name, - partition_id, - seq_num_start, - batches, - tombstones, - ); + let queryable_batch = make_queryable_batch(table_name, partition_id, seq_num_start, batches); Arc::new(PersistingBatch { shard_id: ShardId::new(shard_id), table_id: TableId::new(table_id), @@ -116,16 +87,6 @@ pub(crate) fn make_queryable_batch( partition_id: i64, seq_num_start: i64, batches: Vec>, -) -> Arc { - make_queryable_batch_with_deletes(table_name, partition_id, seq_num_start, batches, vec![]) -} - -pub(crate) fn make_queryable_batch_with_deletes( - table_name: &str, - partition_id: i64, - seq_num_start: i64, - batches: Vec>, - tombstones: Vec, ) -> Arc { // make snapshots for the batches let mut snapshots = vec![]; @@ -140,7 +101,6 @@ pub(crate) fn make_queryable_batch_with_deletes( table_name.into(), PartitionId::new(partition_id), snapshots, - tombstones, )) } @@ -673,47 +633,6 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) ingester } -pub(crate) async fn make_ingester_data_with_tombstones(loc: DataLocation) -> IngesterData { - // Whatever data because they won't be used in the tests - let metrics: Arc 
= Default::default(); - let catalog: Arc = Arc::new(MemCatalog::new(Arc::clone(&metrics))); - let object_store = Arc::new(InMemory::new()); - let exec = Arc::new(iox_query::exec::Executor::new(1)); - let lifecycle = LifecycleManager::new( - LifecycleConfig::new( - 200_000_000, - 100_000_000, - 100_000_000, - Duration::from_secs(100_000_000), - Duration::from_secs(100_000_000), - 100_000_000, - ), - Arc::clone(&metrics), - Arc::new(SystemProvider::default()), - ); - - // Make data for one shard and two tables - let shard_index = ShardIndex::new(0); - let (shard_id, _, _) = - populate_catalog(&*catalog, shard_index, TEST_NAMESPACE, TEST_TABLE).await; - - let ingester = IngesterData::new( - object_store, - Arc::clone(&catalog), - [(shard_id, shard_index)], - exec, - Arc::new(CatalogPartitionResolver::new(catalog)), - backoff::BackoffConfig::default(), - metrics, - ); - - // Make partitions per requested - make_one_partition_with_tombstones(&ingester, &lifecycle.handle(), loc, shard_index, shard_id) - .await; - - ingester -} - /// Make data for one or two partitions per requested pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> Vec { // In-memory data includes these rows but split between 4 groups go into @@ -783,133 +702,6 @@ pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> ops } -/// Make data for one partition with tombstones -async fn make_one_partition_with_tombstones( - ingester: &IngesterData, - lifecycle_handle: &dyn LifecycleHandle, - loc: DataLocation, - shard_index: ShardIndex, - shard_id: ShardId, -) { - // In-memory data includes these rows but split between 4 groups go into - // different batches of parittion 1 or partittion 2 as requeted - // let expected = vec![ - // "+------------+-----+------+--------------------------------+", - // "| city | day | temp | time |", - // "+------------+-----+------+--------------------------------+", - // "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z 
|", // in group 1 - seq_num: 2 - // "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", // in group 2 - seq_num: 3 - // "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", // in group 1 - seq_num: 1 --> will get deleted - // "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", // in group 3 - seq_num: 5 --> will get deleted - // "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 8 (after the tombstone's seq num) - // "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4 - // "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 9 - // "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6 - // "+------------+-----+------+--------------------------------+", - // ]; - - let (ops, seq_num) = - make_first_partition_data(&PartitionKey::from(TEST_PARTITION_1), shard_index); - - // Apply all ops - for op in ops { - ingester - .buffer_operation(shard_id, op, lifecycle_handle) - .await - .unwrap(); - } - - if loc.contains(DataLocation::PERSISTING) { - // Move partition 1 data to persisting - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(&TEST_NAMESPACE.into()) - .unwrap() - .snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) - .await; - } else if loc.contains(DataLocation::SNAPSHOT) { - // move partition 1 data to snapshot - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(&TEST_NAMESPACE.into()) - .unwrap() - .snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) - .await; - } - - // Add tombstones - // Depending on where the existing data is, they (buffer & snapshot) will be either moved to a new snapshot after - // applying the tombstone or (persisting) stay where they are and the tombstones is kept to get applied later - // ------------------------------------------ - // Delete - let mut seq_num = seq_num.get(); - seq_num += 1; - - let delete = 
parse_delete_predicate( - "1970-01-01T00:00:00.000000010Z", - "1970-01-01T00:00:00.000000050Z", - "city=Boston", - ) - .unwrap(); - - ingester - .buffer_operation( - shard_id, - DmlOperation::Delete(DmlDelete::new( - TEST_NAMESPACE.to_string(), - delete, - NonEmptyString::new(TEST_TABLE), - DmlMeta::sequenced( - Sequence { - shard_index, - sequence_number: SequenceNumber::new(seq_num), - }, - Time::MIN, - None, - 42, - ), - )), - lifecycle_handle, - ) - .await - .unwrap(); - - // Group 4: in buffer of p1 after the tombstone - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Medford day="sun",temp=55 22"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); - seq_num += 1; - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Reading day="mon",temp=58 40"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); -} - pub(crate) fn make_write_op( partition_key: &PartitionKey, shard_index: ShardIndex, diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 7544e65370..d28a5f310d 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1878,7 +1878,7 @@ LIMIT $4; sqlx::query_as::<_, PartitionParam>( r#" SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, - parquet_file.table_id, + parquet_file.table_id, count(case when to_delete is null then 1 end) total_count, max(case when compaction_level= $4 then parquet_file.created_at end) FROM parquet_file diff --git a/querier/src/ingester/mod.rs b/querier/src/ingester/mod.rs index 86946d2c54..9c9f7a8910 100644 --- a/querier/src/ingester/mod.rs +++ b/querier/src/ingester/mod.rs @@ -613,9 +613,7 @@ impl IngesterStreamDecoder { partition_id, shard_id, 
status.parquet_max_sequence_number.map(SequenceNumber::new), - status - .tombstone_max_sequence_number - .map(SequenceNumber::new), + None, partition_sort_key, ); self.current_partition = Some(partition); @@ -1338,7 +1336,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, ))], @@ -1394,7 +1391,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1404,7 +1400,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1414,7 +1409,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1494,7 +1488,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1524,7 +1517,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: Some(21), - tombstone_max_sequence_number: Some(22), }), }, )), @@ -1549,7 +1541,6 @@ mod tests { partition_id: 3, status: Some(PartitionStatus { parquet_max_sequence_number: Some(31), - tombstone_max_sequence_number: Some(32), }), }, )), @@ -1579,10 +1570,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 2); assert_eq!(p1.chunks[0].schema().as_arrow(), schema_1_1); assert_eq!(p1.chunks[0].batches.len(), 2); @@ -1599,10 +1587,7 @@ mod tests { p2.parquet_max_sequence_number, Some(SequenceNumber::new(21)) ); - assert_eq!( - p2.tombstone_max_sequence_number, - Some(SequenceNumber::new(22)) - ); + 
assert_eq!(p2.tombstone_max_sequence_number, None); assert_eq!(p2.chunks.len(), 1); assert_eq!(p2.chunks[0].schema().as_arrow(), schema_2_1); assert_eq!(p2.chunks[0].batches.len(), 1); @@ -1615,10 +1600,7 @@ mod tests { p3.parquet_max_sequence_number, Some(SequenceNumber::new(31)) ); - assert_eq!( - p3.tombstone_max_sequence_number, - Some(SequenceNumber::new(32)) - ); + assert_eq!(p3.tombstone_max_sequence_number, None); assert_eq!(p3.chunks.len(), 1); assert_eq!(p3.chunks[0].schema().as_arrow(), schema_3_1); assert_eq!(p3.chunks[0].batches.len(), 1); @@ -1738,7 +1720,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1778,10 +1759,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 1); } diff --git a/query_tests/cases/in/delete_all.expected b/query_tests/cases/in/delete_all.expected deleted file mode 100644 index ba828eab9a..0000000000 --- a/query_tests/cases/in/delete_all.expected +++ /dev/null @@ -1,25 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunkDeleteAll --- SQL: SELECT * from cpu; -++ -++ --- SQL: SELECT time from cpu; -++ -++ --- SQL: SELECT count(*), count(bar), count(time) from cpu; -+-----------------+----------------+-----------------+ -| COUNT(UInt8(1)) | COUNT(cpu.bar) | COUNT(cpu.time) | -+-----------------+----------------+-----------------+ -| 0 | 0 | 0 | -+-----------------+----------------+-----------------+ --- SQL: SELECT min(bar), max(bar), min(time), max(time) from cpu; -+--------------+--------------+---------------+---------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+--------------+--------------+---------------+---------------+ -| | | | | 
-+--------------+--------------+---------------+---------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| | -+--------------+ diff --git a/query_tests/cases/in/delete_all.sql b/query_tests/cases/in/delete_all.sql deleted file mode 100644 index b79612846e..0000000000 --- a/query_tests/cases/in/delete_all.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunkDeleteAll - --- select * -SELECT * from cpu; - --- select one specific column -SELECT time from cpu; - --- select aggregate of every column inlcuding star -SELECT count(*), count(bar), count(time) from cpu; - --- select aggregate of every column -SELECT min(bar), max(bar), min(time), max(time) from cpu; - --- select aggregate of one column -SELECT max(bar) from cpu; \ No newline at end of file diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_multi_expr_one_chunk.expected deleted file mode 100644 index f0765f7c16..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.expected +++ /dev/null @@ -1,207 +0,0 @@ --- Test Setup: OneDeleteMultiExprsOneChunk --- SQL: SELECT * from cpu order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by time, bar; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000020Z | 2 | -| 1970-01-01T00:00:00.000000040Z | 1 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 2 | -+-----+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; 
-+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 2 | 2 | 2 | 1 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000040Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 2 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 2 | -+----------------+ --- SQL: SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu; --- Results After Sorting -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT min(foo) as min_foo from cpu order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu order by max_foo; -+---------+ -| max_foo | -+---------+ -| you | -+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by time order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT time, max(foo) as max_foo from cpu group by time order 
by time, max_foo; -+--------------------------------+---------+ -| time | max_foo | -+--------------------------------+---------+ -| 1970-01-01T00:00:00.000000020Z | you | -| 1970-01-01T00:00:00.000000040Z | me | -+--------------------------------+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -+-----+---------+ -| bar | max_foo | -+-----+---------+ -| 1 | me | -| 2 | you | -+-----+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT min(time) as min_time from cpu order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT min(time) as min_time from cpu group by bar order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -+-----+--------------------------------+ -| bar | min_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000040Z | -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by foo order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 
1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; -+-----+--------------------------------+ -| foo | max_time | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time from cpu; --- Results After Sorting -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu order by 1; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 2 | -+--------------+ --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | 
-+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_multi_expr_one_chunk.sql deleted file mode 100644 index 5295c53055..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.sql +++ /dev/null @@ -1,61 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteMultiExprsOneChunk - --- select * -SELECT * from cpu order by bar, foo, time; - -SELECT time, bar from cpu order by time, bar; - -SELECT bar from cpu order by bar; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT count(*) from cpu; - -SELECT min(bar) from cpu; - --- IOX_COMPARE: sorted -SELECT foo from cpu; - -SELECT min(foo) as min_foo from cpu order by min_foo; -SELECT max(foo) as max_foo from cpu order by max_foo; - -SELECT min(foo) as min_foo from cpu group by time order by min_foo; -SELECT max(foo) as max_foo from cpu group by time order by max_foo; -SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo; - -SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -SELECT max(foo) as max_foo from cpu group by time order by max_foo; - -SELECT min(time) as min_time from cpu order by min_time; -SELECT max(time) as max_time from cpu order by max_time; - -SELECT min(time) as min_time from cpu group by bar order by min_time; -SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -SELECT max(time) as max_time from cpu group by foo order by max_time; -SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; - --- IOX_COMPARE: sorted -SELECT time from cpu; - -SELECT max(bar) from cpu order by 1; - --------------------------------------------------------- --- With selection 
predicate - -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.expected b/query_tests/cases/in/delete_simple_pred_one_chunk.expected deleted file mode 100644 index f367cdefef..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.expected +++ /dev/null @@ -1,91 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunk --- SQL: SELECT * from cpu; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT min(bar), max(bar) from cpu; -+--------------+--------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | -+--------------+--------------+ -| 2 | 2 | -+--------------+--------------+ --- SQL: SELECT time from cpu; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; -+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT min(time) from cpu group by bar; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) from cpu group by bar; -+-----+--------------------------------+ -| bar | MIN(cpu.time) 
| -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT count(time), max(time) from cpu; -+-----------------+--------------------------------+ -| COUNT(cpu.time) | MAX(cpu.time) | -+-----------------+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000020Z | -+-----------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 1 | -+-----------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT * from cpu where bar = 2.0; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar != 2.0; -++ -++ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | 
-+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| 0 | 0 | 0 | | | | | -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ --- SQL: SELECT time from cpu where bar=2; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar from cpu where bar!= 2; -++ -++ diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.sql b/query_tests/cases/in/delete_simple_pred_one_chunk.sql deleted file mode 100644 index 7b22641c63..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.sql +++ /dev/null @@ -1,37 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunk - --- select * -SELECT * from cpu; - -SELECT time, bar from cpu; - -SELECT min(bar), max(bar) from cpu; - -SELECT time from cpu; - -SELECT max(time) from cpu; -SELECT min(time) from cpu group by bar; -SELECT bar, min(time) from cpu group by bar; - -SELECT count(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(time), count(*), count(bar), 
min(bar), max(bar), min(time), max(time) from cpu; - ----------------------------------------------------------------- --- Now add selection predicate -SELECT * from cpu where bar = 2.0; - -SELECT * from cpu where bar != 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; - -SELECT time from cpu where bar=2; - -SELECT bar from cpu where bar!= 2; - - diff --git a/query_tests/cases/in/delete_three_chunks_1.expected b/query_tests/cases/in/delete_three_chunks_1.expected deleted file mode 100644 index 47ec3d3de4..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.expected +++ /dev/null @@ -1,85 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu order by foo, bar, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 1 | -| 1 | -| 3 | -| 4 | -| 5 | -| 7 | -+-----+ --- SQL: SELECT count(time) as 
t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; -+---+---+---+----+--------------------------------+--------------------------------+ -| t | c | b | mi | mt | mat | -+---+---+---+----+--------------------------------+--------------------------------+ -| 7 | 7 | 7 | 1 | 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+---+---+---+----+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 7 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 7 | -+----------------+ --- SQL: SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ diff --git a/query_tests/cases/in/delete_three_chunks_1.sql b/query_tests/cases/in/delete_three_chunks_1.sql deleted file mode 100644 index c0105412e9..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.sql +++ /dev/null @@ -1,23 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --- select * -SELECT * from cpu order by foo, bar, time; - -SELECT time, bar from cpu order by bar, time; - -SELECT bar from cpu order by bar; - -SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT 
count(*) from cpu; - -SELECT min(bar) from cpu; - -SELECT foo from cpu order by foo; diff --git a/query_tests/cases/in/delete_three_chunks_2.expected b/query_tests/cases/in/delete_three_chunks_2.expected deleted file mode 100644 index 99fda88e70..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.expected +++ /dev/null @@ -1,77 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT min(foo) from cpu; -+--------------+ -| MIN(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT max(foo) from cpu; -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| you | -+--------------+ --- SQL: SELECT min(time) from cpu; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; -+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT foo, min(time) from cpu group by foo; --- Results After Sorting -+-----+--------------------------------+ -| foo | MIN(cpu.time) | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000070Z | -+-----+--------------------------------+ --- SQL: SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -+-----+--------------------------------+ -| bar | max_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000062Z | -| 3 | 1970-01-01T00:00:00.000000070Z | -| 4 | 1970-01-01T00:00:00.000000050Z | -| 5 | 1970-01-01T00:00:00.000000060Z | -| 7 | 1970-01-01T00:00:00.000000080Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by bar order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z 
| -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT time from cpu order by time; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -| 1970-01-01T00:00:00.000000042Z | -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z | -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 7 | -+--------------+ --- SQL: SELECT min(time), max(time) from cpu; -+--------------------------------+--------------------------------+ -| MIN(cpu.time) | MAX(cpu.time) | -+--------------------------------+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+--------------------------------+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_2.sql b/query_tests/cases/in/delete_three_chunks_2.sql deleted file mode 100644 index bb35711393..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - -SELECT min(foo) from cpu; -SELECT max(foo) from cpu; - -SELECT min(time) from cpu; -SELECT max(time) from cpu; - --- IOX_COMPARE: sorted -SELECT foo, min(time) from cpu group by foo; -SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -SELECT max(time) as max_time from cpu group by bar order by max_time; - -SELECT time from cpu order by time; - -SELECT max(bar) from cpu; - -SELECT min(time), max(time) from cpu; diff --git a/query_tests/cases/in/delete_three_chunks_3.expected b/query_tests/cases/in/delete_three_chunks_3.expected deleted file mode 100644 index 
3e0c5fb2f6..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.expected +++ /dev/null @@ -1,76 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar != 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar = 1 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; -++ -++ --- SQL: SELECT min(bar) from cpu where foo 
= 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 6 | -+----------------+ --- SQL: SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 6 | -+-----------------+ --- SQL: SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 6 | -+-----------------+ diff --git a/query_tests/cases/in/delete_three_chunks_3.sql b/query_tests/cases/in/delete_three_chunks_3.sql deleted file mode 100644 index 146fcaf95e..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.sql +++ /dev/null @@ -1,27 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --------------------------------------------------------- --- With selection predicate - -SELECT * from cpu where bar != 1.0 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; - -SELECT * from cpu where bar = 1 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; - -SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; - -SELECT min(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - 
-SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); diff --git a/query_tests/cases/in/delete_three_chunks_4.expected b/query_tests/cases/in/delete_three_chunks_4.expected deleted file mode 100644 index 2283d15375..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.expected +++ /dev/null @@ -1,49 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 
1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 3 | 1970-01-01T00:00:00.000000070Z | -+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_4.sql b/query_tests/cases/in/delete_three_chunks_4.sql deleted file mode 100644 index 95442f6b07..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - ----------- -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected deleted file mode 100644 index 6871fa7358..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected +++ /dev/null @@ -1,34 +0,0 @@ --- Test Setup: TwoDeletesMultiExprsOneChunk --- SQL: SELECT * from cpu; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu; -+-----+ -| foo | -+-----+ -| me | -+-----+ --- SQL: SELECT * from cpu where cast(time as bigint) > 30; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT 
count(bar) from cpu where cast(time as bigint) > 30; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 1 | -+----------------+ --- SQL: SELECT * from cpu where cast(time as bigint) > 40; -++ -++ --- SQL: SELECT max(time) from cpu where cast(time as bigint) > 40; -+---------------+ -| MAX(cpu.time) | -+---------------+ -| | -+---------------+ diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql deleted file mode 100644 index 132d6f42cf..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: TwoDeletesMultiExprsOneChunk - --- select * -SELECT * from cpu; - -SELECT foo from cpu; - -SELECT * from cpu where cast(time as bigint) > 30; - -SELECT count(bar) from cpu where cast(time as bigint) > 30; - -SELECT * from cpu where cast(time as bigint) > 40; - -SELECT max(time) from cpu where cast(time as bigint) > 40; diff --git a/query_tests/src/cases.rs b/query_tests/src/cases.rs index 9946819fac..69caf0dfe5 100644 --- a/query_tests/src/cases.rs +++ b/query_tests/src/cases.rs @@ -1,8 +1,7 @@ - //! This file is auto generated by query_tests/generate. //! 
Do not edit manually --> will result in sadness -use std::path::Path; use crate::runner::Runner; +use std::path::Path; #[tokio::test] // Tests from "basic.sql", @@ -11,141 +10,8 @@ async fn test_cases_basic_sql() { let input_path = Path::new("cases").join("in").join("basic.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_all.sql", -async fn test_cases_delete_all_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_all.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_multi_expr_one_chunk.sql", -async fn test_cases_delete_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_simple_pred_one_chunk.sql", -async fn test_cases_delete_simple_pred_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_simple_pred_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_1.sql", -async fn test_cases_delete_three_chunks_1_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_1.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests 
from "delete_three_chunks_2.sql", -async fn test_cases_delete_three_chunks_2_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_2.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_3.sql", -async fn test_cases_delete_three_chunks_3_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_3.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_4.sql", -async fn test_cases_delete_three_chunks_4_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_4.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_two_del_multi_expr_one_chunk.sql", -async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_two_del_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -153,15 +19,12 @@ async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { async fn test_cases_duplicates_ingester_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("duplicates_ingester.sql"); + let input_path = Path::new("cases") + .join("in") + 
.join("duplicates_ingester.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -171,13 +34,8 @@ async fn test_cases_duplicates_parquet_sql() { let input_path = Path::new("cases").join("in").join("duplicates_parquet.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -185,15 +43,12 @@ async fn test_cases_duplicates_parquet_sql() { async fn test_cases_new_sql_system_tables_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("new_sql_system_tables.sql"); + let input_path = Path::new("cases") + .join("in") + .join("new_sql_system_tables.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -203,13 +58,8 @@ async fn test_cases_pushdown_sql() { let input_path = Path::new("cases").join("in").join("pushdown.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -219,13 +69,8 @@ async fn test_cases_selectors_sql() { let input_path = Path::new("cases").join("in").join("selectors.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + 
runner.flush().expect("flush worked"); } #[tokio::test] @@ -235,13 +80,8 @@ async fn test_cases_several_chunks_sql() { let input_path = Path::new("cases").join("in").join("several_chunks.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -249,15 +89,12 @@ async fn test_cases_several_chunks_sql() { async fn test_cases_sql_information_schema_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("sql_information_schema.sql"); + let input_path = Path::new("cases") + .join("in") + .join("sql_information_schema.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -267,13 +104,8 @@ async fn test_cases_timestamps_sql() { let input_path = Path::new("cases").join("in").join("timestamps.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -283,13 +115,8 @@ async fn test_cases_two_chunks_sql() { let input_path = Path::new("cases").join("in").join("two_chunks.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -297,13 +124,10 @@ async fn test_cases_two_chunks_sql() { async fn test_cases_two_chunks_missing_columns_sql() { test_helpers::maybe_start_logging(); - let input_path = 
Path::new("cases").join("in").join("two_chunks_missing_columns.sql"); + let input_path = Path::new("cases") + .join("in") + .join("two_chunks_missing_columns.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} \ No newline at end of file + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); +} diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index eecb583e6b..8d7339dafc 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -56,8 +56,6 @@ async fn test_field_columns_no_predicate() { run_field_columns_test_case(TwoMeasurementsManyFields {}, predicate, expected_fields).await; } -// NGA todo: add delete tests when the TwoMeasurementsManyFieldsWithDelete available - #[tokio::test] async fn test_field_columns_with_pred() { // get only fields from h20 (but both chunks) @@ -201,86 +199,6 @@ async fn test_field_name_plan() { run_field_columns_test_case(OneMeasurementManyFields {}, predicate, expected_fields).await; } -#[tokio::test] -async fn test_field_name_plan_with_delete() { - test_helpers::maybe_start_logging(); - - let predicate = Predicate::default().with_range(0, 2000); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - Field { - name: "field2".into(), - data_type: DataType::Utf8, - last_timestamp: 100, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - -#[tokio::test] -async fn test_field_name_plan_with_delete_all_time() { - test_helpers::maybe_start_logging(); - - let predicate = 
Predicate::default(); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 0, // all time queries are optimized but do not return timestamps - }, - Field { - name: "field2".into(), - data_type: DataType::Utf8, - last_timestamp: 0, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 0, - }, - Field { - name: "field4".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - Field { - name: "field5".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - #[tokio::test] async fn list_field_columns_all_time() { let predicate = Predicate::default().with_range(MIN_NANO_TIME, MAX_NANO_TIME); diff --git a/query_tests/src/influxrpc/read_filter.rs b/query_tests/src/influxrpc/read_filter.rs index 71cf9495c0..7f32084f46 100644 --- a/query_tests/src/influxrpc/read_filter.rs +++ b/query_tests/src/influxrpc/read_filter.rs @@ -4,15 +4,13 @@ use std::sync::Arc; #[cfg(test)] use crate::scenarios::{ DbScenario, DbSetup, EndToEndTest, TwoMeasurements, TwoMeasurementsManyFields, - TwoMeasurementsWithDelete, TwoMeasurementsWithDeleteAll, }; use crate::{ db::AbstractDb, influxrpc::util::run_series_set_plan_maybe_error, scenarios::{ MeasurementStatusCode, MeasurementsForDefect2845, MeasurementsSortableTags, - MeasurementsSortableTagsWithDelete, TwoMeasurementsMultiSeries, - TwoMeasurementsMultiSeriesWithDelete, TwoMeasurementsMultiSeriesWithDeleteAll, + TwoMeasurementsMultiSeries, }, }; use datafusion::{ @@ -220,39 +218,6 @@ async fn test_read_filter_unknown_column_in_predicate() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete() { - let 
expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete_all() { - // nothing from h2o table because all rows were deleted - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter() { // filter out one row in h20 @@ -281,58 +246,6 @@ async fn test_read_filter_data_filter() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_filter_with_delete() { - // filter out one row in h20 but the leftover row was deleted to nothing will be returned - let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").eq(lit("CA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results.clone(), - ) - 
.await; - - // Same results via a != predicate. - let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").not_eq(lit("MA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Use different predicate to have data returned - let predicate = Predicate::default() - .with_range(100, 300) - .with_expr(col("state").eq(lit("MA"))) // state=MA - .with_expr(col("_measurement").eq(lit("h2o"))); - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter_fields() { // filter out one row in h20 @@ -350,8 +263,6 @@ async fn test_read_filter_data_filter_fields() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -// NGA todo: add delete tests here after we have delete scenarios for 2 chunks for 1 table - #[tokio::test] async fn test_read_filter_data_filter_measurement_pred() { // use an expr on table name to pick just the last row from o2 @@ -378,16 +289,6 @@ async fn test_read_filter_data_pred_refers_to_non_existent_column() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_non_existent_column_with_delete() { - let predicate = Predicate::default().with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - #[tokio::test] async fn 
test_read_filter_data_pred_no_columns() { // predicate with no columns, @@ -402,59 +303,6 @@ async fn test_read_filter_data_pred_no_columns() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=user, _measurement=cpu, region=west}\n FloatPoints timestamps: [100], values: [23.2]", - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete_all() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Only table disk has no deleted data - let expected_results = vec![ - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_good_and_non_existent_columns() { - // predicate with both a column that does and does not appear - let predicate = Predicate::default() - .with_expr(col("state").eq(lit("MA"))) - .with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case( - TwoMeasurements {}, - predicate.clone(), - expected_results.clone(), - ) - .await; - run_read_filter_test_case( - TwoMeasurementsWithDelete {}, - predicate.clone(), - 
expected_results.clone(), - ) - .await; - run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_match() { let predicate = Predicate::default() @@ -487,50 +335,6 @@ async fn test_read_filter_data_pred_using_regex_match_on_field() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_using_regex_match_with_delete() { - let predicate = Predicate::default() - .with_range(200, 300) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // the selected row was soft deleted - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Different predicate to have data returned - let predicate = Predicate::default() - .with_range(200, 400) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // Try same predicate but on delete_all data - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_not_match() { let predicate = Predicate::default() @@ -600,45 +404,6 @@ async fn test_read_filter_data_pred_unsupported_in_scan() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn 
test_read_filter_data_pred_unsupported_in_scan_with_delete() { - test_helpers::maybe_start_logging(); - - // These predicates can't be pushed down into chunks, but they can - // be evaluated by the general purpose DataFusion plan - - // (STATE = 'CA') OR (READING > 0) - let predicate = - Predicate::default().with_expr(col("state").eq(lit("CA")).or(col("reading").gt(lit(0)))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Note these results include data from both o2 and h2o - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // With delete all from h2o, no rows from h2p should be returned - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_plan_order() { test_helpers::maybe_start_logging(); @@ -659,25 +424,6 @@ async fn test_read_filter_data_plan_order() { .await; } -#[tokio::test] -async fn test_read_filter_data_plan_order_with_delete() { - test_helpers::maybe_start_logging(); - let expected_results = vec![ - "Series tags={_field=other, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [250], values: [5.0]", - "Series 
tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [250], values: [70.5]", - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA, zz_tag=A}\n FloatPoints timestamps: [1000], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=A}\n FloatPoints timestamps: [800], values: [70.1]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=B}\n FloatPoints timestamps: [100], values: [70.2]", - ]; - - run_read_filter_test_case( - MeasurementsSortableTagsWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_filter_on_value() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/read_group.rs b/query_tests/src/influxrpc/read_group.rs index 25a0be0732..8867710b65 100644 --- a/query_tests/src/influxrpc/read_group.rs +++ b/query_tests/src/influxrpc/read_group.rs @@ -5,7 +5,6 @@ use crate::{ AnotherMeasurementForAggs, DbScenario, DbSetup, MeasurementForDefect2691, MeasurementForGroupByField, MeasurementForGroupKeys, MeasurementForMax, MeasurementForMin, MeasurementForSelectors, OneMeasurementForAggs, OneMeasurementNoTags2, - OneMeasurementNoTagsWithDelete, OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk, TwoMeasurementForAggs, TwoMeasurementsManyFields, TwoMeasurementsManyFieldsOneChunk, }, }; @@ -93,75 +92,6 @@ async fn test_read_group_data_no_tag_columns() { .await; } -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n IntegerPoints timestamps: [2], values: [1]", - ]; - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} 
- -#[tokio::test] -async fn test_read_group_data_no_tag_columns_min_with_delete() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n FloatPoints timestamps: [2], values: [2.0]", - ]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete_all() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_min_with_delete_all() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_group_data_pred() { let predicate = Predicate::default() diff --git a/query_tests/src/influxrpc/read_window_aggregate.rs b/query_tests/src/influxrpc/read_window_aggregate.rs index d1547dd6d2..1c3d1b44a0 100644 --- a/query_tests/src/influxrpc/read_window_aggregate.rs +++ b/query_tests/src/influxrpc/read_window_aggregate.rs @@ -170,47 +170,6 @@ async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let 
predicate = InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Min; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000011], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000001, 1609459201000000024], values: [1.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000009, 1609459201000000015], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000002], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - // See https://github.com/influxdata/influxdb_iox/issues/2697 #[tokio::test] async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697() { @@ -276,50 +235,6 @@ async fn test_grouped_series_set_plan_group_aggregate_filter_on_field() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Sum; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - - // The windowed aggregate is using a non-selector aggregate 
(SUM, COUNT, MEAD). - // For each distinct series the window defines the `time` column - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000020], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000030], values: [4.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000020], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_window_aggregate_overflow() { let predicate = Predicate::default().with_range(1609459201000000001, 1609459201000000024); diff --git a/query_tests/src/influxrpc/table_names.rs b/query_tests/src/influxrpc/table_names.rs index e18710d099..c7f23c3cd1 100644 --- a/query_tests/src/influxrpc/table_names.rs +++ b/query_tests/src/influxrpc/table_names.rs @@ -100,106 +100,31 @@ async fn list_table_names_no_non_null_general_data_passes() { run_table_names_test_case(TwoMeasurementsManyFields {}, predicate, vec![]).await; } -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - InfluxRpcPredicate::default(), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete_all() { - run_table_names_test_case( - TwoMeasurementsWithDeleteAll {}, - InfluxRpcPredicate::default(), - vec!["disk"], - ) - 
.await; -} - #[tokio::test] async fn list_table_names_data_pred_0_201() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 201), vec!["cpu", "disk"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - tsp(0, 201), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 201), vec!["disk"]).await; -} - #[tokio::test] async fn list_table_names_data_pred_0_200() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 200), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(0, 200), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 200), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_50_101() { run_table_names_test_case(TwoMeasurements {}, tsp(50, 101), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(50, 101), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(50, 101), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_101_160() { run_table_names_test_case(TwoMeasurements {}, tsp(101, 160), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(101, 160), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(101, 160), 
vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_250_300() { run_table_names_test_case(TwoMeasurements {}, tsp(250, 300), vec![]).await; } -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(250, 300), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(250, 300), vec![]).await; -} - #[tokio::test] async fn list_table_names_max_time_included() { run_table_names_test_case( diff --git a/query_tests/src/influxrpc/tag_keys.rs b/query_tests/src/influxrpc/tag_keys.rs index da21ca52d7..a15672fde0 100644 --- a/query_tests/src/influxrpc/tag_keys.rs +++ b/query_tests/src/influxrpc/tag_keys.rs @@ -169,24 +169,6 @@ async fn list_tag_name_end_to_end() { run_tag_keys_test_case(EndToEndTest {}, predicate, expected_tag_keys).await; } -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete_and_pred() { - let predicate = Predicate::default() - .with_range(0, 10000) - .with_expr(col("host").eq(lit("server01"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "region"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete() { - let predicate = Predicate::default().with_expr(col("_measurement").eq(lit("swap"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "name"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - #[tokio::test] async fn list_tag_name_max_time() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/tag_values.rs b/query_tests/src/influxrpc/tag_values.rs index 7a99ab59e7..0e9e2c532a 100644 --- a/query_tests/src/influxrpc/tag_values.rs +++ 
b/query_tests/src/influxrpc/tag_values.rs @@ -80,32 +80,6 @@ async fn list_tag_values_no_predicate_state_col() { .await; } -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete() { - let tag_name = "state"; - let expected_tag_keys = vec!["CA", "MA"]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDelete {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete_all() { - let tag_name = "state"; - let expected_tag_keys = vec![]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDeleteAll {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - #[tokio::test] async fn list_tag_values_no_predicate_city_col() { let tag_name = "city"; diff --git a/query_tests/src/scenarios/library.rs b/query_tests/src/scenarios/library.rs index 5e52a1ba9c..f3fb22c756 100644 --- a/query_tests/src/scenarios/library.rs +++ b/query_tests/src/scenarios/library.rs @@ -6,7 +6,6 @@ use super::{ }; use crate::scenarios::util::{make_n_chunks_scenario, ChunkData}; use async_trait::async_trait; -use data_types::{DeleteExpr, DeletePredicate, Op, Scalar, TimestampRange}; use iox_query::frontend::sql::SqlQueryPlanner; #[derive(Debug)] @@ -83,82 +82,6 @@ impl DbSetup for OneMeasurementManyNullTags { } } -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 400 <= time <= 602 and state=NY - 
// 3 rows of h2o & NY state will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(400, 602), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("NY").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDeleteAll {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 100 <= time <= 602 - // all rows of h2o will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 602), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// Two measurements data in different chunk scenarios #[derive(Debug)] pub struct TwoMeasurements {} @@ -177,85 +100,6 @@ impl DbSetup for TwoMeasurements { } } -/// Two measurements data in different chunk scenarios -/// with one delete applied at different stages of the chunk -#[derive(Debug)] -pub struct TwoMeasurementsWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // delete 1 
row from cpu with timestamp 150 - let table_name = "cpu"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk(vec![&pred], vec![], lp_lines, table_name, partition_key).await - } -} - -/// Two measurements data in different chunk scenarios -/// with 2 deletes that remove all data from one table -#[derive(Debug)] -pub struct TwoMeasurementsWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // which will delete second row of the cpu - let table_name = "cpu"; - let pred1 = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // delete the first row of the cpu - let pred2 = DeletePredicate { - range: TimestampRange::new(0, 110), - exprs: vec![], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk( - vec![&pred1], - vec![&pred2], - lp_lines, - table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsUnsignedType {} #[async_trait] @@ -710,44 +554,6 @@ impl DbSetup for OneMeasurementManyFields { all_scenarios_for_one_chunk(vec![], vec![], lp_lines, "h2o", partition_key).await } } - -#[derive(Debug)] -pub struct OneMeasurementManyFieldsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyFieldsWithDelete { - async fn make(&self) -> 
Vec { - let partition_key = "1970-01-01T00"; - - // Order this so field3 comes before field2 - // (and thus the columns need to get reordered) - let lp_lines = vec![ - "h2o,tag1=foo,tag2=bar field1=70.6,field3=2 100", - "h2o,tag1=foo,tag2=bar field1=70.4,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.5,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.6,field4=true 1000", - "h2o,tag1=foo,tag2=bar field1=70.3,field5=false 3000", - ]; - - // pred: delete from h2o where 1000 <= time <= 1100 - // 1 rows of h2o with timestamp 1000 will be deleted which means - // field4 no longer available - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(1000, 1100), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// This data (from end to end test) #[derive(Debug)] pub struct EndToEndTest {} @@ -772,48 +578,6 @@ impl DbSetup for EndToEndTest { } } -#[derive(Debug)] -pub struct EndToEndTestWithDelete {} -#[async_trait] -impl DbSetup for EndToEndTestWithDelete { - async fn make(&self) -> Vec { - let lp_lines = vec![ - "cpu_load_short,host=server01,region=us-west value=0.64 0000", - "cpu_load_short,host=server01 value=27.99 1000", - "cpu_load_short,host=server02,region=us-west value=3.89 2000", - "cpu_load_short,host=server01,region=us-east value=1234567.891011 3000", - "cpu_load_short,host=server01,region=us-west value=0.000003 4000", - "system,host=server03 uptime=1303385 5000", - "swap,host=server01,name=disk0 in=3,out=4 6000", - "status active=t 7000", - "attributes color=\"blue\" 8000", - ]; - - let partition_key = "1970-01-01T00"; - - // pred: delete from swap where 6000 <= time <= 6000 and name=disk0 - // 1 rows of swap with name=disk0 will be deleted - let delete_table_name = "swap"; - let pred = DeletePredicate { - range: TimestampRange::new(6000, 6000), - exprs: vec![DeleteExpr::new( - "name".to_string(), - Op::Eq, - 
Scalar::String(("disk0").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsMultiSeries {} #[async_trait] @@ -838,84 +602,6 @@ impl DbSetup for TwoMeasurementsMultiSeries { } } -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - // Swap around data is not inserted in series order - lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // pred: delete from h2o where 120 <= time <= 250 - // 2 rows of h2o with timestamp 200 and 350 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 250), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - 
// Swap around data is not inserted in series order - lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // Delete all data form h2o - // pred: delete from h20 where 100 <= time <= 360 - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 360), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct MeasurementStatusCode {} #[async_trait] impl DbSetup for MeasurementStatusCode { @@ -950,44 +636,6 @@ impl DbSetup for MeasurementsSortableTags { } } -#[derive(Debug)] -pub struct MeasurementsSortableTagsWithDelete {} -#[async_trait] -impl DbSetup for MeasurementsSortableTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,zz_tag=A,state=MA,city=Kingston temp=70.1 800", - "h2o,state=MA,city=Kingston,zz_tag=B temp=70.2 100", - "h2o,state=CA,city=Boston temp=70.3 250", // soft deleted - "h2o,state=MA,city=Boston,zz_tag=A temp=70.4 1000", - "h2o,state=MA,city=Boston temp=70.5,other=5.0 250", - ]; - - // pred: delete from h2o where 120 <= time <= 350 and state=CA - // 1 rows of h2o with timestamp 250 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 350), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("CA").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - // See issue: https://github.com/influxdata/influxdb_iox/issues/2845 #[derive(Debug)] pub struct MeasurementsForDefect2845 {} @@ -1019,65 +667,6 @@ impl DbSetup for OneMeasurementNoTags2 { } } -pub struct OneMeasurementNoTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", 
"m0 foo=2.0 2"]; - - // pred: delete from m0 where 1 <= time <= 1 and foo=1.0 - // 1 row of m0 with timestamp 1 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1), - exprs: vec![DeleteExpr::new( - "foo".to_string(), - Op::Eq, - Scalar::F64((1.0).into()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -/// This will create many scenarios: some have a chunk with soft deleted data, some have no chunks -/// because there is no point to create compacted chunks with all deleted data. -pub struct OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", "m0 foo=2.0 2"]; - - // pred: delete from m0 where 1 <= time <= 2 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 2), - exprs: vec![], - }; - - // Apply predicate before the chunk is moved if any. 
There will be scenarios without chunks - // as a consequence of not-compacting-deleted-data - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct OneMeasurementForAggs {} #[async_trait] impl DbSetup for OneMeasurementForAggs { @@ -1310,65 +899,6 @@ impl DbSetup for MeasurementForDefect2697 { } } -pub struct MeasurementForDefect2697WithDelete {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDelete { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - "mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1609459201000000022 <= time <= 1609459201000000022 - // 1 row of m0 with timestamp 1609459201000000022 (section=2b bar=1.2) - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1609459201000000022, 1609459201000000022), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - -pub struct MeasurementForDefect2697WithDeleteAll {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - 
"mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1 <= time <= 1609459201000000031 - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1609459201000000031), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - // Test data to validate fix for: // https://github.com/influxdata/influxdb_iox/issues/2890 pub struct MeasurementForDefect2890 {} diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index 6b2249dc20..293280c066 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -1048,9 +1048,6 @@ impl QueryDataAdapter { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }, ), From b294bb98aa09e484f57c4aad83d86fb48624d065 Mon Sep 17 00:00:00 2001 From: Dom Dwyer Date: Tue, 11 Oct 2022 16:58:41 +0200 Subject: [PATCH 37/40] refactor: move query types to query_handler Moves types that are only used for handling queries to the query_handler module. --- ingester/src/data.rs | 293 +---------------------------- ingester/src/data/partition.rs | 13 +- ingester/src/data/table.rs | 6 +- ingester/src/handler.rs | 4 +- ingester/src/querier_handler.rs | 301 +++++++++++++++++++++++++++++- ingester/src/server/grpc.rs | 5 +- query_tests/src/scenarios/util.rs | 7 +- 7 files changed, 309 insertions(+), 320 deletions(-) diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 66f71159bb..d1ec7d39a2 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -1,15 +1,12 @@ //! 
Data for the lifecycle of the Ingester -use std::{collections::BTreeMap, pin::Pin, sync::Arc}; +use std::{collections::BTreeMap, sync::Arc}; -use arrow::{error::ArrowError, record_batch::RecordBatch}; -use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use async_trait::async_trait; use backoff::{Backoff, BackoffConfig}; use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId}; -use datafusion::physical_plan::SendableRecordBatchStream; + use dml::DmlOperation; -use futures::{Stream, StreamExt}; use iox_catalog::interface::{get_table_schema_by_id, Catalog}; use iox_query::exec::Executor; use iox_time::SystemProvider; @@ -30,11 +27,7 @@ pub mod partition; pub(crate) mod shard; pub(crate) mod table; -use self::{ - partition::{resolver::PartitionProvider, PartitionStatus}, - shard::ShardData, - table::TableName, -}; +use self::{partition::resolver::PartitionProvider, shard::ShardData, table::TableName}; #[cfg(test)] mod triggers; @@ -482,172 +475,24 @@ impl Persister for IngesterData { } } -/// Stream of snapshots. -/// -/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. -pub(crate) type SnapshotStream = - Pin> + Send>>; - -/// Response data for a single partition. -pub(crate) struct IngesterQueryPartition { - /// Stream of snapshots. - snapshots: SnapshotStream, - - /// Partition ID. - id: PartitionId, - - /// Partition persistence status. - status: PartitionStatus, -} - -impl std::fmt::Debug for IngesterQueryPartition { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryPartition") - .field("snapshots", &"") - .field("id", &self.id) - .field("status", &self.status) - .finish() - } -} - -impl IngesterQueryPartition { - pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { - Self { - snapshots, - id, - status, - } - } -} - -/// Stream of partitions in this response. 
-pub(crate) type IngesterQueryPartitionStream = - Pin> + Send>>; - -/// Response streams for querier<>ingester requests. -/// -/// The data structure is constructed to allow lazy/streaming data generation. For easier -/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. -pub struct IngesterQueryResponse { - /// Stream of partitions. - partitions: IngesterQueryPartitionStream, -} - -impl std::fmt::Debug for IngesterQueryResponse { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryResponse") - .field("partitions", &"") - .finish() - } -} - -impl IngesterQueryResponse { - /// Make a response - pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { - Self { partitions } - } - - /// Flattens the data according to the wire protocol. - pub fn flatten(self) -> FlatIngesterQueryResponseStream { - self.partitions - .flat_map(|partition_res| match partition_res { - Ok(partition) => { - let head = futures::stream::once(async move { - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: partition.id, - status: partition.status, - }) - }); - let tail = partition - .snapshots - .flat_map(|snapshot_res| match snapshot_res { - Ok(snapshot) => { - let schema = Arc::new(optimize_schema(&snapshot.schema())); - - let schema_captured = Arc::clone(&schema); - let head = futures::stream::once(async { - Ok(FlatIngesterQueryResponse::StartSnapshot { - schema: schema_captured, - }) - }); - - let tail = snapshot.map(move |batch_res| match batch_res { - Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { - batch: optimize_record_batch(&batch, Arc::clone(&schema))?, - }), - Err(e) => Err(e), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }) - .boxed() - } -} - -/// Flattened version of [`IngesterQueryResponse`]. 
-pub(crate) type FlatIngesterQueryResponseStream = - Pin> + Send>>; - -/// Element within the flat wire protocol. -#[derive(Debug, PartialEq)] -pub enum FlatIngesterQueryResponse { - /// Start a new partition. - StartPartition { - /// Partition ID. - partition_id: PartitionId, - - /// Partition persistence status. - status: PartitionStatus, - }, - - /// Start a new snapshot. - /// - /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) - /// message. - StartSnapshot { - /// Snapshot schema. - schema: Arc, - }, - - /// Add a record batch to the snapshot that was announced by the last - /// [`StartSnapshot`](Self::StartSnapshot) message. - RecordBatch { - /// Record batch. - batch: RecordBatch, - }, -} - #[cfg(test)] mod tests { - use std::{ - ops::DerefMut, - sync::Arc, - task::{Context, Poll}, - time::Duration, - }; + use std::{ops::DerefMut, sync::Arc, time::Duration}; - use arrow::datatypes::SchemaRef; use assert_matches::assert_matches; use data_types::{ ColumnId, ColumnSet, CompactionLevel, DeletePredicate, NamespaceSchema, NonEmptyString, ParquetFileParams, Sequence, Timestamp, TimestampRange, }; - use datafusion::physical_plan::RecordBatchStream; + use dml::{DmlDelete, DmlMeta, DmlWrite}; use futures::TryStreamExt; use iox_catalog::{mem::MemCatalog, validate_or_insert_schema}; use iox_time::Time; use metric::{MetricObserver, Observation}; - use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch}; + use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; - use schema::selection::Selection; + use uuid::Uuid; use super::*; @@ -1506,128 +1351,4 @@ mod tests { assert_eq!(progresses, expected_progresses); } - - #[tokio::test] - async fn test_ingester_query_response_flatten() { - let batch_1_1 = lp_to_batch("table x=1 0"); - let batch_1_2 = lp_to_batch("table x=2 1"); - let batch_2 = lp_to_batch("table y=1 10"); - let batch_3 = lp_to_batch("table z=1 10"); - - let schema_1 = 
batch_1_1.schema(); - let schema_2 = batch_2.schema(); - let schema_3 = batch_3.schema(); - - let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([ - Ok(Box::pin(TestRecordBatchStream::new( - vec![ - Ok(batch_1_1.clone()), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(batch_1_2.clone()), - ], - Arc::clone(&schema_1), - )) as _), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(Box::pin(TestRecordBatchStream::new( - vec![Ok(batch_2.clone())], - Arc::clone(&schema_2), - )) as _), - Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), - ])), - PartitionId::new(2), - PartitionStatus { - parquet_max_sequence_number: None, - }, - )), - Err(ArrowError::IoError("some io error".into())), - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([])), - PartitionId::new(1), - PartitionStatus { - parquet_max_sequence_number: None, - }, - )), - ]))); - - let actual: Vec<_> = response.flatten().collect().await; - let expected = vec![ - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(2), - status: PartitionStatus { - parquet_max_sequence_number: None, - }, - }), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), - Err(ArrowError::IoError("some io error".into())), - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(1), - status: PartitionStatus { - 
parquet_max_sequence_number: None, - }, - }), - ]; - - assert_eq!(actual.len(), expected.len()); - for (actual, expected) in actual.into_iter().zip(expected) { - match (actual, expected) { - (Ok(actual), Ok(expected)) => { - assert_eq!(actual, expected); - } - (Err(_), Err(_)) => { - // cannot compare `ArrowError`, but it's unlikely that someone changed the error - } - (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"), - (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), - } - } - } - - fn lp_to_batch(lp: &str) -> RecordBatch { - lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() - } - - pub struct TestRecordBatchStream { - schema: SchemaRef, - batches: Vec>, - } - - impl TestRecordBatchStream { - pub fn new(batches: Vec>, schema: SchemaRef) -> Self { - Self { schema, batches } - } - } - - impl RecordBatchStream for TestRecordBatchStream { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - } - - impl futures::Stream for TestRecordBatchStream { - type Item = Result; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - _: &mut Context<'_>, - ) -> Poll> { - if self.batches.is_empty() { - Poll::Ready(None) - } else { - Poll::Ready(Some(self.batches.remove(0))) - } - } - - fn size_hint(&self) -> (usize, Option) { - (self.batches.len(), Some(self.batches.len())) - } - } } diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 2adfa2582c..6e31899d82 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -14,7 +14,7 @@ use self::{ buffer::{BufferBatch, DataBuffer}, resolver::DeferredSortKey, }; -use crate::query::QueryableBatch; +use crate::{querier_handler::PartitionStatus, query::QueryableBatch}; use super::table::TableName; @@ -30,17 +30,6 @@ pub(crate) struct UnpersistedPartitionData { pub(crate) partition_status: PartitionStatus, } -/// Status of a partition that has unpersisted data. 
-/// -/// Note that this structure is specific to a partition (which itself is bound to a table and -/// shard)! -#[derive(Debug, Clone, PartialEq, Eq)] -#[allow(missing_copy_implementations)] -pub struct PartitionStatus { - /// Max sequence number persisted - pub parquet_max_sequence_number: Option, -} - /// PersistingBatch contains all needed info and data for creating /// a parquet file for given set of SnapshotBatches #[derive(Debug, PartialEq, Clone)] diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 357c3edd6c..709055dd88 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -6,10 +6,8 @@ use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId use mutable_batch::MutableBatch; use write_summary::ShardProgress; -use super::partition::{ - resolver::PartitionProvider, PartitionData, PartitionStatus, UnpersistedPartitionData, -}; -use crate::lifecycle::LifecycleHandle; +use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData}; +use crate::{lifecycle::LifecycleHandle, querier_handler::PartitionStatus}; /// A double-referenced map where [`PartitionData`] can be looked up by /// [`PartitionKey`], or ID. 
diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index 67b34342dd..981a43cd57 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -30,11 +30,11 @@ use crate::{ data::{ partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider}, shard::ShardData, - IngesterData, IngesterQueryResponse, + IngesterData, }, lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager}, poison::PoisonCabinet, - querier_handler::prepare_data_to_querier, + querier_handler::{prepare_data_to_querier, IngesterQueryResponse}, stream_handler::{ handler::SequencedStreamHandler, sink_adaptor::IngestSinkAdaptor, sink_instrumentation::SinkInstrumentation, PeriodicWatermarkFetcher, diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index 88371e2c40..59996f94cf 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -1,10 +1,13 @@ //! Handle all requests from Querier -use std::sync::Arc; +use std::{pin::Pin, sync::Arc}; +use arrow::{error::ArrowError, record_batch::RecordBatch}; +use arrow_util::optimize::{optimize_record_batch, optimize_schema}; +use data_types::{PartitionId, SequenceNumber}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; -use futures::StreamExt; +use futures::{Stream, StreamExt}; use generated_types::ingester::IngesterQueryRequest; use observability_deps::tracing::debug; use schema::selection::Selection; @@ -13,7 +16,7 @@ use snafu::{ensure, Snafu}; use crate::{ data::{ namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName, - IngesterData, IngesterQueryPartition, IngesterQueryResponse, + IngesterData, }, query::QueryableBatch, }; @@ -47,6 +50,159 @@ pub enum Error { /// A specialized `Error` for Ingester's Query errors pub type Result = std::result::Result; +/// Stream of snapshots. +/// +/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. 
+pub(crate) type SnapshotStream = + Pin> + Send>>; + +/// Status of a partition that has unpersisted data. +/// +/// Note that this structure is specific to a partition (which itself is bound to a table and +/// shard)! +#[derive(Debug, Clone, PartialEq, Eq)] +#[allow(missing_copy_implementations)] +pub struct PartitionStatus { + /// Max sequence number persisted + pub parquet_max_sequence_number: Option, +} + +/// Response data for a single partition. +pub(crate) struct IngesterQueryPartition { + /// Stream of snapshots. + snapshots: SnapshotStream, + + /// Partition ID. + id: PartitionId, + + /// Partition persistence status. + status: PartitionStatus, +} + +impl std::fmt::Debug for IngesterQueryPartition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryPartition") + .field("snapshots", &"") + .field("id", &self.id) + .field("status", &self.status) + .finish() + } +} + +impl IngesterQueryPartition { + pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { + Self { + snapshots, + id, + status, + } + } +} + +/// Stream of partitions in this response. +pub(crate) type IngesterQueryPartitionStream = + Pin> + Send>>; + +/// Response streams for querier<>ingester requests. +/// +/// The data structure is constructed to allow lazy/streaming data generation. For easier +/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. +pub struct IngesterQueryResponse { + /// Stream of partitions. 
+ partitions: IngesterQueryPartitionStream, +} + +impl std::fmt::Debug for IngesterQueryResponse { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryResponse") + .field("partitions", &"") + .finish() + } +} + +impl IngesterQueryResponse { + /// Make a response + pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { + Self { partitions } + } + + /// Flattens the data according to the wire protocol. + pub fn flatten(self) -> FlatIngesterQueryResponseStream { + self.partitions + .flat_map(|partition_res| match partition_res { + Ok(partition) => { + let head = futures::stream::once(async move { + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: partition.id, + status: partition.status, + }) + }); + let tail = partition + .snapshots + .flat_map(|snapshot_res| match snapshot_res { + Ok(snapshot) => { + let schema = Arc::new(optimize_schema(&snapshot.schema())); + + let schema_captured = Arc::clone(&schema); + let head = futures::stream::once(async { + Ok(FlatIngesterQueryResponse::StartSnapshot { + schema: schema_captured, + }) + }); + + let tail = snapshot.map(move |batch_res| match batch_res { + Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { + batch: optimize_record_batch(&batch, Arc::clone(&schema))?, + }), + Err(e) => Err(e), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }) + .boxed() + } +} + +/// Flattened version of [`IngesterQueryResponse`]. +pub(crate) type FlatIngesterQueryResponseStream = + Pin> + Send>>; + +/// Element within the flat wire protocol. +#[derive(Debug, PartialEq)] +pub enum FlatIngesterQueryResponse { + /// Start a new partition. + StartPartition { + /// Partition ID. + partition_id: PartitionId, + + /// Partition persistence status. + status: PartitionStatus, + }, + + /// Start a new snapshot. 
+ /// + /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) + /// message. + StartSnapshot { + /// Snapshot schema. + schema: Arc, + }, + + /// Add a record batch to the snapshot that was announced by the last + /// [`StartSnapshot`](Self::StartSnapshot) message. + RecordBatch { + /// Record batch. + batch: RecordBatch, + }, +} + /// Return data to send as a response back to the Querier per its request pub async fn prepare_data_to_querier( ingest_data: &Arc, @@ -189,19 +345,106 @@ fn prepare_data_to_querier_for_partition( #[cfg(test)] mod tests { - use arrow::{array::new_null_array, record_batch::RecordBatch}; + use std::task::{Context, Poll}; + + use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch}; use arrow_util::assert_batches_sorted_eq; use assert_matches::assert_matches; - use datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + physical_plan::RecordBatchStream, + }; use futures::TryStreamExt; + use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use predicate::Predicate; use schema::merge::SchemaMerger; use super::*; - use crate::{ - data::FlatIngesterQueryResponse, - test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE}, - }; + use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE}; + + #[tokio::test] + async fn test_ingester_query_response_flatten() { + let batch_1_1 = lp_to_batch("table x=1 0"); + let batch_1_2 = lp_to_batch("table x=2 1"); + let batch_2 = lp_to_batch("table y=1 10"); + let batch_3 = lp_to_batch("table z=1 10"); + + let schema_1 = batch_1_1.schema(); + let schema_2 = batch_2.schema(); + let schema_3 = batch_3.schema(); + + let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([ + Ok(Box::pin(TestRecordBatchStream::new( + vec![ + Ok(batch_1_1.clone()), + Err(ArrowError::NotYetImplemented("not 
yet implemeneted".into())), + Ok(batch_1_2.clone()), + ], + Arc::clone(&schema_1), + )) as _), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + Ok(Box::pin(TestRecordBatchStream::new( + vec![Ok(batch_2.clone())], + Arc::clone(&schema_2), + )) as _), + Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), + ])), + PartitionId::new(2), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + Err(ArrowError::IoError("some io error".into())), + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([])), + PartitionId::new(1), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + ]))); + + let actual: Vec<_> = response.flatten().collect().await; + let expected = vec![ + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(2), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), + Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), + Err(ArrowError::IoError("some io error".into())), + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(1), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + ]; + + assert_eq!(actual.len(), expected.len()); + for (actual, expected) in actual.into_iter().zip(expected) { + match (actual, expected) { + (Ok(actual), Ok(expected)) => { + assert_eq!(actual, expected); + } + (Err(_), Err(_)) => { + // cannot compare `ArrowError`, but it's unlikely that someone changed the error + } + (Ok(_), 
Err(_)) => panic!("Actual is Ok but expected is Err"), + (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), + } + } + } #[tokio::test] async fn test_prepare_data_to_querier() { @@ -358,6 +601,46 @@ mod tests { } } + pub struct TestRecordBatchStream { + schema: SchemaRef, + batches: Vec>, + } + + impl TestRecordBatchStream { + pub fn new(batches: Vec>, schema: SchemaRef) -> Self { + Self { schema, batches } + } + } + + impl RecordBatchStream for TestRecordBatchStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } + + impl futures::Stream for TestRecordBatchStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + _: &mut Context<'_>, + ) -> Poll> { + if self.batches.is_empty() { + Poll::Ready(None) + } else { + Poll::Ready(Some(self.batches.remove(0))) + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.batches.len(), Some(self.batches.len())) + } + } + + fn lp_to_batch(lp: &str) -> RecordBatch { + lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() + } + /// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es. 
/// /// If the response contains multiple snapshots, this will merge the schemas into a single one and create diff --git a/ingester/src/server/grpc.rs b/ingester/src/server/grpc.rs index 8cbd26afe1..3bf785843d 100644 --- a/ingester/src/server/grpc.rs +++ b/ingester/src/server/grpc.rs @@ -30,8 +30,8 @@ use trace::ctx::SpanContext; use write_summary::WriteSummary; use crate::{ - data::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, handler::IngestHandler, + querier_handler::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, }; /// This type is responsible for managing all gRPC services exposed by @@ -464,8 +464,9 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use schema::selection::Selection; + use crate::querier_handler::PartitionStatus; + use super::*; - use crate::data::partition::PartitionStatus; #[tokio::test] async fn test_get_stream_empty() { diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index 293280c066..477503504b 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -14,12 +14,9 @@ use generated_types::{ }; use influxdb_iox_client::flight::{low_level::LowLevelMessage, Error as FlightError}; use ingester::{ - data::{ - partition::resolver::CatalogPartitionResolver, FlatIngesterQueryResponse, IngesterData, - IngesterQueryResponse, Persister, - }, + data::{partition::resolver::CatalogPartitionResolver, IngesterData, Persister}, lifecycle::mock_handle::MockLifecycleHandle, - querier_handler::prepare_data_to_querier, + querier_handler::{prepare_data_to_querier, FlatIngesterQueryResponse, IngesterQueryResponse}, }; use iox_catalog::interface::get_schema_by_name; use iox_query::exec::{Executor, ExecutorConfig}; From b7153862b0f5cf0934444c8a0988b5e270b71b9e Mon Sep 17 00:00:00 2001 From: Nga Tran Date: Tue, 11 Oct 2022 13:22:19 -0400 Subject: [PATCH 38/40] refactor: due to limit in size uplaoed to S3, we need to split output file of cold 
compaction, too (#5834) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- compactor/src/cold.rs | 84 ++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 20 deletions(-) diff --git a/compactor/src/cold.rs b/compactor/src/cold.rs index 1eb3aad4ee..f4a59dcf65 100644 --- a/compactor/src/cold.rs +++ b/compactor/src/cold.rs @@ -45,7 +45,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::Initial, compact_in_parallel, - false, // no split + true, // split candidates.clone().into(), ) .await; @@ -57,7 +57,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::FileNonOverlapped, compact_in_parallel, - false, // don't split + true, // split candidates.into(), ) .await; @@ -812,24 +812,42 @@ mod tests { compact(compactor, true).await; - // Should have 1 non-soft-deleted file: + // Should have 2 non-soft-deleted file: // - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - the 2 level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 1, "{files:?}"); + assert_eq!(files.len(), 2, "{files:?}"); let files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) .collect(); // The initial files are: L0 1-4, L1 5-6. The first step of cold compaction took files 1-5 - // and compacted them into a l-1 file 7. The second step of cold compaction - // took 6 and 7 and combined them all into file 8. - assert_eq!(files_and_levels, vec![(8, CompactionLevel::Final)]); + // and compacted them into two l-1 files 7, 8. The second step of cold compaction + // took 6, 7, and 8 and combined them all into two files 9 and 10. 
+ assert_eq!( + files_and_levels, + vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)] + ); // ------------------------------------------------ // Verify the parquet file content + // first file: + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", + "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // second file let file = files.pop().unwrap(); let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( @@ -847,9 +865,7 @@ mod tests { "| 20 | | VT | 20 | 1970-01-01T00:00:00.000026Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", - "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", - "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", "+-----------+------+------+------+--------------------------------+", ], &batches @@ -1027,14 +1043,14 @@ mod tests { compact(compactor, true).await; - // Should have 3 non-soft-deleted files: + // Should have 4 non-soft-deleted files: // // - pf4, the level 1 file untouched because it didn't fit in the memory budget // - pf6, the level 2 file untouched because it doesn't overlap anything - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - two level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 3, "{files:?}"); + assert_eq!(files.len(), 4, "{files:?}"); let 
files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) @@ -1042,20 +1058,35 @@ mod tests { // File 4 was L1 but didn't fit in the memory budget, so was untouched. // File 6 was already L2 and did not overlap with anything, so was untouched. - // Cold compaction took files 1, 2, 3, 5 and compacted them into file 7. + // Cold compaction took files 1, 2, 3, 5 and compacted them into 2 files 7 and 8. assert_eq!( files_and_levels, vec![ (4, CompactionLevel::FileNonOverlapped), (6, CompactionLevel::Final), (7, CompactionLevel::Final), + (8, CompactionLevel::Final), ] ); // ------------------------------------------------ // Verify the parquet file content - let file1 = files.pop().unwrap(); - let batches = table.read_parquet_file(file1).await; + // newly created L-2 with largest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // newly created L-2 with smallest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ "+-----------+------+------+------+--------------------------------+", @@ -1068,15 +1099,14 @@ mod tests { "| 1500 | WA | | | 1970-01-01T00:00:00.000008Z |", "| 1601 | | PA | 15 | 1970-01-01T00:00:00.000000009Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", - "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", "+-----------+------+------+------+--------------------------------+", ], &batches ); - - let file0 = files.pop().unwrap(); - let batches = table.read_parquet_file(file0).await; + // 
available L2 that does not overlap + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ "+-----------+------+------+-----------------------------+", @@ -1088,6 +1118,20 @@ mod tests { ], &batches ); + // available L1 that did not fit in the memory budget + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+-----------------------------+", + "| field_int | tag2 | tag3 | time |", + "+-----------+------+------+-----------------------------+", + "| 1600 | WA | 10 | 1970-01-01T00:00:00.000028Z |", + "| 20 | VT | 20 | 1970-01-01T00:00:00.000026Z |", + "+-----------+------+------+-----------------------------+", + ], + &batches + ); } struct TestDb { From 11900cea4d2fde77d33ccaf8569c9422f888fc2e Mon Sep 17 00:00:00 2001 From: Luke Bond Date: Wed, 12 Oct 2022 13:10:20 +0100 Subject: [PATCH 39/40] chore: add some tracing logs to the ingester (#5839) --- ingester/src/data/partition.rs | 14 +++++++++++--- ingester/src/data/table.rs | 6 ++++++ ingester/src/lifecycle.rs | 20 +++++++++++++++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 6e31899d82..61dd4c36d2 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use arrow::record_batch::RecordBatch; use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; +use observability_deps::tracing::*; use schema::{selection::Selection, sort::SortKey}; use snafu::ResultExt; use uuid::Uuid; @@ -232,19 +233,26 @@ impl PartitionData { sequence_number: SequenceNumber, mb: MutableBatch, ) -> Result<(), super::Error> { - match &mut self.data.buffer { + let (min_sequence_number, max_sequence_number) = match &mut self.data.buffer { Some(buf) => { 
buf.max_sequence_number = sequence_number.max(buf.max_sequence_number); buf.data.extend_from(&mb).context(super::BufferWriteSnafu)?; + (buf.min_sequence_number, buf.max_sequence_number) } None => { self.data.buffer = Some(BufferBatch { min_sequence_number: sequence_number, max_sequence_number: sequence_number, data: mb, - }) + }); + (sequence_number, sequence_number) } - } + }; + trace!( + min_sequence_number=?min_sequence_number, + max_sequence_number=?max_sequence_number, + "buffered write" + ); Ok(()) } diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 709055dd88..8ebaa7a192 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -4,6 +4,7 @@ use std::{collections::HashMap, sync::Arc}; use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; +use observability_deps::tracing::*; use write_summary::ShardProgress; use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData}; @@ -159,6 +160,11 @@ impl TableData { // skip the write if it has already been persisted if let Some(max) = partition_data.max_persisted_sequence_number() { if max >= sequence_number { + trace!( + shard_id=%self.shard_id, + op_sequence_number=?sequence_number, + "skipping already-persisted write" + ); return Ok(false); } } diff --git a/ingester/src/lifecycle.rs b/ingester/src/lifecycle.rs index 01b9ff2f33..d15389ed60 100644 --- a/ingester/src/lifecycle.rs +++ b/ingester/src/lifecycle.rs @@ -12,7 +12,7 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration}; use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; use iox_time::{Time, TimeProvider}; use metric::{Metric, U64Counter}; -use observability_deps::tracing::{error, info, warn}; +use observability_deps::tracing::{error, info, trace, warn}; use parking_lot::Mutex; use tokio_util::sync::CancellationToken; use tracker::TrackedFutureExt; @@ -97,6 +97,18 @@ impl 
LifecycleHandle for LifecycleHandleImpl { stats.last_write = now; stats.rows_written += rows_written; + trace!( + shard_id=%stats.shard_id, + partition_id=%stats.partition_id, + namespace_id=%stats.namespace_id, + table_id=%stats.table_id, + first_write=%stats.first_write, + last_write=%stats.last_write, + bytes_written=%stats.bytes_written, + first_sequence_number=?stats.first_sequence_number, + "logged write" + ); + s.total_bytes += bytes_written; // Pause if the server has exceeded the configured memory limit. @@ -538,6 +550,12 @@ impl LifecycleManager { .map(|s| s.first_sequence_number) .min() .unwrap_or(sequence_number); + trace!( + min_unpersisted_sequence_number=?min, + shard_id=%shard_id, + sequence_number=?sequence_number, + "updated min_unpersisted_sequence_number for persisted shard" + ); persister .update_min_unpersisted_sequence_number(shard_id, min) .await; From 7202dddab6d9ede46c74664c0675fe349da2fd13 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:37:24 +0000 Subject: [PATCH 40/40] chore(deps): Bump tokio-stream from 0.1.10 to 0.1.11 (#5838) Bumps [tokio-stream](https://github.com/tokio-rs/tokio) from 0.1.10 to 0.1.11. - [Release notes](https://github.com/tokio-rs/tokio/releases) - [Commits](https://github.com/tokio-rs/tokio/compare/tokio-stream-0.1.10...tokio-stream-0.1.11) --- updated-dependencies: - dependency-name: tokio-stream dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- influxdb_iox_client/Cargo.toml | 2 +- ingester/Cargo.toml | 2 +- router/Cargo.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d78dcedd8c..c3dc9fd467 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5213,9 +5213,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 3cb742bf38..42a886d98c 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -25,6 +25,6 @@ prost = "0.11" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } -tokio-stream = "0.1.10" +tokio-stream = "0.1.11" thiserror = "1.0.37" tonic = { version = "0.8" } diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index 2537f95edb..b12ed95c1d 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -53,4 +53,4 @@ bitflags = {version = "1.3.2"} once_cell = "1" paste = "1.0.9" test_helpers = { path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = {version = "0.1.10", default_features = false } +tokio-stream = {version = "0.1.11", default_features = false } diff --git a/router/Cargo.toml b/router/Cargo.toml index fcebd5f360..d19ecf8b4d 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -47,7 +47,7 @@ pretty_assertions = 
"1.3.0" rand = "0.8.3" schema = { path = "../schema" } test_helpers = { version = "0.1.0", path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = { version = "0.1.10", default_features = false, features = [] } +tokio-stream = { version = "0.1.11", default_features = false, features = [] } [lib] # Allow --save-baseline to work