Merge branch 'main' into dependabot/cargo/clap-4.0.2

pull/24376/head
kodiakhq[bot] 2022-10-12 14:01:28 +00:00 committed by GitHub
commit 266b8f2a58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
129 changed files with 4605 additions and 5043 deletions

84
Cargo.lock generated
View File

@ -1050,7 +1050,7 @@ dependencies = [
"influxdb_line_protocol",
"iox_time",
"observability_deps",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"percent-encoding",
"schema",
"serde",
@ -1094,7 +1094,7 @@ dependencies = [
"log",
"num_cpus",
"object_store",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"parking_lot 0.12.1",
"parquet",
"paste",
@ -1116,7 +1116,7 @@ source = "git+https://github.com/apache/arrow-datafusion.git?rev=c7f3a70a79ee840
dependencies = [
"arrow",
"object_store",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"parquet",
"sqlparser 0.23.0",
]
@ -1163,7 +1163,7 @@ dependencies = [
"hashbrown",
"lazy_static",
"md-5",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"paste",
"rand",
"regex",
@ -1741,9 +1741,9 @@ dependencies = [
[[package]]
name = "handlebars"
version = "4.3.4"
version = "4.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56b224eaa4987c03c30b251de7ef0c15a6a59f34222905850dbc3026dfb24d5f"
checksum = "433e4ab33f1213cdc25b5fa45c76881240cfe79284cf2b395e8b9e312a30a2fd"
dependencies = [
"log",
"pest",
@ -2061,7 +2061,9 @@ dependencies = [
"data_types",
"datafusion 0.1.0",
"dotenvy",
"flate2",
"futures",
"futures-util",
"generated_types",
"hashbrown",
"http",
@ -2126,12 +2128,13 @@ dependencies = [
"client_util",
"futures-util",
"generated_types",
"mockito",
"influxdb_line_protocol",
"prost 0.11.0",
"rand",
"reqwest",
"thiserror",
"tokio",
"tokio-stream",
"tonic",
]
@ -2182,7 +2185,7 @@ version = "0.1.0"
dependencies = [
"generated_types",
"snafu",
"sqlparser 0.24.0",
"sqlparser 0.25.0",
"workspace-hack",
]
@ -2222,6 +2225,7 @@ dependencies = [
"pin-project",
"predicate",
"prost 0.11.0",
"rand",
"schema",
"snafu",
"test_helpers",
@ -2681,9 +2685,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.134"
version = "0.2.135"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c"
[[package]]
name = "libloading"
@ -3130,9 +3134,9 @@ dependencies = [
[[package]]
name = "object_store"
version = "0.5.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2168fee79ee3e7695905bc3a48777d807f82d956f821186fa7a2601c1295a73e"
checksum = "56ce10a205d9f610ae3532943039c34c145930065ce0c4284134c897fe6073b1"
dependencies = [
"async-trait",
"base64",
@ -3142,7 +3146,7 @@ dependencies = [
"itertools",
"parking_lot 0.12.1",
"percent-encoding",
"quick-xml 0.24.1",
"quick-xml 0.25.0",
"rand",
"reqwest",
"ring",
@ -3207,9 +3211,9 @@ dependencies = [
[[package]]
name = "ordered-float"
version = "3.1.0"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a"
checksum = "129d36517b53c461acc6e1580aeb919c8ae6708a4b1eae61c4463a615d4f0411"
dependencies = [
"num-traits",
]
@ -3581,7 +3585,7 @@ dependencies = [
"schema",
"serde_json",
"snafu",
"sqlparser 0.24.0",
"sqlparser 0.25.0",
"test_helpers",
"workspace-hack",
]
@ -3670,9 +3674,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro2"
version = "1.0.43"
version = "1.0.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
dependencies = [
"unicode-ident",
]
@ -3942,9 +3946,9 @@ dependencies = [
[[package]]
name = "quick-xml"
version = "0.24.1"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37dddbbe9df96afafcb8027fcf263971b726530e12f0787f620a7ba5b4846081"
checksum = "58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9"
dependencies = [
"memchr",
"serde",
@ -4412,9 +4416,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.85"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074"
dependencies = [
"itoa 1.0.3",
"ryu",
@ -4669,15 +4673,15 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.9.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
[[package]]
name = "snafu"
version = "0.7.1"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2"
checksum = "dd726aec4ebad65756394ff89a9b9598793d4e30121cd71690244c1e497b3aee"
dependencies = [
"doc-comment",
"snafu-derive",
@ -4685,9 +4689,9 @@ dependencies = [
[[package]]
name = "snafu-derive"
version = "0.7.1"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5"
checksum = "712529e9b0b014eabaa345b38e06032767e3dc393e8b017e853b1d7247094e74"
dependencies = [
"heck",
"proc-macro2",
@ -4748,9 +4752,9 @@ dependencies = [
[[package]]
name = "sqlparser"
version = "0.24.0"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dac9c312566fdfc45a38ecf1924013c82af2a7d5315e46f67b1cc987f12be260"
checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd"
dependencies = [
"log",
]
@ -4953,9 +4957,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.101"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1"
dependencies = [
"proc-macro2",
"quote",
@ -5228,9 +5232,9 @@ dependencies = [
[[package]]
name = "tokio-stream"
version = "0.1.10"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af"
checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce"
dependencies = [
"futures-core",
"pin-project-lite",
@ -5434,9 +5438,9 @@ dependencies = [
[[package]]
name = "tracing"
version = "0.1.36"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"log",
@ -5447,9 +5451,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.22"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2"
checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a"
dependencies = [
"proc-macro2",
"quote",
@ -5458,9 +5462,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.29"
version = "0.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7"
checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a"
dependencies = [
"once_cell",
"valuable",

View File

@ -11,10 +11,10 @@ humantime = "2.1.0"
iox_catalog = { path = "../iox_catalog" }
iox_time = { path = "../iox_time" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
tempfile = "3.1.0"
trace = { path = "../trace" }

View File

@ -14,7 +14,7 @@ datafusion = { path = "../datafusion" }
futures = "0.3"
iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parquet_file = { path = "../parquet_file" }
predicate = { path = "../predicate" }

View File

@ -45,7 +45,7 @@ pub async fn compact(compactor: Arc<Compactor>, do_full_compact: bool) -> usize
compaction_type,
CompactionLevel::Initial,
compact_in_parallel,
false, // no split
true, // split
candidates.clone().into(),
)
.await;
@ -57,7 +57,7 @@ pub async fn compact(compactor: Arc<Compactor>, do_full_compact: bool) -> usize
compaction_type,
CompactionLevel::FileNonOverlapped,
compact_in_parallel,
false, // don't split
true, // split
candidates.into(),
)
.await;
@ -812,24 +812,42 @@ mod tests {
compact(compactor, true).await;
// Should have 1 non-soft-deleted file:
// Should have 2 non-soft-deleted file:
//
// - the level 2 file created after combining all 3 level 1 files created by the first step
// - the 2 level-2 files created after combining all 3 level 1 files created by the first step
// of compaction to compact remaining level 0 files
let mut files = catalog.list_by_table_not_to_delete(table.table.id).await;
assert_eq!(files.len(), 1, "{files:?}");
assert_eq!(files.len(), 2, "{files:?}");
let files_and_levels: Vec<_> = files
.iter()
.map(|f| (f.id.get(), f.compaction_level))
.collect();
// The initial files are: L0 1-4, L1 5-6. The first step of cold compaction took files 1-5
// and compacted them into a l-1 file 7. The second step of cold compaction
// took 6 and 7 and combined them all into file 8.
assert_eq!(files_and_levels, vec![(8, CompactionLevel::Final)]);
// and compacted them into two l-1 files 7, 8. The second step of cold compaction
// took 6, 7, and 8 and combined them all into two files 9 and 10.
assert_eq!(
files_and_levels,
vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)]
);
// ------------------------------------------------
// Verify the parquet file content
// first file:
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+-----------------------------+",
"| field_int | tag1 | tag2 | tag3 | time |",
"+-----------+------+------+------+-----------------------------+",
"| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |",
"| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |",
"+-----------+------+------+------+-----------------------------+",
],
&batches
);
// second file
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
@ -847,9 +865,7 @@ mod tests {
"| 20 | | VT | 20 | 1970-01-01T00:00:00.000026Z |",
"| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |",
"| 70 | UT | | | 1970-01-01T00:00:00.000020Z |",
"| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |",
"+-----------+------+------+------+--------------------------------+",
],
&batches
@ -1027,14 +1043,14 @@ mod tests {
compact(compactor, true).await;
// Should have 3 non-soft-deleted files:
// Should have 4 non-soft-deleted files:
//
// - pf4, the level 1 file untouched because it didn't fit in the memory budget
// - pf6, the level 2 file untouched because it doesn't overlap anything
// - the level 2 file created after combining all 3 level 1 files created by the first step
// - two level-2 files created after combining all 3 level 1 files created by the first step
// of compaction to compact remaining level 0 files
let mut files = catalog.list_by_table_not_to_delete(table.table.id).await;
assert_eq!(files.len(), 3, "{files:?}");
assert_eq!(files.len(), 4, "{files:?}");
let files_and_levels: Vec<_> = files
.iter()
.map(|f| (f.id.get(), f.compaction_level))
@ -1042,20 +1058,35 @@ mod tests {
// File 4 was L1 but didn't fit in the memory budget, so was untouched.
// File 6 was already L2 and did not overlap with anything, so was untouched.
// Cold compaction took files 1, 2, 3, 5 and compacted them into file 7.
// Cold compaction took files 1, 2, 3, 5 and compacted them into 2 files 7 and 8.
assert_eq!(
files_and_levels,
vec![
(4, CompactionLevel::FileNonOverlapped),
(6, CompactionLevel::Final),
(7, CompactionLevel::Final),
(8, CompactionLevel::Final),
]
);
// ------------------------------------------------
// Verify the parquet file content
let file1 = files.pop().unwrap();
let batches = table.read_parquet_file(file1).await;
// newly created L-2 with largest timestamp
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+-----------------------------+",
"| field_int | tag1 | tag2 | tag3 | time |",
"+-----------+------+------+------+-----------------------------+",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"+-----------+------+------+------+-----------------------------+",
],
&batches
);
// newly created L-2 with smallest timestamp
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+--------------------------------+",
@ -1068,15 +1099,14 @@ mod tests {
"| 1500 | WA | | | 1970-01-01T00:00:00.000008Z |",
"| 1601 | | PA | 15 | 1970-01-01T00:00:00.000000009Z |",
"| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"| 70 | UT | | | 1970-01-01T00:00:00.000020Z |",
"+-----------+------+------+------+--------------------------------+",
],
&batches
);
let file0 = files.pop().unwrap();
let batches = table.read_parquet_file(file0).await;
// available L2 that does not overlap
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+-----------------------------+",
@ -1088,6 +1118,20 @@ mod tests {
],
&batches
);
// available L1 that did not fit in the memory budget
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+-----------------------------+",
"| field_int | tag2 | tag3 | time |",
"+-----------+------+------+-----------------------------+",
"| 1600 | WA | 10 | 1970-01-01T00:00:00.000028Z |",
"| 20 | VT | 20 | 1970-01-01T00:00:00.000026Z |",
"+-----------+------+------+-----------------------------+",
],
&batches
);
}
struct TestDb {

View File

@ -4,10 +4,10 @@ use data_types::{
ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber,
TableSummary, Timestamp, TimestampMinMax, Tombstone,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use parquet_file::chunk::ParquetChunk;
@ -194,7 +194,7 @@ impl QueryChunk for QueryableParquetChunk {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -208,7 +208,7 @@ impl QueryChunk for QueryableParquetChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -230,7 +230,7 @@ impl QueryChunk for QueryableParquetChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "compactor");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
@ -238,7 +238,7 @@ impl QueryChunk for QueryableParquetChunk {
self.data
.read_filter(predicate, selection)
.context(ReadParquetSnafu)
.map_err(|e| Box::new(e) as _)
.map_err(|e| DataFusionError::External(Box::new(e)))
}
/// Returns chunk type

View File

@ -15,7 +15,7 @@ use datafusion::execution::context::TaskContext;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::physical_plan::common::SizedRecordBatchStream;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics};
use datafusion::physical_plan::{collect, ExecutionPlan};
use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan};
use datafusion::prelude::SessionContext;
use datafusion::{
arrow::{
@ -236,12 +236,19 @@ where
}
/// Create a SendableRecordBatchStream a RecordBatch
pub fn stream_from_batch(batch: RecordBatch) -> SendableRecordBatchStream {
stream_from_batches(vec![Arc::new(batch)])
pub fn stream_from_batch(schema: Arc<Schema>, batch: RecordBatch) -> SendableRecordBatchStream {
stream_from_batches(schema, vec![Arc::new(batch)])
}
/// Create a SendableRecordBatchStream from Vec of RecordBatches with the same schema
pub fn stream_from_batches(batches: Vec<Arc<RecordBatch>>) -> SendableRecordBatchStream {
pub fn stream_from_batches(
schema: Arc<Schema>,
batches: Vec<Arc<RecordBatch>>,
) -> SendableRecordBatchStream {
if batches.is_empty() {
return Box::pin(EmptyRecordBatchStream::new(schema));
}
let dummy_metrics = ExecutionPlanMetricsSet::new();
let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0);
let stream = SizedRecordBatchStream::new(batches[0].schema(), batches, mem_metrics);

View File

@ -15,17 +15,25 @@ developers.
Build IOx for release with pprof:
```shell
cd influxdb_iox
cargo build --release --features=pprof
```
## Step 2: Start redpanda and postgres
You can also install the `influxdb_iox` command locally via
Now, start up redpanda and postgres locally in docker containers:
```shell
cd influxdb_iox
cargo install --path influxdb_iox
```
## Step 2: Start kafka and postgres
Now, start up kafka and postgres locally in docker containers:
```shell
# get rskafka from https://github.com/influxdata/rskafka
cd rskafka
# Run redpanda on localhost:9010
docker-compose -f docker-compose-redpanda.yml up &
# Run kafka on localhost:9010
docker-compose -f docker-compose-kafka.yml up &
# now run postgres
docker run -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust postgres &
```
@ -136,8 +144,8 @@ INFLUXDB_IOX_GRPC_BIND_ADDR=localhost:8084 \
INFLUXDB_IOX_WRITE_BUFFER_TYPE=kafka \
INFLUXDB_IOX_WRITE_BUFFER_ADDR=localhost:9010 \
xINFLUXDB_IOX_WRITE_BUFFER_AUTO_CREATE_TOPICS=10 \
INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_START=0 \
INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END=0 \
INFLUXDB_IOX_SHARD_INDEX_RANGE_START=0 \
INFLUXDB_IOX_SHARD_INDEX_RANGE_END=0 \
INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES=5000000000 \
INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES=4000000000 \
INFLUXDB_IOX_CATALOG_DSN=postgres://postgres@localhost:5432/postgres \
@ -151,6 +159,11 @@ LOG_FILTER=info \
# Step 5: Ingest data
You can load data using the influxdb_iox client:
```shell
influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp
```
Now you can post data to `http://localhost:8080` with your favorite load generating tool
My favorite is https://github.com/alamb/low_card
@ -171,3 +184,17 @@ posting fairly large requests (necessitating the
# Step 6: Profile
See [`profiling.md`](./profiling.md).
# Step 7: Clean up local state
If you find yourself needing to clean up postgres / kafka state use these commands:
```shell
docker ps -a -q | xargs docker stop
docker rm rskafka_proxy_1
docker rm rskafka_kafka-0_1
docker rm rskafka_kafka-1_1
docker rm rskafka_kafka-2_1
docker rm rskafka_zookeeper_1
docker volume rm rskafka_kafka_0_data rskafka_kafka_1_data rskafka_kafka_2_data rskafka_zookeeper_data
```

View File

@ -11,7 +11,7 @@ data_types = { path = "../data_types" }
futures = "0.3"
humantime = "2.1.0"
iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.0" }
object_store = { version = "0.5.1" }
observability_deps = { path = "../observability_deps" }
snafu = "0.7"
tokio = { version = "1", features = ["macros", "rt", "sync"] }

View File

@ -82,8 +82,9 @@ message PartitionStatus {
// Max sequence number persisted
optional int64 parquet_max_sequence_number = 1;
// Max sequence number for a tombstone associated
optional int64 tombstone_max_sequence_number = 2;
// Deprecated tombstone support in ingester (#5825).
reserved "tombstone_max_sequence_number";
reserved 2;
}
// Serialization of `predicate::predicate::Predicate` that contains DataFusion `Expr`s

View File

@ -13,11 +13,11 @@ futures = "0.3"
generated_types = { path = "../generated_types" }
influxdb_iox_client = { path = "../influxdb_iox_client" }
iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.0", features = ["aws"] }
object_store = { version = "0.5.1", features = ["aws"] }
observability_deps = { path = "../observability_deps" }
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.82"
serde_json = "1.0.86"
thiserror = "1.0.37"
tokio = { version = "1.21" }
tonic = { version = "0.8" }

View File

@ -9,7 +9,7 @@ bytes = "1.2"
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
url = "2.3.1"
uuid = { version = "1", features = ["v4"] }

View File

@ -2,6 +2,7 @@ use crate::expression::conditional::{conditional_expression, ConditionalExpressi
use crate::identifier::{identifier, Identifier};
use crate::internal::{expect, ParseResult};
use crate::literal::unsigned_integer;
use crate::string::{regex, Regex};
use core::fmt;
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
@ -11,73 +12,82 @@ use nom::multi::separated_list1;
use nom::sequence::{pair, preceded, terminated};
use std::fmt::{Display, Formatter};
/// Represents a fully-qualified measurement name.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct MeasurementNameExpression {
/// Represents a measurement name as either an identifier or a regular expression.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MeasurementName {
/// A measurement name expressed as an [`Identifier`].
Name(Identifier),
/// A measurement name expressed as a [`Regex`].
Regex(Regex),
}
impl Parser for MeasurementName {
/// Parse a measurement name, which may be an identifier or a regular expression.
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(identifier, MeasurementName::Name),
map(regex, MeasurementName::Regex),
))(i)
}
}
impl Display for MeasurementName {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Name(ident) => fmt::Display::fmt(ident, f),
Self::Regex(regex) => fmt::Display::fmt(regex, f),
}
}
}
/// Represents a fully-qualified, 3-part measurement name.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct QualifiedMeasurementName {
pub database: Option<Identifier>,
pub retention_policy: Option<Identifier>,
pub name: Identifier,
pub name: MeasurementName,
}
impl MeasurementNameExpression {
/// Constructs a new `MeasurementNameExpression` with the specified `name`.
pub fn new(name: Identifier) -> Self {
Self {
database: None,
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`.
pub fn new_db(name: Identifier, database: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`.
pub fn new_db_rp(name: Identifier, database: Identifier, retention_policy: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: Some(retention_policy),
name,
}
}
}
impl fmt::Display for MeasurementNameExpression {
impl Display for QualifiedMeasurementName {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self {
database: None,
retention_policy: None,
name,
} => write!(f, "{}", name)?,
} => write!(f, "{}", name),
Self {
database: Some(db),
retention_policy: None,
name,
} => write!(f, "{}..{}", db, name)?,
} => write!(f, "{}..{}", db, name),
Self {
database: None,
retention_policy: Some(rp),
name,
} => write!(f, "{}.{}", rp, name)?,
} => write!(f, "{}.{}", rp, name),
Self {
database: Some(db),
retention_policy: Some(rp),
name,
} => write!(f, "{}.{}.{}", db, rp, name)?,
};
Ok(())
} => write!(f, "{}.{}.{}", db, rp, name),
}
}
}
/// Match a 3-part measurement name expression.
pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementNameExpression> {
/// Match a fully-qualified, 3-part measurement name.
///
/// ```text
/// qualified_measurement_name ::= measurement_name |
/// ( policy_name "." measurement_name ) |
/// ( db_name "." policy_name? "." measurement_name )
///
/// db_name ::= identifier
/// policy_name ::= identifier
/// measurement_name ::= identifier | regex_lit
/// ```
pub fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> {
let (remaining_input, (opt_db_rp, name)) = pair(
opt(alt((
// database "." retention_policy "."
@ -93,7 +103,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName
// retention_policy "."
map(terminated(identifier, tag(".")), |rp| (None, Some(rp))),
))),
identifier,
MeasurementName::parse,
)(i)?;
// Extract possible `database` and / or `retention_policy`
@ -104,7 +114,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName
Ok((
remaining_input,
MeasurementNameExpression {
QualifiedMeasurementName {
database,
retention_policy,
name,
@ -290,35 +300,107 @@ mod tests {
use crate::assert_expect_error;
use nom::character::complete::alphanumeric1;
#[test]
fn test_measurement_name_expression() {
let (_, got) = measurement_name_expression("diskio").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
impl From<&str> for MeasurementName {
/// Convert a `str` to [`MeasurementName::Name`].
fn from(s: &str) -> Self {
Self::Name(Identifier(s.into()))
}
}
impl QualifiedMeasurementName {
/// Constructs a new `MeasurementNameExpression` with the specified `name`.
pub fn new(name: MeasurementName) -> Self {
Self {
database: None,
retention_policy: None,
name: "diskio".into(),
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`.
pub fn new_db(name: MeasurementName, database: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`.
pub fn new_db_rp(
name: MeasurementName,
database: Identifier,
retention_policy: Identifier,
) -> Self {
Self {
database: Some(database),
retention_policy: Some(retention_policy),
name,
}
}
}
#[test]
fn test_qualified_measurement_name() {
use MeasurementName::*;
let (_, got) = qualified_measurement_name("diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: Name("diskio".into()),
}
);
let (_, got) = measurement_name_expression("telegraf.autogen.diskio").unwrap();
let (_, got) = qualified_measurement_name("/diskio/").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: Regex("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: Some("autogen".into()),
name: "diskio".into(),
name: Name("diskio".into()),
}
);
let (_, got) = measurement_name_expression("telegraf..diskio").unwrap();
let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: Some("autogen".into()),
name: Regex("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: None,
name: "diskio".into(),
name: Name("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: None,
name: Regex("diskio".into()),
}
);
}

View File

@ -73,9 +73,14 @@ mod test {
// Validate via the Display trait, as we don't need to validate the contents of the
// FROM and / or WHERE clauses, given they are tested in their on modules.
// Measurement name expressed as an identifier
let (_, got) = delete_statement("DELETE FROM foo").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM foo");
// Measurement name expressed as a regular expression
let (_, got) = delete_statement("DELETE FROM /foo/").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM /foo/");
let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM foo WHERE time > 10");

View File

@ -0,0 +1,140 @@
#![allow(dead_code)] // Temporary
use crate::internal::{expect, ParseResult};
use crate::select::{select_statement, SelectStatement};
use nom::branch::alt;
use nom::bytes::complete::tag_no_case;
use nom::character::complete::multispace1;
use nom::combinator::{map, opt, value};
use nom::sequence::{preceded, tuple};
use std::fmt::{Display, Formatter};
/// Represents various options for an `EXPLAIN` statement.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExplainOption {
/// `EXPLAIN VERBOSE statement`
Verbose,
/// `EXPLAIN ANALYZE statement`
Analyze,
/// `EXPLAIN ANALYZE VERBOSE statement`
AnalyzeVerbose,
}
impl Display for ExplainOption {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Verbose => f.write_str("VERBOSE"),
Self::Analyze => f.write_str("ANALYZE"),
Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"),
}
}
}
/// Represents an `EXPLAIN` statement.
///
/// ```text
/// explain ::= "EXPLAIN" explain_options? select_statement
/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? )
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct ExplainStatement {
options: Option<ExplainOption>,
select: Box<SelectStatement>,
}
impl Display for ExplainStatement {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.write_str("EXPLAIN ")?;
if let Some(options) = &self.options {
write!(f, "{} ", options)?;
}
Display::fmt(&self.select, f)
}
}
/// Parse an `EXPLAIN` statement.
pub fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> {
map(
tuple((
tag_no_case("EXPLAIN"),
opt(preceded(
multispace1,
alt((
map(
preceded(
tag_no_case("ANALYZE"),
opt(preceded(multispace1, tag_no_case("VERBOSE"))),
),
|v| match v {
// If the optional combinator is Some, then it matched VERBOSE
Some(_) => ExplainOption::AnalyzeVerbose,
_ => ExplainOption::Analyze,
},
),
value(ExplainOption::Verbose, tag_no_case("VERBOSE")),
)),
)),
multispace1,
expect(
"invalid EXPLAIN statement, expected SELECT statement",
select_statement,
),
)),
|(_, options, _, select)| ExplainStatement {
options,
select: Box::new(select),
},
)(i)
}
#[cfg(test)]
mod test {
    use crate::assert_expect_error;
    use crate::explain::{explain_statement, ExplainOption};
    use assert_matches::assert_matches;

    #[test]
    fn test_explain_statement() {
        // Bare EXPLAIN records no options.
        let (rem, stmt) = explain_statement("EXPLAIN SELECT val from temp").unwrap();
        assert_eq!(rem, ""); // assert that all input was consumed
        assert_matches!(stmt.options, None);
        assert_eq!(stmt.to_string(), "EXPLAIN SELECT val FROM temp");

        // EXPLAIN VERBOSE
        let (rem, stmt) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::Verbose);
        assert_eq!(stmt.to_string(), "EXPLAIN VERBOSE SELECT val FROM temp");

        // EXPLAIN ANALYZE
        let (rem, stmt) = explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::Analyze);
        assert_eq!(stmt.to_string(), "EXPLAIN ANALYZE SELECT val FROM temp");

        // EXPLAIN ANALYZE VERBOSE
        let (rem, stmt) =
            explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::AnalyzeVerbose);
        assert_eq!(
            stmt.to_string(),
            "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp"
        );

        // Fallible cases
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE SHOW DATABASES"),
            "invalid EXPLAIN statement, expected SELECT statement"
        );
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp"),
            "invalid EXPLAIN statement, expected SELECT statement"
        );
        // surfaces statement-specific errors
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"),
            "invalid FROM clause, expected identifier, regular expression or subquery"
        );
    }
}

View File

@ -22,12 +22,10 @@ impl<I: Display> Display for Error<I> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Syntax { input: _, message } => {
write!(f, "Syntax error: {}", message)?;
write!(f, "Syntax error: {}", message)
}
Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind)?,
Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind),
}
Ok(())
}
}

View File

@ -29,6 +29,7 @@ mod test_util;
mod common;
mod delete;
mod drop;
mod explain;
mod expression;
mod identifier;
mod internal;

View File

@ -1,6 +1,6 @@
use crate::common::{
limit_clause, measurement_name_expression, offset_clause, order_by_clause, where_clause,
MeasurementNameExpression, OneOrMore, OrderByClause, Parser,
limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause,
OneOrMore, OrderByClause, Parser, QualifiedMeasurementName,
};
use crate::expression::arithmetic::Expr::Wildcard;
use crate::expression::arithmetic::{
@ -164,8 +164,7 @@ pub fn select_statement(i: &str) -> ParseResult<&str, SelectStatement> {
/// Represents a single measurement selection found in a `FROM` clause.
#[derive(Clone, Debug, PartialEq)]
pub enum MeasurementSelection {
Name(MeasurementNameExpression),
Regex(Regex),
Name(QualifiedMeasurementName),
Subquery(Box<SelectStatement>),
}
@ -173,7 +172,6 @@ impl Display for MeasurementSelection {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Name(ref name) => fmt::Display::fmt(name, f),
Self::Regex(ref re) => fmt::Display::fmt(re, f),
Self::Subquery(ref subquery) => write!(f, "({})", subquery),
}
}
@ -182,8 +180,7 @@ impl Display for MeasurementSelection {
impl Parser for MeasurementSelection {
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(measurement_name_expression, MeasurementSelection::Name),
map(regex, MeasurementSelection::Regex),
map(qualified_measurement_name, MeasurementSelection::Name),
map(
delimited(
preceded(multispace0, char('(')),
@ -812,7 +809,7 @@ mod test {
assert_matches!(got, MeasurementSelection::Name(_));
let (_, got) = MeasurementSelection::parse("/regex/").unwrap();
assert_matches!(got, MeasurementSelection::Regex(_));
assert_matches!(got, MeasurementSelection::Name(_));
let (_, got) = MeasurementSelection::parse("(SELECT foo FROM bar)").unwrap();
assert_matches!(got, MeasurementSelection::Subquery(_));

View File

@ -2,24 +2,21 @@
//!
//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-schema/#show-measurements
use crate::common::{
limit_clause, offset_clause, qualified_measurement_name, where_clause, QualifiedMeasurementName,
};
use crate::expression::conditional::ConditionalExpression;
use crate::identifier::{identifier, Identifier};
use crate::internal::{expect, ParseResult};
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{char, multispace0, multispace1};
use nom::character::complete::{multispace0, multispace1};
use nom::combinator::{map, opt, value};
use nom::sequence::tuple;
use nom::sequence::{pair, preceded, terminated};
use std::fmt;
use std::fmt::Formatter;
use crate::common::{
limit_clause, measurement_name_expression, offset_clause, where_clause,
MeasurementNameExpression,
};
use crate::expression::conditional::ConditionalExpression;
use crate::identifier::{identifier, Identifier};
use crate::string::{regex, Regex};
/// OnExpression represents an InfluxQL database or retention policy name
/// or a wildcard.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
@ -110,18 +107,16 @@ impl fmt::Display for ShowMeasurementsStatement {
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MeasurementExpression {
Equals(MeasurementNameExpression),
Regex(Regex),
Equals(QualifiedMeasurementName),
Regex(QualifiedMeasurementName),
}
impl fmt::Display for MeasurementExpression {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Equals(ref name) => write!(f, "= {}", name)?,
Self::Regex(ref re) => write!(f, "=~ {}", re)?,
};
Ok(())
Self::Equals(ref name) => write!(f, "= {}", name),
Self::Regex(ref re) => write!(f, "=~ {}", re),
}
}
}
@ -140,23 +135,15 @@ fn with_measurement_clause(i: &str) -> ParseResult<&str, MeasurementExpression>
"expected = or =~",
alt((
map(
tuple((
tag("=~"),
multispace0,
expect("expected regular expression literal", regex),
)),
|(_, _, regex)| MeasurementExpression::Regex(regex),
preceded(pair(tag("=~"), multispace0), qualified_measurement_name),
MeasurementExpression::Regex,
),
map(
tuple((
char('='),
multispace0,
expect(
"expected measurement name or wildcard",
measurement_name_expression,
),
)),
|(_, _, name)| MeasurementExpression::Equals(name),
preceded(
pair(tag("="), multispace0),
expect("expected measurement name", qualified_measurement_name),
),
MeasurementExpression::Equals,
),
)),
),
@ -200,6 +187,7 @@ pub fn show_measurements(i: &str) -> ParseResult<&str, ShowMeasurementsStatement
mod test {
use super::*;
use crate::assert_expect_error;
use crate::common::MeasurementName;
use crate::expression::arithmetic::Expr;
use assert_matches::assert_matches;
@ -232,7 +220,7 @@ mod test {
ShowMeasurementsStatement {
on_expression: Some(OnExpression::Database("foo".into())),
measurement_expression: Some(MeasurementExpression::Equals(
MeasurementNameExpression {
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: "bar".into(),
@ -255,7 +243,9 @@ mod test {
got,
ShowMeasurementsStatement {
on_expression: Some(OnExpression::Database("foo".into())),
measurement_expression: Some(MeasurementExpression::Regex(Regex("bar".into()))),
measurement_expression: Some(MeasurementExpression::Regex(
QualifiedMeasurementName::new(MeasurementName::Regex("bar".into()))
)),
condition: Some(Expr::Literal(true.into()).into()),
limit: None,
offset: None
@ -343,33 +333,50 @@ mod test {
#[test]
fn test_with_measurement_clause() {
use crate::common::MeasurementName::*;
let (_, got) = with_measurement_clause("WITH measurement = foo").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(MeasurementNameExpression {
database: None,
retention_policy: None,
name: "foo".into()
})
MeasurementExpression::Equals(QualifiedMeasurementName::new(Name("foo".into())))
);
let (_, got) = with_measurement_clause("WITH measurement =~ /foo/").unwrap();
assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into())));
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Expressions are still valid when whitespace is omitted
let (_, got) = with_measurement_clause("WITH measurement=foo..bar").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(MeasurementNameExpression {
database: Some("foo".into()),
retention_policy: None,
name: "bar".into()
})
MeasurementExpression::Equals(QualifiedMeasurementName::new_db(
Name("bar".into()),
"foo".into()
))
);
let (_, got) = with_measurement_clause("WITH measurement=~/foo/").unwrap();
assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into())));
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Quirks of InfluxQL per https://github.com/influxdata/influxdb_iox/issues/5662
let (_, got) = with_measurement_clause("WITH measurement =~ foo").unwrap();
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Name("foo".into())))
);
let (_, got) = with_measurement_clause("WITH measurement = /foo/").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Fallible cases
@ -379,28 +386,16 @@ mod test {
"invalid WITH clause, expected MEASUREMENT"
);
// Must have a regex for equal regex operator
assert_expect_error!(
with_measurement_clause("WITH measurement =~ foo"),
"expected regular expression literal"
);
// Unsupported regex not equal operator
assert_expect_error!(
with_measurement_clause("WITH measurement !~ foo"),
"expected = or =~"
);
// Must have an identifier for equal operator
assert_expect_error!(
with_measurement_clause("WITH measurement = /foo/"),
"expected measurement name or wildcard"
);
// Must have an identifier
assert_expect_error!(
with_measurement_clause("WITH measurement = 1"),
"expected measurement name or wildcard"
"expected measurement name"
);
}
}

View File

@ -1,41 +1,12 @@
use crate::common::{measurement_name_expression, MeasurementNameExpression, OneOrMore, Parser};
use crate::common::{
qualified_measurement_name, MeasurementName, OneOrMore, Parser, QualifiedMeasurementName,
};
use crate::identifier::{identifier, Identifier};
use crate::internal::ParseResult;
use crate::string::{regex, Regex};
use nom::branch::alt;
use nom::bytes::complete::tag_no_case;
use nom::character::complete::multispace1;
use nom::combinator::map;
use nom::sequence::{pair, preceded};
use std::fmt;
use std::fmt::Formatter;
/// Represents a single measurement selection found in a `FROM` measurement clause.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MeasurementSelection<T: Parser> {
Name(T),
Regex(Regex),
}
impl<T: Parser> Parser for MeasurementSelection<T> {
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(T::parse, MeasurementSelection::Name),
map(regex, MeasurementSelection::Regex),
))(i)
}
}
impl<T: fmt::Display + Parser> fmt::Display for MeasurementSelection<T> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Name(ref name) => fmt::Display::fmt(name, f)?,
Self::Regex(ref re) => fmt::Display::fmt(re, f)?,
};
Ok(())
}
}
/// Represents a `FROM` clause of a `DELETE` or `SHOW` statement.
///
@ -43,7 +14,7 @@ impl<T: fmt::Display + Parser> fmt::Display for MeasurementSelection<T> {
/// for measurements names.
///
/// A `FROM` clause for a number of `SHOW` statements can accept a 3-part measurement name or
pub type FromMeasurementClause<U> = OneOrMore<MeasurementSelection<U>>;
pub type FromMeasurementClause<U> = OneOrMore<U>;
fn from_clause<T: Parser + fmt::Display>(i: &str) -> ParseResult<&str, FromMeasurementClause<T>> {
preceded(
@ -54,9 +25,9 @@ fn from_clause<T: Parser + fmt::Display>(i: &str) -> ParseResult<&str, FromMeasu
)(i)
}
impl Parser for MeasurementNameExpression {
impl Parser for QualifiedMeasurementName {
fn parse(i: &str) -> ParseResult<&str, Self> {
measurement_name_expression(i)
qualified_measurement_name(i)
}
}
@ -68,10 +39,9 @@ impl Parser for MeasurementNameExpression {
/// It is defined by the following EBNF notation:
///
/// ```text
/// from_clause ::= "FROM" measurement_selection ("," measurement_selection)*
/// measurement_selection ::= measurement
/// from_clause ::= "FROM" qualified_measurement_name ("," qualified_measurement_name)*
///
/// measurement ::= measurement_name |
/// qualified_measurement_name ::= measurement_name |
/// ( policy_name "." measurement_name ) |
/// ( db_name "." policy_name? "." measurement_name )
///
@ -92,7 +62,7 @@ impl Parser for MeasurementNameExpression {
/// ```text
/// FROM foo, /bar/, some_database..foo, some_retention_policy.foobar
/// ```
pub type ShowFromClause = FromMeasurementClause<MeasurementNameExpression>;
pub type ShowFromClause = FromMeasurementClause<QualifiedMeasurementName>;
/// Parse a `FROM` clause for various `SHOW` statements.
pub fn show_from_clause(i: &str) -> ParseResult<&str, ShowFromClause> {
@ -106,7 +76,7 @@ impl Parser for Identifier {
}
/// Represents a `FROM` clause for a `DELETE` statement.
pub type DeleteFromClause = FromMeasurementClause<Identifier>;
pub type DeleteFromClause = FromMeasurementClause<MeasurementName>;
/// Parse a `FROM` clause for a `DELETE` statement.
pub fn delete_from_clause(i: &str) -> ParseResult<&str, DeleteFromClause> {
@ -119,49 +89,52 @@ mod test {
#[test]
fn test_show_from_clause() {
use crate::simple_from_clause::MeasurementSelection::*;
use crate::common::MeasurementName::*;
let (_, from) = show_from_clause("FROM c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new("c".into()))])
ShowFromClause::new(vec![QualifiedMeasurementName::new(Name("c".into()))])
);
let (_, from) = show_from_clause("FROM a..c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db(
"c".into(),
ShowFromClause::new(vec![QualifiedMeasurementName::new_db(
Name("c".into()),
"a".into()
))])
)])
);
let (_, from) = show_from_clause("FROM a.b.c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db_rp(
"c".into(),
ShowFromClause::new(vec![QualifiedMeasurementName::new_db_rp(
Name("c".into()),
"a".into(),
"b".into()
))])
)])
);
let (_, from) = show_from_clause("FROM /reg/").unwrap();
assert_eq!(from, ShowFromClause::new(vec![Regex("reg".into())]));
assert_eq!(
from,
ShowFromClause::new(vec![QualifiedMeasurementName::new(Regex("reg".into()))])
);
let (_, from) = show_from_clause("FROM c, /reg/").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![
Name(MeasurementNameExpression::new("c".into())),
Regex("reg".into())
QualifiedMeasurementName::new(Name("c".into())),
QualifiedMeasurementName::new(Regex("reg".into()))
])
);
}
#[test]
fn test_delete_from_clause() {
use crate::simple_from_clause::MeasurementSelection::*;
use crate::common::MeasurementName::*;
let (_, from) = delete_from_clause("FROM c").unwrap();
assert_eq!(from, DeleteFromClause::new(vec![Name("c".into())]));

View File

@ -1,5 +1,6 @@
use crate::delete::{delete_statement, DeleteStatement};
use crate::drop::{drop_statement, DropMeasurementStatement};
use crate::explain::{explain_statement, ExplainStatement};
use crate::internal::ParseResult;
use crate::select::{select_statement, SelectStatement};
use crate::show::{show_statement, ShowDatabasesStatement};
@ -19,6 +20,8 @@ pub enum Statement {
Delete(Box<DeleteStatement>),
/// Represents a `DROP MEASUREMENT` statement.
DropMeasurement(Box<DropMeasurementStatement>),
/// Represents an `EXPLAIN` statement.
Explain(Box<ExplainStatement>),
/// Represents a `SELECT` statement.
Select(Box<SelectStatement>),
/// Represents a `SHOW DATABASES` statement.
@ -40,6 +43,7 @@ impl Display for Statement {
match self {
Self::Delete(s) => Display::fmt(s, f),
Self::DropMeasurement(s) => Display::fmt(s, f),
Self::Explain(s) => Display::fmt(s, f),
Self::Select(s) => Display::fmt(s, f),
Self::ShowDatabases(s) => Display::fmt(s, f),
Self::ShowMeasurements(s) => Display::fmt(s, f),
@ -56,6 +60,7 @@ pub fn statement(i: &str) -> ParseResult<&str, Statement> {
alt((
map(delete_statement, |s| Statement::Delete(Box::new(s))),
map(drop_statement, |s| Statement::DropMeasurement(Box::new(s))),
map(explain_statement, |s| Statement::Explain(Box::new(s))),
map(select_statement, |s| Statement::Select(Box::new(s))),
show_statement,
))(i)
@ -77,6 +82,10 @@ mod test {
let (got, _) = statement("DROP MEASUREMENT foo").unwrap();
assert_eq!(got, "");
// explain_statement combinator
let (got, _) = statement("EXPLAIN SELECT * FROM cpu").unwrap();
assert_eq!(got, "");
let (got, _) = statement("SELECT * FROM foo WHERE time > now() - 5m AND host = 'bar' GROUP BY TIME(5m) FILL(previous) ORDER BY time DESC").unwrap();
assert_eq!(got, "");

View File

@ -25,7 +25,7 @@ ioxd_querier = { path = "../ioxd_querier"}
ioxd_router = { path = "../ioxd_router"}
ioxd_test = { path = "../ioxd_test"}
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
object_store_metrics = { path = "../object_store_metrics" }
observability_deps = { path = "../observability_deps" }
panic_logging = { path = "../panic_logging" }
@ -47,6 +47,8 @@ clap = { version = "4", features = ["derive", "env"] }
console-subscriber = { version = "0.1.8", optional = true, features = ["parking_lot"] }
dotenvy = "0.15.5"
futures = "0.3"
futures-util = { version = "0.3" }
flate2 = "1.0"
hashbrown = "0.12"
http = "0.2.8"
humantime = "2.1.0"
@ -55,7 +57,7 @@ libc = { version = "0.2" }
num_cpus = "1.13.0"
once_cell = { version = "1.15.0", features = ["parking_lot"] }
rustyline = { version = "10.0", default-features = false }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
thiserror = "1.0.37"
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }

View File

@ -53,7 +53,7 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
enum QueryEngine {
/// Run queries against the named database on the remote server
/// Run queries against the namespace on the remote server
Remote(String),
/// Run queries against a local `Observer` instance
@ -177,7 +177,7 @@ pub struct Repl {
/// Client for running sql
flight_client: influxdb_iox_client::flight::Client,
/// database name against which SQL commands are run
/// namespace name against which SQL commands are run
query_engine: Option<QueryEngine>,
/// Formatter to use to format query results
@ -239,8 +239,8 @@ impl Repl {
.map_err(|e| println!("{}", e))
.ok();
}
ReplCommand::UseDatabase { db_name } => {
self.use_database(db_name);
ReplCommand::UseNamespace { db_name } => {
self.use_namespace(db_name);
}
ReplCommand::SqlCommand { sql } => {
self.run_sql(sql).await.map_err(|e| println!("{}", e)).ok();
@ -302,18 +302,18 @@ impl Repl {
self.print_results(&[record_batch])
}
// Run a command against the currently selected remote database
// Run a command against the currently selected remote namespace
async fn run_sql(&mut self, sql: String) -> Result<()> {
let start = Instant::now();
let batches = match &mut self.query_engine {
None => {
println!("Error: no database selected.");
println!("Hint: Run USE DATABASE <dbname> to select database");
println!("Error: no namespace selected.");
println!("Hint: Run USE NAMESPACE <dbname> to select namespace");
return Ok(());
}
Some(QueryEngine::Remote(db_name)) => {
info!(%db_name, %sql, "Running sql on remote database");
info!(%db_name, %sql, "Running sql on remote namespace");
scrape_query(&mut self.flight_client, db_name, &sql).await?
}
@ -349,9 +349,9 @@ impl Repl {
}
}
fn use_database(&mut self, db_name: String) {
info!(%db_name, "setting current database");
println!("You are now in remote mode, querying database {}", db_name);
fn use_namespace(&mut self, db_name: String) {
info!(%db_name, "setting current namespace");
println!("You are now in remote mode, querying namespace {}", db_name);
self.set_query_engine(QueryEngine::Remote(db_name));
}

View File

@ -7,7 +7,7 @@ pub enum ReplCommand {
ShowNamespaces,
Observer,
SetFormat { format: String },
UseDatabase { db_name: String },
UseNamespace { db_name: String },
SqlCommand { sql: String },
Exit,
}
@ -64,18 +64,18 @@ impl TryFrom<&str> for ReplCommand {
["observer"] => Ok(Self::Observer),
["exit"] => Ok(Self::Exit),
["quit"] => Ok(Self::Exit),
["use", "database"] => {
Err("name not specified. Usage: USE DATABASE <name>".to_string())
} // USE DATABASE
["use", "database", _name] => {
// USE DATABASE <name>
Ok(Self::UseDatabase {
["use", "namespace"] => {
Err("name not specified. Usage: USE NAMESPACE <name>".to_string())
} // USE NAMESPACE
["use", "namespace", _name] => {
// USE namespace <name>
Ok(Self::UseNamespace {
db_name: raw_commands[2].to_string(),
})
}
["use", _command] => {
// USE <name>
Ok(Self::UseDatabase {
Ok(Self::UseNamespace {
db_name: raw_commands[1].to_string(),
})
}
@ -98,9 +98,9 @@ impl ReplCommand {
Available commands (not case sensitive):
HELP (this one)
SHOW NAMESPACES: List databases available on the server
SHOW NAMESPACES: List namespaces available on the server
USE [DATABASE|NAMESPACE] <name>: Set the current remote database to name
USE NAMESPACE <name>: Set the current remote namespace to name
SET FORMAT <format>: Set the output format to Pretty, csv or json
@ -108,9 +108,9 @@ OBSERVER: Locally query unified queryable views of remote system tables
[EXIT | QUIT]: Quit this session and exit the program
# Examples: use remote database foo
SHOW DATABASES;
USE DATABASE foo;
# Examples: use remote namespace foo
SHOW NAMESPACES;
USE foo;
# Basic IOx SQL Primer
@ -199,35 +199,35 @@ mod tests {
}
#[test]
fn use_database() {
let expected = Ok(ReplCommand::UseDatabase {
fn use_namespace() {
let expected = Ok(ReplCommand::UseNamespace {
db_name: "Foo".to_string(),
});
assert_eq!("use Foo".try_into(), expected);
assert_eq!("use Database Foo;".try_into(), expected);
assert_eq!("use Database Foo ;".try_into(), expected);
assert_eq!(" use Database Foo; ".try_into(), expected);
assert_eq!(" use Database Foo; ".try_into(), expected);
assert_eq!("use Namespace Foo;".try_into(), expected);
assert_eq!("use Namespace Foo ;".try_into(), expected);
assert_eq!(" use Namespace Foo; ".try_into(), expected);
assert_eq!(" use Namespace Foo; ".try_into(), expected);
// ensure that database name is case sensitive
let expected = Ok(ReplCommand::UseDatabase {
// ensure that namespace name is case sensitive
let expected = Ok(ReplCommand::UseNamespace {
db_name: "FOO".to_string(),
});
assert_eq!("use FOO".try_into(), expected);
assert_eq!("use DATABASE FOO;".try_into(), expected);
assert_eq!("USE DATABASE FOO;".try_into(), expected);
assert_eq!("use NAMESPACE FOO;".try_into(), expected);
assert_eq!("USE NAMESPACE FOO;".try_into(), expected);
let expected: Result<ReplCommand, String> =
Err("name not specified. Usage: USE DATABASE <name>".to_string());
assert_eq!("use Database;".try_into(), expected);
assert_eq!("use DATABASE".try_into(), expected);
assert_eq!("use database".try_into(), expected);
Err("name not specified. Usage: USE NAMESPACE <name>".to_string());
assert_eq!("use Namespace;".try_into(), expected);
assert_eq!("use NAMESPACE".try_into(), expected);
assert_eq!("use namespace".try_into(), expected);
let expected = sql_cmd("use database foo bar");
assert_eq!("use database foo bar".try_into(), expected);
let expected = sql_cmd("use namespace foo bar");
assert_eq!("use namespace foo bar".try_into(), expected);
let expected = sql_cmd("use database foo BAR");
assert_eq!("use database foo BAR".try_into(), expected);
let expected = sql_cmd("use namespace foo BAR");
assert_eq!("use namespace foo BAR".try_into(), expected);
}
#[test]

View File

@ -1,6 +1,14 @@
use futures::StreamExt;
use influxdb_iox_client::{connection::Connection, write};
use snafu::{ResultExt, Snafu};
use std::{fs::File, io::Read, path::PathBuf};
use observability_deps::tracing::info;
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{
fs::File,
io::{BufReader, Read},
num::NonZeroUsize,
path::PathBuf,
time::Instant,
};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
@ -11,10 +19,30 @@ pub enum Error {
source: std::io::Error,
},
#[snafu(display("Error reading files: {:#?}", sources))]
ReadingFiles { sources: Vec<Error> },
#[snafu(display("Client error: {source}"))]
ClientError {
source: influxdb_iox_client::error::Error,
},
#[snafu(display("Error converting parquet: {}", source))]
Conversion {
source: parquet_to_line_protocol::Error,
},
#[snafu(display("Line protocol was not valid utf8: {}", source))]
InvalidUtf8 { source: std::string::FromUtf8Error },
#[snafu(display("Error decoding gzip {:?}: {}", file_name, source))]
Gz {
file_name: PathBuf,
source: std::io::Error,
},
#[snafu(display("Max concurrent uploads must be greater than zero"))]
MaxConcurrentUploadsVerfication,
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -22,36 +50,176 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Write data into the specified database
#[derive(Debug, clap::Parser)]
pub struct Config {
/// If specified, restricts the maximum amount of line protocol
/// sent per request to this many bytes. Defaults to 1MB
#[clap(action, long, short = 'b', default_value = "1048576")]
max_request_payload_size_bytes: usize,
/// Uploads up to this many http requests at a time. Defaults to 10
#[clap(action, long, short = 'c', default_value = "10")]
max_concurrent_uploads: usize,
/// The namespace into which to write
#[clap(action)]
namespace: String,
/// File with data to load. Currently supported formats are .lp
/// File(s) with data to load. Currently supported formats are .lp (line protocol),
/// .parquet (IOx created parquet files), and .gz (gzipped line protocol)
#[clap(action)]
file_name: PathBuf,
file_names: Vec<PathBuf>,
}
pub async fn command(connection: Connection, config: Config) -> Result<()> {
let start = Instant::now();
let Config {
namespace,
file_name,
file_names,
max_request_payload_size_bytes,
max_concurrent_uploads,
} = config;
let file_name = &file_name;
let mut file = File::open(file_name).context(ReadingFileSnafu { file_name })?;
let max_concurrent_uploads =
NonZeroUsize::new(max_concurrent_uploads).context(MaxConcurrentUploadsVerficationSnafu)?;
let mut lp_data = String::new();
file.read_to_string(&mut lp_data)
.context(ReadingFileSnafu { file_name })?;
info!(
num_files = file_names.len(),
max_request_payload_size_bytes, max_concurrent_uploads, "Beginning upload"
);
let mut client = write::Client::new(connection);
// first pass is to check that all the files exist and can be
// opened and if not fail fast.
let file_open_errors: Vec<_> = file_names
.iter()
.filter_map(|file_name| {
File::open(file_name)
.context(ReadingFileSnafu { file_name })
.err()
})
.collect();
ensure!(
file_open_errors.is_empty(),
ReadingFilesSnafu {
sources: file_open_errors
}
);
// if everything looked good, go through and read the files out
// them potentially in parallel.
let lp_stream = futures_util::stream::iter(file_names)
.map(|file_name| tokio::task::spawn(slurp_file(file_name)))
// Since the contents of each file are buffered into a string,
// limit the number that are open at once to the maximum
// possible uploads
.buffered(max_concurrent_uploads.into())
// warn and skip any errors
.filter_map(|res| async move {
match res {
Ok(Ok(lp_data)) => Some(lp_data),
Ok(Err(e)) => {
eprintln!("WARNING: ignoring error : {}", e);
None
}
Err(e) => {
eprintln!("WARNING: ignoring task fail: {}", e);
None
}
}
});
let mut client = write::Client::new(connection)
.with_max_concurrent_uploads(max_concurrent_uploads)
.with_max_request_payload_size_bytes(Some(max_request_payload_size_bytes));
let total_bytes = client
.write_lp(namespace, lp_data)
.write_lp_stream(namespace, lp_stream)
.await
.context(ClientSnafu)?;
println!("{} Bytes OK", total_bytes);
let elapsed = Instant::now() - start;
let mb = (total_bytes as f64) / (1024.0 * 1024.0);
let mb_per_sec = (mb / (elapsed.as_millis() as f64)) * (1000.0);
println!("{total_bytes} Bytes OK in {elapsed:?}. {mb_per_sec:.2} MB/sec");
Ok(())
}
/// Reads the contents of `file_name` into a string
///
/// .parquet files --> IOx created parquet files (converted to line protocol)
/// .gz --> treated as gzipped line protocol
/// .lp (or anything else) --> treated as raw line protocol
///
async fn slurp_file(file_name: PathBuf) -> Result<String> {
    // Borrow so the path can be reused in multiple snafu contexts below.
    let file_name = &file_name;
    // Dispatch on the lowercased file extension; files with no extension
    // fall through to the plain line protocol branch.
    let extension = file_name
        .extension()
        .map(|extension| extension.to_ascii_lowercase());
    match extension {
        // Transform parquet to line protocol prior to upload
        // Not the most efficient process, but it is expedient
        Some(extension) if extension.to_string_lossy() == "parquet" => {
            let mut lp_data = vec![];
            // Converts the parquet file contents to line protocol bytes.
            parquet_to_line_protocol::convert_file(file_name, &mut lp_data)
                .await
                .context(ConversionSnafu)?;
            // Line protocol is text; reject non-UTF-8 conversion output.
            let lp_data = String::from_utf8(lp_data).context(InvalidUtf8Snafu)?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol from parquet file"
            );
            Ok(lp_data)
        }
        // decompress as gz
        Some(extension) if extension.to_string_lossy() == "gz" => {
            let mut lp_data = String::new();
            let reader =
                BufReader::new(File::open(&file_name).context(ReadingFileSnafu { file_name })?);
            // Decompress the entire gzipped stream into memory as a string.
            flate2::read::GzDecoder::new(reader)
                .read_to_string(&mut lp_data)
                .context(GzSnafu { file_name })?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol from gzipped line protocol file"
            );
            Ok(lp_data)
        }
        // anything else, treat as line protocol
        Some(_) | None => {
            let lp_data =
                std::fs::read_to_string(file_name).context(ReadingFileSnafu { file_name })?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol file"
            );
            Ok(lp_data)
        }
    }
}
#[cfg(test)]
mod test {
    use clap::Parser;
    use influxdb_iox_client::write::DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES;
    use super::*;

    /// Guards against the CLI flag default (`-b` / `--max-request-payload-size-bytes`)
    /// drifting away from the client library's own default value.
    #[test]
    fn command_default_is_same_as_client_default() {
        // Parse with only positional args so the flag takes its default.
        let config = Config::try_parse_from(vec!["my_db", "file1"]).unwrap();
        assert_eq!(
            Some(config.max_request_payload_size_bytes),
            DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES
        );
    }
}

View File

@ -6,7 +6,6 @@ use predicates::prelude::*;
use serde_json::Value;
use std::time::{Duration, Instant};
use tempfile::tempdir;
use test_helpers::make_temp_file;
use test_helpers_end_to_end::{
maybe_skip_integration, AddAddrEnv, BindAddresses, MiniCluster, ServerType, Step, StepTest,
StepTestState,
@ -526,9 +525,6 @@ async fn write_and_query() {
vec![
Step::Custom(Box::new(|state: &mut StepTestState| {
async {
// write line protocol to a temp file
let lp_file = make_temp_file("m,tag=1 v=2 12345");
let lp_file_path = lp_file.path().to_string_lossy().to_string();
let router_addr = state.cluster().router().router_http_base().to_string();
let namespace = state.cluster().namespace();
@ -537,53 +533,48 @@ async fn write_and_query() {
// Validate the output of the schema CLI command
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("-v")
.arg("-h")
.arg(&router_addr)
.arg("write")
.arg(&namespace)
.arg(&lp_file_path)
// raw line protocol ('h2o_temperature' measurement)
.arg("../test_fixtures/lineproto/air_and_water.lp")
// gzipped line protocol ('m0')
.arg("../test_fixtures/lineproto/read_filter.lp.gz")
// iox formatted parquet ('cpu' measurement)
.arg("../test_fixtures/cpu.parquet")
.assert()
.success()
.stdout(predicate::str::contains("17 Bytes OK"));
// this number is the total size of
// uncompressed line protocol stored in all
// three files
.stdout(predicate::str::contains("1137058 Bytes OK"));
}
.boxed()
})),
Step::Custom(Box::new(|state: &mut StepTestState| {
async {
let querier_addr = state.cluster().querier().querier_grpc_base().to_string();
let namespace = state.cluster().namespace();
// data from 'air_and_water.lp'
wait_for_query_result(
state,
"SELECT * from h2o_temperature order by time desc limit 10",
"| 51.3 | coyote_creek | CA | 55.1 | 1970-01-01T00:00:01.568756160Z |"
).await;
let max_wait_time = Duration::from_secs(10);
let expected = "| 1 | 1970-01-01T00:00:00.000012345Z | 2 |";
println!("Waiting for {expected}");
// data from 'read_filter.lp.gz'
wait_for_query_result(
state,
"SELECT * from m0 order by time desc limit 10;",
"| value1 | value9 | value9 | value49 | value0 | 2021-04-26T13:47:39.727574Z | 1 |"
).await;
// Validate the output of running the query CLI command appears after at most max_wait_time
let end = Instant::now() + max_wait_time;
while Instant::now() < end {
let maybe_result = Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("-h")
.arg(&querier_addr)
.arg("query")
.arg(&namespace)
.arg("SELECT * from m")
.assert()
.success()
.try_stdout(predicate::str::contains(expected));
match maybe_result {
Err(e) => {
println!("Got err: {}, retrying", e);
}
Ok(r) => {
println!("Success: {:?}", r);
return;
}
}
// sleep and try again
tokio::time::sleep(Duration::from_millis(500)).await
}
panic!("Did not find expected output in allotted time");
// data from 'cpu.parquet'
wait_for_query_result(
state,
"SELECT * from cpu where cpu = 'cpu2' order by time desc limit 10",
"cpu2 | MacBook-Pro-8.hsd1.ma.comcast.net | 2022-09-30T12:55:00Z"
).await;
}
.boxed()
})),
@ -593,6 +584,53 @@ async fn write_and_query() {
.await
}
/// Runs the specified query in a loop for up to 10 seconds, waiting
/// for the specified output to appear.
///
/// Panics if `expected` does not appear in the CLI query output
/// within the allotted time.
async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, expected: &str) {
    let querier_addr = state.cluster().querier().querier_grpc_base().to_string();
    let namespace = state.cluster().namespace();

    let max_wait_time = Duration::from_secs(10);
    println!("Waiting for {expected}");

    // Validate the output of running the query CLI command appears after at most max_wait_time
    let end = Instant::now() + max_wait_time;
    while Instant::now() < end {
        let assert = Command::cargo_bin("influxdb_iox")
            .unwrap()
            .arg("-h")
            .arg(&querier_addr)
            .arg("query")
            .arg(&namespace)
            .arg(query_sql)
            .assert();

        // NOTE: previously a failed command hit `continue` and skipped the
        // sleep below, retrying in a tight loop. Fall through instead so
        // every retry (command failure OR output mismatch) backs off.
        match assert.try_success() {
            Err(e) => {
                println!("Got err running command: {}, retrying", e);
            }
            Ok(a) => match a.try_stdout(predicate::str::contains(expected)) {
                Err(e) => {
                    println!("No match: {}, retrying", e);
                }
                Ok(r) => {
                    println!("Success: {:?}", r);
                    return;
                }
            },
        }

        // sleep and try again
        tokio::time::sleep(Duration::from_secs(1)).await
    }
    panic!(
        "Did not find expected output {} within {:?}",
        expected, max_wait_time
    );
}
/// Test the schema cli command
#[tokio::test]
async fn namespaces_cli() {

View File

@ -52,7 +52,6 @@ async fn ingester_flight_api() {
partition_id,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None
})
},
);

View File

@ -7,7 +7,8 @@ use futures::FutureExt;
use predicates::prelude::*;
use test_helpers::assert_contains;
use test_helpers_end_to_end::{
maybe_skip_integration, run_query, MiniCluster, Step, StepTest, StepTestState, TestConfig,
maybe_skip_integration, run_query, try_run_query, GrpcRequestBuilder, MiniCluster, Step,
StepTest, StepTestState, TestConfig,
};
#[tokio::test]
@ -454,6 +455,87 @@ async fn issue_4631_b() {
.await
}
#[tokio::test]
async fn oom_protection() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let router_config = TestConfig::new_router(&database_url);
let ingester_config = TestConfig::new_ingester(&router_config);
let querier_config =
TestConfig::new_querier(&ingester_config).with_querier_max_table_query_bytes(1);
let mut cluster = MiniCluster::new()
.with_router(router_config)
.await
.with_ingester(ingester_config)
.await
.with_querier(querier_config)
.await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!("{},tag1=A,tag2=B val=42i 123457", table_name)),
Step::WaitForReadable,
Step::AssertNotPersisted,
// SQL query
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let sql = format!("select * from {}", table_name);
let err = try_run_query(
sql,
state.cluster().namespace(),
state.cluster().querier().querier_grpc_connection(),
)
.await
.unwrap_err();
if let influxdb_iox_client::flight::Error::GrpcError(status) = err {
assert_eq!(
status.code(),
tonic::Code::ResourceExhausted,
"Wrong status code: {}\n\nStatus:\n{}",
status.code(),
status,
);
} else {
panic!("Not a gRPC error: {err}");
}
}
.boxed()
})),
// InfluxRPC/storage query
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut storage_client = state.cluster().querier_storage_client();
let read_filter_request = GrpcRequestBuilder::new()
.source(state.cluster())
.build_read_filter();
let status = storage_client
.read_filter(read_filter_request)
.await
.unwrap_err();
assert_eq!(
status.code(),
tonic::Code::ResourceExhausted,
"Wrong status code: {}\n\nStatus:\n{}",
status.code(),
status,
);
}
.boxed()
})),
],
)
.run()
.await
}
/// This structure holds information for tests that need to force a parquet file to be persisted
struct ForcePersistenceSetup {
// Set up a cluster that will will persist quickly

View File

@ -13,6 +13,7 @@ format = ["arrow", "arrow_util"]
# Workspace dependencies, in alphabetical order
arrow_util = { path = "../arrow_util", optional = true }
client_util = { path = "../client_util" }
influxdb_line_protocol = { path = "../influxdb_line_protocol"}
generated_types = { path = "../generated_types", default-features = false, features = ["data_types_conversions"] }
# Crates.io dependencies, in alphabetical order
@ -23,9 +24,7 @@ futures-util = { version = "0.3", optional = true }
prost = "0.11"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] }
tokio-stream = "0.1.11"
thiserror = "1.0.37"
tonic = { version = "0.8" }
[dev-dependencies] # In alphabetical order
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] }
mockito = "0.31"

View File

@ -1,15 +1,16 @@
/// Re-export of the protobuf write types from the `generated_types`
/// crate (`influxdata.pbdata.v1`) used by this client.
pub mod generated_types {
    pub use generated_types::influxdata::pbdata::v1::*;
}
use std::{fmt::Debug, num::NonZeroUsize, sync::Arc};
use client_util::{connection::HttpConnection, namespace_translation::split_namespace};
use futures_util::{future::BoxFuture, FutureExt, Stream, StreamExt, TryStreamExt};
use crate::{
connection::Connection,
error::{translate_response, Error},
};
use reqwest::Method;
use reqwest::{Body, Method};
/// The default value for the maximum size of each request, in bytes
pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option<usize> = Some(1024 * 1024);
/// An IOx Write API client.
///
@ -37,18 +38,67 @@ use reqwest::Method;
/// ```
#[derive(Debug, Clone)]
pub struct Client {
inner: HttpConnection,
/// The inner client used to actually make requests.
///
/// Uses a trait for test mocking.
///
/// Does not expose the trait in the `Client` type to avoid
/// exposing an internal implementation detail (the trait) in the
/// public interface.
inner: Arc<dyn RequestMaker>,
/// If `Some`, restricts the maximum amount of line protocol
/// sent per request to this many bytes. If `None`, does not restrict
/// the amount sent per request. Defaults to `Some(1MB)`
///
/// Splitting the upload size consumes a non trivial amount of CPU
/// to find line protocol boundaries. This can be disabled by
/// setting `max_request_payload_size_bytes` to `None`.
max_request_payload_size_bytes: Option<usize>,
/// Makes this many concurrent requests at a time. Defaults to 1
max_concurrent_uploads: NonZeroUsize,
}
impl Client {
/// Creates a new client with the provided connection
pub fn new(connection: Connection) -> Self {
Self::new_with_maker(Arc::new(connection.into_http_connection()))
}
/// Creates a new client with the provided request maker
fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
Self {
inner: connection.into_http_connection(),
inner,
max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES,
max_concurrent_uploads: NonZeroUsize::new(1).unwrap(),
}
}
/// Write the [LineProtocol] formatted data in `lp_data` to
/// Override the default of sending 1MB of line protocol per request.
/// If `Some` is specified, restricts the maximum amount of line protocol
/// sent per request to this many bytes. If `None`, does not restrict the amount of
/// line protocol sent per request.
pub fn with_max_request_payload_size_bytes(
self,
max_request_payload_size_bytes: Option<usize>,
) -> Self {
Self {
max_request_payload_size_bytes,
..self
}
}
/// The client makes this many concurrent uploads at a
/// time. Defaults to 1.
pub fn with_max_concurrent_uploads(self, max_concurrent_uploads: NonZeroUsize) -> Self {
Self {
max_concurrent_uploads,
..self
}
}
/// Write the [LineProtocol] formatted string in `lp_data` to
/// namespace `namespace`.
///
/// Returns the number of bytes which were written to the database
@ -59,11 +109,24 @@ impl Client {
namespace: impl AsRef<str> + Send,
lp_data: impl Into<String> + Send,
) -> Result<usize, Error> {
let lp_data = lp_data.into();
let data_len = lp_data.len();
let sources = futures_util::stream::iter([lp_data.into()]);
let write_url = format!("{}api/v2/write", self.inner.uri());
self.write_lp_stream(namespace, sources).await
}
/// Write the stream of [LineProtocol] formatted strings in
/// `sources` to namespace `namespace`. It is assumed that
/// individual lines (points) do not cross these strings
///
/// Returns the number of bytes, in total, which were written to
/// the database
///
/// [LineProtocol]: https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#data-types-and-format
pub async fn write_lp_stream(
&mut self,
namespace: impl AsRef<str> + Send,
sources: impl Stream<Item = String> + Send,
) -> Result<usize, Error> {
let (org_id, bucket_id) = split_namespace(namespace.as_ref()).map_err(|e| {
Error::invalid_argument(
"namespace",
@ -71,47 +134,302 @@ impl Client {
)
})?;
let response = self
.inner
.client()
.request(Method::POST, &write_url)
.query(&[("bucket", bucket_id), ("org", org_id)])
.body(lp_data)
.send()
let max_concurrent_uploads: usize = self.max_concurrent_uploads.into();
let max_request_payload_size_bytes = self.max_request_payload_size_bytes;
// make a stream and process in parallel
let results = sources
// split each input source in parallel, if possible
.flat_map(|source| {
split_lp(
source,
max_request_payload_size_bytes,
max_concurrent_uploads,
)
})
// do the actual write
.map(|source| {
let org_id = org_id.to_string();
let bucket_id = bucket_id.to_string();
let inner = Arc::clone(&self.inner);
tokio::task::spawn(
async move { inner.write_source(org_id, bucket_id, source).await },
)
})
// Do the uploads in parallel
.buffered(max_concurrent_uploads)
.try_collect::<Vec<_>>()
// handle panics in tasks
.await
.map_err(Error::client)?;
.map_err(Error::client)?
// find / return any errors
.into_iter()
.collect::<Result<Vec<_>, Error>>()?;
translate_response(response).await?;
Ok(results.into_iter().sum())
}
}
Ok(data_len)
/// Something that knows how to send http data. Exists so it can be
/// mocked out for testing.
trait RequestMaker: Debug + Send + Sync {
    /// Writes `body` to the specified org and bucket, returning the
    /// number of bytes that were written.
    ///
    /// Returns a boxed future (rather than using `async fn`) to avoid
    /// pulling in `async_trait`.
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>>;
}
impl RequestMaker for HttpConnection {
    /// POSTs `body` to the `api/v2/write` endpoint for the given org
    /// and bucket, returning the number of bytes sent.
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>> {
        let write_url = format!("{}api/v2/write", self.uri());

        async move {
            // Measure the payload while it is still a `String`:
            // `Body::as_bytes()` returns `None` for non-buffered bodies,
            // and the previous `unwrap_or(0)` would then silently report
            // zero bytes written.
            let data_len = body.len();
            let body: Body = body.into();

            let response = self
                .client()
                .request(Method::POST, &write_url)
                .query(&[("bucket", bucket_id), ("org", org_id)])
                .body(body)
                .send()
                .await
                .map_err(Error::client)?;

            translate_response(response).await?;
            Ok(data_len)
        }
        .boxed()
    }
}
/// Splits the input line protocol into one or more chunks of at most
/// `max_chunk_size` bytes, breaking only on line boundaries, in a
/// separate tokio task. If `max_chunk_size` is `None` the input is
/// forwarded unmodified as a single chunk.
fn split_lp(
    input: String,
    max_chunk_size: Option<usize>,
    max_concurrent_uploads: usize,
) -> impl Stream<Item = String> {
    // Channel depth bounds how many chunks can be buffered ahead of the uploads
    let (tx, rx) = tokio::sync::mpsc::channel(max_concurrent_uploads);

    tokio::task::spawn(async move {
        match max_chunk_size {
            None => {
                // ignore errors (means the receiver hung up but there is nothing to communicate)
                tx.send(input).await.ok();
            }
            Some(max_chunk_size) => {
                // use the actual line protocol parser to split on valid boundaries
                let mut acc = LineAccumulator::new(max_chunk_size);
                for l in influxdb_line_protocol::split_lines(&input) {
                    if let Some(chunk) = acc.push(l) {
                        // abort if the receiver has hung up
                        if tx.send(chunk).await.is_err() {
                            return;
                        }
                    }
                }
                // emit whatever partial chunk remains at end of input
                if let Some(chunk) = acc.flush() {
                    tx.send(chunk).await.ok();
                }
            }
        }
    });

    tokio_stream::wrappers::ReceiverStream::new(rx)
}
/// Accumulates individual line protocol lines into newline-joined
/// chunks of (approximately) at most `max_chunk_size` bytes. A single
/// line longer than the limit still becomes its own, oversized chunk.
#[derive(Debug)]
struct LineAccumulator {
    /// Chunk currently being filled
    current_chunk: String,
    /// Upper bound, in bytes, targeted for each emitted chunk
    max_chunk_size: usize,
}

impl LineAccumulator {
    /// Creates an empty accumulator targeting `max_chunk_size` byte chunks
    fn new(max_chunk_size: usize) -> Self {
        Self {
            current_chunk: String::with_capacity(max_chunk_size),
            max_chunk_size,
        }
    }

    /// Appends line `l` to the chunk under construction. If adding the
    /// line (plus its newline separator) would exceed the size limit,
    /// the completed chunk is returned and `l` starts a fresh one.
    fn push(&mut self, l: &str) -> Option<String> {
        let would_overflow = self.current_chunk.len() + l.len() + 1 > self.max_chunk_size;
        let completed = if would_overflow { self.flush() } else { None };

        if !self.current_chunk.is_empty() {
            self.current_chunk.push('\n');
        }
        self.current_chunk.push_str(l);

        completed
    }

    /// Takes the chunk built so far (if non-empty), leaving a fresh,
    /// pre-sized empty chunk in its place.
    fn flush(&mut self) -> Option<String> {
        if self.current_chunk.is_empty() {
            return None;
        }
        let replacement = String::with_capacity(self.max_chunk_size);
        Some(std::mem::replace(&mut self.current_chunk, replacement))
    }
}
#[cfg(test)]
mod tests {
use std::sync::Mutex;
use super::*;
use crate::connection::Builder;
#[tokio::test]
/// Ensure the basic plumbing is hooked up correctly
async fn basic() {
let url = mockito::server_url();
let connection = Builder::new().build(&url).await.unwrap();
async fn test() {
let mock = Arc::new(MockRequestMaker::new());
let namespace = "orgname_bucketname";
let data = "m,t=foo f=4";
let m = mockito::mock("POST", "/api/v2/write?bucket=bucketname&org=orgname")
.with_status(201)
.match_body(data)
.create();
let expected = vec![MockRequest {
org_id: "orgname".into(),
bucket_id: "bucketname".into(),
body: data.into(),
}];
let res = Client::new(connection).write_lp(namespace, data).await;
m.assert();
let num_bytes = res.expect("Error making write request");
let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _)
.write_lp(namespace, data)
.await
.unwrap();
assert_eq!(expected, mock.requests());
assert_eq!(num_bytes, 11);
}
#[tokio::test]
async fn test_max_request_payload_size() {
    // Input larger than the configured payload limit must be split
    // into multiple requests on line protocol boundaries.
    let maker = Arc::new(MockRequestMaker::new());
    let namespace = "orgname_bucketname";
    let data = "m,t=foo f=4\n\
                m,t=bar f=3\n\
                m,t=fooddddddd f=4";

    // expect the data to be broken up into two chunks:
    let expected_requests = vec![
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=foo f=4\nm,t=bar f=3".into(),
        },
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=fooddddddd f=4".into(),
        },
    ];

    // enough to get first two lines, but not last
    let mut client = Client::new_with_maker(Arc::clone(&maker) as _)
        .with_max_request_payload_size_bytes(Some(30));

    let num_bytes = client.write_lp(namespace, data).await.unwrap();

    assert_eq!(expected_requests, maker.requests());
    assert_eq!(num_bytes, 41);
}
#[tokio::test]
async fn test_write_lp_stream() {
    // Each element of the input stream is uploaded as its own request.
    let maker = Arc::new(MockRequestMaker::new());
    let namespace = "orgname_bucketname";
    let sources = futures_util::stream::iter(
        vec!["m,t=foo f=4", "m,t=bar f=3"]
            .into_iter()
            .map(String::from),
    );

    // expect the data to come in two chunks
    let expected_requests = vec![
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=foo f=4".into(),
        },
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=bar f=3".into(),
        },
    ];

    let mut client = Client::new_with_maker(Arc::clone(&maker) as _);
    let num_bytes = client.write_lp_stream(namespace, sources).await.unwrap();

    assert_eq!(expected_requests, maker.requests());
    assert_eq!(num_bytes, 22);
}
/// Record of one write request captured by the mock request maker,
/// used to assert what the client actually sent.
#[derive(Debug, Clone, PartialEq)]
struct MockRequest {
    // Organization the request targeted
    org_id: String,
    // Bucket the request targeted
    bucket_id: String,
    // Line protocol payload of the request
    body: String,
}
/// Test double that records every request made through it instead of
/// performing any network I/O.
#[derive(Debug)]
struct MockRequestMaker {
    // All requests made so far, in order
    requests: Mutex<Vec<MockRequest>>,
}

impl MockRequestMaker {
    fn new() -> Self {
        Self {
            requests: Mutex::new(Vec::new()),
        }
    }

    /// Returns a copy of the requests that were made using this mock
    fn requests(&self) -> Vec<MockRequest> {
        let guard = self.requests.lock().unwrap();
        guard.clone()
    }
}
impl RequestMaker for MockRequestMaker {
    /// Records the request rather than sending it, reporting the body
    /// length as the number of bytes "written".
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>> {
        let recorded = MockRequest {
            org_id,
            bucket_id,
            body,
        };
        let sz = recorded.body.len();
        self.requests.lock().unwrap().push(recorded);

        async move { Ok(sz) }.boxed()
    }
}
}

View File

@ -14,7 +14,7 @@ ffi = ["libc"]
bytes = "1.2"
libc = { version = "0.2", optional = true }
nom = { version = "7", default-features = false, features = ["std"] }
smallvec = { version = "1.9.0", features = ["union"] }
smallvec = { version = "1.10.0", features = ["union"] }
snafu = "0.7"
observability_deps = { path = "../observability_deps" }
workspace-hack = { path = "../workspace-hack"}

View File

@ -529,7 +529,7 @@ pub fn parse_lines(input: &str) -> impl Iterator<Item = Result<ParsedLine<'_>>>
/// logic duplication for scanning fields, duplicating it also means
/// we can be more sure of the compatibility of the rust parser and
/// the canonical Go parser.
fn split_lines(input: &str) -> impl Iterator<Item = &str> {
pub fn split_lines(input: &str) -> impl Iterator<Item = &str> {
// NB: This is ported as closely as possibly from the original Go code:
let mut quoted = false;
let mut fields = false;

View File

@ -4,8 +4,8 @@ version = "0.1.0"
edition = "2021"
[dependencies]
sqlparser = "0.24.0"
snafu = "0.7.1"
sqlparser = "0.25.0"
snafu = "0.7.2"
generated_types = { path = "../generated_types" }
workspace-hack = { path = "../workspace-hack"}

View File

@ -24,7 +24,7 @@ iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
mutable_batch = { path = "../mutable_batch"}
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }
@ -45,6 +45,7 @@ write_buffer = { path = "../write_buffer" }
write_summary = { path = "../write_summary" }
tokio-util = { version = "0.7.4" }
trace = { path = "../trace" }
rand = "0.8.5"
[dev-dependencies]
assert_matches = "1.5.0"
@ -52,4 +53,4 @@ bitflags = {version = "1.3.2"}
once_cell = "1"
paste = "1.0.9"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
tokio-stream = {version = "0.1.10", default_features = false }
tokio-stream = {version = "0.1.11", default_features = false }

View File

@ -18,7 +18,7 @@ use crate::{data::partition::PersistingBatch, query::QueryableBatch};
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
pub(crate) enum Error {
#[snafu(display("Error while building logical plan for Ingester's compaction"))]
LogicalPlan {
source: iox_query::frontend::reorg::Error,
@ -86,11 +86,8 @@ pub(crate) async fn compact_persisting_batch(
namespace_id: i64,
partition_info: &PartitionInfo,
batch: Arc<PersistingBatch>,
) -> Result<Option<CompactedStream>> {
// Nothing to compact
if batch.data.data.is_empty() {
return Ok(None);
}
) -> Result<CompactedStream> {
assert!(!batch.data.data.is_empty());
let namespace_name = &partition_info.namespace_name;
let table_name = &partition_info.table_name;
@ -141,11 +138,11 @@ pub(crate) async fn compact_persisting_batch(
sort_key: Some(metadata_sort_key),
};
Ok(Some(CompactedStream {
Ok(CompactedStream {
stream,
iox_metadata,
sort_key_update,
}))
})
}
/// Compact a given Queryable Batch
@ -192,8 +189,8 @@ mod tests {
create_batches_with_influxtype_same_columns_different_type,
create_one_record_batch_with_influxtype_duplicates,
create_one_record_batch_with_influxtype_no_duplicates,
create_one_row_record_batch_with_influxtype, create_tombstone, make_meta,
make_persisting_batch, make_queryable_batch, make_queryable_batch_with_deletes,
create_one_row_record_batch_with_influxtype, make_meta, make_persisting_batch,
make_queryable_batch,
};
// this test was added to guard against https://github.com/influxdata/influxdb_iox/issues/3782
@ -226,7 +223,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -254,7 +250,6 @@ mod tests {
let CompactedStream { stream, .. } =
compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -297,7 +292,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -328,7 +322,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -394,7 +387,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -426,7 +418,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -494,7 +485,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -527,7 +517,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -595,7 +584,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -629,7 +617,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -700,7 +687,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -739,7 +725,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -825,54 +810,6 @@ mod tests {
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_one_batch_no_dupilcates_with_deletes() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
// verify PK
let schema = compact_batch.schema();
let pk = schema.primary_key();
let expected_pk = vec!["tag1", "time"];
assert_eq!(expected_pk, pk);
let sort_key = compute_sort_key(
&schema,
compact_batch.data.iter().map(|sb| sb.data.as_ref()),
);
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
// compact
let exc = Executor::new(1);
let stream = compact(&exc, compact_batch, sort_key).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify no empty record batches - bug #3782
assert_eq!(output_batches.len(), 2);
assert_eq!(output_batches[0].num_rows(), 1);
assert_eq!(output_batches[1].num_rows(), 1);
// verify compacted data
// row with "tag1=UT" no longer available
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_one_batch_with_duplicates() {
// create input data
@ -1019,23 +956,12 @@ mod tests {
}
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_deletes(
) {
async fn test_compact_many_batches_different_columns_different_order_with_duplicates() {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
)];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let compact_batch = make_queryable_batch("test_table", 0, 1, batches);
// verify PK
let schema = compact_batch.schema();
@ -1058,7 +984,6 @@ mod tests {
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") are also removed
// CORRECT RESULT
let expected = vec![
"+-----------+------+------+--------------------------------+",
@ -1067,73 +992,15 @@ mod tests {
"| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
"| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 1000 | | CT | 1970-01-01T00:00:00.000001Z |",
"| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
"| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"+-----------+------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes(
) {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![
create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
),
create_tombstone(
1, 1, 1, 101, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag1!=MT", // delete predicate
),
];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
// verify PK
let schema = compact_batch.schema();
let pk = schema.primary_key();
let expected_pk = vec!["tag1", "tag2", "time"];
assert_eq!(expected_pk, pk);
let sort_key = compute_sort_key(
&schema,
compact_batch.data.iter().map(|sb| sb.data.as_ref()),
);
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
// compact
let exc = Executor::new(1);
let stream = compact(&exc, compact_batch, sort_key).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") and ("tag1!=MT") are also removed
let expected = vec![
"+-----------+------+------+--------------------------------+",
"| field_int | tag1 | tag2 | time |",
"+-----------+------+------+--------------------------------+",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |",
"+-----------+------+------+--------------------------------+",
];
@ -1142,31 +1009,12 @@ mod tests {
// BUG
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes_2(
) {
async fn test_compact_many_batches_different_columns_different_order_with_duplicates2() {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![
create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
),
create_tombstone(
1, 1, 1, 101, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag1=MT", // delete predicate
),
];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let compact_batch = make_queryable_batch("test_table", 0, 1, batches);
// verify PK
let schema = compact_batch.schema();
@ -1189,29 +1037,22 @@ mod tests {
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") and ("tag1=MT") are also removed
// CORRECT RESULT
// let expected = vec![
// "+-----------+------+------+--------------------------------+",
// "| field_int | tag1 | tag2 | time |",
// "+-----------+------+------+--------------------------------+",
// "| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
// "| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
// "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
// "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
// "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
// "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
// "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
// "+-----------+------+------+--------------------------------+",
// ];
// current WRONMG result: "tag1 is null" is also eliminated
let expected = vec![
"+-----------+------+------+--------------------------------+",
"| field_int | tag1 | tag2 | time |",
"+-----------+------+------+--------------------------------+",
"| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
"| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 1000 | | CT | 1970-01-01T00:00:00.000001Z |",
"| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
"| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |",
"+-----------+------+------+--------------------------------+",
];

View File

@ -1,15 +1,12 @@
//! Data for the lifecycle of the Ingester
use std::{collections::BTreeMap, pin::Pin, sync::Arc};
use std::{collections::BTreeMap, sync::Arc};
use arrow::{error::ArrowError, record_batch::RecordBatch};
use arrow_util::optimize::{optimize_record_batch, optimize_schema};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{PartitionId, SequenceNumber, ShardId, ShardIndex};
use datafusion::physical_plan::SendableRecordBatchStream;
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId};
use dml::DmlOperation;
use futures::{Stream, StreamExt};
use iox_catalog::interface::{get_table_schema_by_id, Catalog};
use iox_query::exec::Executor;
use iox_time::SystemProvider;
@ -25,16 +22,12 @@ use crate::{
lifecycle::LifecycleHandle,
};
pub mod namespace;
pub(crate) mod namespace;
pub mod partition;
mod query_dedup;
pub mod shard;
pub mod table;
pub(crate) mod shard;
pub(crate) mod table;
use self::{
partition::{resolver::PartitionProvider, PartitionStatus},
shard::ShardData,
};
use self::{partition::resolver::PartitionProvider, shard::ShardData, table::TableName};
#[cfg(test)]
mod triggers;
@ -51,9 +44,6 @@ pub enum Error {
#[snafu(display("Table {} not found in buffer", table_name))]
TableNotFound { table_name: String },
#[snafu(display("Table must be specified in delete"))]
TableNotPresent,
#[snafu(display("Error accessing catalog: {}", source))]
Catalog {
source: iox_catalog::interface::Error,
@ -186,7 +176,7 @@ impl IngesterData {
.get(&shard_id)
.context(ShardNotFoundSnafu { shard_id })?;
shard_data
.buffer_operation(dml_operation, &self.catalog, lifecycle_handle, &self.exec)
.buffer_operation(dml_operation, &self.catalog, lifecycle_handle)
.await
}
@ -220,7 +210,13 @@ impl IngesterData {
#[async_trait]
pub trait Persister: Send + Sync + 'static {
/// Persits the partition ID. Will retry forever until it succeeds.
async fn persist(&self, partition_id: PartitionId);
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
);
/// Updates the shard's `min_unpersisted_sequence_number` in the catalog.
/// This number represents the minimum that might be unpersisted, which is the
@ -235,7 +231,69 @@ pub trait Persister: Send + Sync + 'static {
#[async_trait]
impl Persister for IngesterData {
async fn persist(&self, partition_id: PartitionId) {
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
) {
// lookup the state from the ingester data. If something isn't found,
// it's unexpected. Crash so someone can take a look.
let shard_data = self
.shards
.get(&shard_id)
.unwrap_or_else(|| panic!("shard state for {shard_id} not in ingester data"));
let namespace = shard_data
.namespace_by_id(namespace_id)
.unwrap_or_else(|| panic!("namespace {namespace_id} not in shard {shard_id} state"));
let partition_key;
let batch;
{
let table_data = namespace.table_id(table_id).unwrap_or_else(|| {
panic!("table {table_id} in namespace {namespace_id} not in shard {shard_id} state")
});
let mut guard = table_data.write().await;
let partition = guard.get_partition(partition_id).unwrap_or_else(|| {
panic!(
"partition {partition_id} in table {table_id} in namespace {namespace_id} not in shard {shard_id} state"
)
});
partition_key = partition.partition_key().clone();
batch = partition.snapshot_to_persisting_batch();
};
debug!(%shard_id, %namespace_id, %table_id, %partition_id, %partition_key, "persisting partition");
// Check if there is any data to persist.
let batch = match batch {
Some(v) if !v.data.data.is_empty() => v,
_ => {
warn!(
%shard_id,
%namespace_id,
%table_id,
%partition_id,
%partition_key,
"partition marked for persistence contains no data"
);
return;
}
};
// lookup column IDs from catalog
// TODO: this can be removed once the ingester uses column IDs internally as well
let table_schema = Backoff::new(&self.backoff_config)
.retry_all_errors("get table schema", || async {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.as_mut()).await
})
.await
.expect("retry forever");
// lookup the partition_info from the catalog
let partition_info = Backoff::new(&self.backoff_config)
.retry_all_errors("get partition_info_by_id", || async {
@ -243,217 +301,159 @@ impl Persister for IngesterData {
repos.partitions().partition_info_by_id(partition_id).await
})
.await
.expect("retry forever");
.expect("retry forever").unwrap_or_else(|| panic!("partition {partition_id} in table {table_id} in namespace {namespace_id} in shard {shard_id} has no partition info in catalog"));
// lookup the state from the ingester data. If something isn't found, it's unexpected. Crash
// so someone can take a look.
let partition_info = partition_info
.unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id));
let shard_data = self
.shards
.get(&partition_info.partition.shard_id)
.unwrap_or_else(|| {
panic!(
"shard state for {} not in ingester data",
partition_info.partition.shard_id
)
}); //{
let namespace = shard_data
.namespace(&partition_info.namespace_name)
.unwrap_or_else(|| {
panic!(
"namespace {} not in shard {} state",
partition_info.namespace_name, partition_info.partition.shard_id
)
});
debug!(?partition_id, ?partition_info, "persisting partition");
// do the CPU intensive work of compaction, de-duplication and sorting
let CompactedStream {
stream: record_stream,
iox_metadata,
sort_key_update,
} = compact_persisting_batch(
Arc::new(SystemProvider::new()),
&self.exec,
namespace.namespace_id().get(),
&partition_info,
Arc::clone(&batch),
)
.await
.expect("unable to compact persisting batch");
// lookup column IDs from catalog
// TODO: this can be removed once the ingester uses column IDs internally as well
let table_schema = Backoff::new(&self.backoff_config)
.retry_all_errors("get table schema", || async {
let mut repos = self.catalog.repositories().await;
let table = repos
.tables()
.get_by_namespace_and_name(namespace.namespace_id(), &partition_info.table_name)
.await?
.expect("table not found in catalog");
get_table_schema_by_id(table.id, repos.as_mut()).await
})
// Save the compacted data to a parquet file in object storage.
//
// This call retries until it completes.
let (md, file_size) = self
.store
.upload(record_stream, &iox_metadata)
.await
.expect("retry forever");
.expect("unexpected fatal persist error");
let persisting_batch = namespace
.snapshot_to_persisting(
&partition_info.table_name,
&partition_info.partition.partition_key,
)
.await;
if let Some(persisting_batch) = persisting_batch {
// do the CPU intensive work of compaction, de-duplication and sorting
let compacted_stream = match compact_persisting_batch(
Arc::new(SystemProvider::new()),
&self.exec,
namespace.namespace_id().get(),
&partition_info,
Arc::clone(&persisting_batch),
)
.await
{
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to compact persisting batch with error: {:?}", e);
}
Ok(Some(r)) => r,
Ok(None) => {
warn!("persist called with no data");
return;
}
};
let CompactedStream {
stream: record_stream,
iox_metadata,
sort_key_update,
} = compacted_stream;
// Save the compacted data to a parquet file in object storage.
//
// This call retries until it completes.
let (md, file_size) = self
.store
.upload(record_stream, &iox_metadata)
.await
.expect("unexpected fatal persist error");
// Update the sort key in the catalog if there are
// additional columns BEFORE adding parquet file to the
// catalog. If the order is reversed, the querier or
// compactor may see a parquet file with an inconsistent
// sort key. https://github.com/influxdata/influxdb_iox/issues/5090
if let Some(new_sort_key) = sort_key_update {
let sort_key = new_sort_key.to_columns().collect::<Vec<_>>();
Backoff::new(&self.backoff_config)
.retry_all_errors("update_sort_key", || async {
let mut repos = self.catalog.repositories().await;
let _partition = repos
.partitions()
.update_sort_key(partition_id, &sort_key)
.await?;
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
debug!(
?partition_id,
table = partition_info.table_name,
?new_sort_key,
"adjusted sort key during batch compact & persist"
);
}
// Add the parquet file to the catalog until succeed
let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| {
table_schema.columns.get(name).expect("Unknown column").id
});
// Assert partitions are persisted in-order.
//
// It is an invariant that partitions are persisted in order so that
// both the per-shard, and per-partition watermarks are correctly
// advanced and accurate.
if let Some(last_persist) = partition_info.partition.persisted_sequence_number {
assert!(
parquet_file.max_sequence_number > last_persist,
"out of order partition persistence, persisting {}, previously persisted {}",
parquet_file.max_sequence_number.get(),
last_persist.get(),
);
}
// Add the parquet file to the catalog.
//
// This has the effect of allowing the queriers to "discover" the
// parquet file by polling / querying the catalog.
// Update the sort key in the catalog if there are
// additional columns BEFORE adding parquet file to the
// catalog. If the order is reversed, the querier or
// compactor may see a parquet file with an inconsistent
// sort key. https://github.com/influxdata/influxdb_iox/issues/5090
if let Some(new_sort_key) = sort_key_update {
let sort_key = new_sort_key.to_columns().collect::<Vec<_>>();
Backoff::new(&self.backoff_config)
.retry_all_errors("add parquet file to catalog", || async {
.retry_all_errors("update_sort_key", || async {
let mut repos = self.catalog.repositories().await;
let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?;
debug!(
?partition_id,
table_id=?parquet_file.table_id,
parquet_file_id=?parquet_file.id,
table_name=%iox_metadata.table_name,
"parquet file written to catalog"
);
let _partition = repos
.partitions()
.update_sort_key(partition_id, &sort_key)
.await?;
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
// Update the per-partition persistence watermark, so that new
// ingester instances skip the just-persisted ops during replay.
//
// This could be transactional with the above parquet insert to
// maintain catalog consistency, though in practice it is an
// unnecessary overhead - the system can tolerate replaying the ops
// that lead to this parquet file being generated, and tolerate
// creating a parquet file containing duplicate data (remedied by
// compaction).
//
// This means it is possible to observe a parquet file with a
// max_persisted_sequence_number >
// partition.persisted_sequence_number, either in-between these
// catalog updates, or for however long it takes a crashed ingester
// to restart and replay the ops, and re-persist a file containing
// the same (or subset of) data.
//
// The above is also true of the per-shard persist marker that
// governs the ingester's replay start point, which is
// non-transactionally updated after all partitions have persisted.
Backoff::new(&self.backoff_config)
.retry_all_errors("set partition persist marker", || async {
self.catalog
.repositories()
.await
.partitions()
.update_persisted_sequence_number(
parquet_file.partition_id,
parquet_file.max_sequence_number,
)
.await
})
.await
.expect("retry forever");
// Record metrics
let attributes = Attributes::from([(
"shard_id",
format!("{}", partition_info.partition.shard_id).into(),
)]);
self.persisted_file_size_bytes
.recorder(attributes)
.record(file_size as u64);
// and remove the persisted data from memory
namespace
.mark_persisted(
&partition_info.table_name,
&partition_info.partition.partition_key,
iox_metadata.max_sequence_number,
)
.await;
debug!(
?partition_id,
table_name=%partition_info.table_name,
partition_key=%partition_info.partition.partition_key,
max_sequence_number=%iox_metadata.max_sequence_number.get(),
"marked partition as persisted"
table = partition_info.table_name,
?new_sort_key,
"adjusted sort key during batch compact & persist"
);
}
// Add the parquet file to the catalog until succeed
let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| {
table_schema.columns.get(name).expect("Unknown column").id
});
// Assert partitions are persisted in-order.
//
// It is an invariant that partitions are persisted in order so that
// both the per-shard, and per-partition watermarks are correctly
// advanced and accurate.
if let Some(last_persist) = partition_info.partition.persisted_sequence_number {
assert!(
parquet_file.max_sequence_number > last_persist,
"out of order partition persistence, persisting {}, previously persisted {}",
parquet_file.max_sequence_number.get(),
last_persist.get(),
);
}
// Add the parquet file to the catalog.
//
// This has the effect of allowing the queriers to "discover" the
// parquet file by polling / querying the catalog.
Backoff::new(&self.backoff_config)
.retry_all_errors("add parquet file to catalog", || async {
let mut repos = self.catalog.repositories().await;
let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?;
debug!(
?partition_id,
table_id=?parquet_file.table_id,
parquet_file_id=?parquet_file.id,
table_name=%iox_metadata.table_name,
"parquet file written to catalog"
);
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
// Update the per-partition persistence watermark, so that new
// ingester instances skip the just-persisted ops during replay.
//
// This could be transactional with the above parquet insert to
// maintain catalog consistency, though in practice it is an
// unnecessary overhead - the system can tolerate replaying the ops
// that lead to this parquet file being generated, and tolerate
// creating a parquet file containing duplicate data (remedied by
// compaction).
//
// This means it is possible to observe a parquet file with a
// max_persisted_sequence_number >
// partition.persisted_sequence_number, either in-between these
// catalog updates, or for however long it takes a crashed ingester
// to restart and replay the ops, and re-persist a file containing
// the same (or subset of) data.
//
// The above is also true of the per-shard persist marker that
// governs the ingester's replay start point, which is
// non-transactionally updated after all partitions have persisted.
Backoff::new(&self.backoff_config)
.retry_all_errors("set partition persist marker", || async {
self.catalog
.repositories()
.await
.partitions()
.update_persisted_sequence_number(
parquet_file.partition_id,
parquet_file.max_sequence_number,
)
.await
})
.await
.expect("retry forever");
// Record metrics
let attributes = Attributes::from([(
"shard_id",
format!("{}", partition_info.partition.shard_id).into(),
)]);
self.persisted_file_size_bytes
.recorder(attributes)
.record(file_size as u64);
// and remove the persisted data from memory
let table_name = TableName::from(&partition_info.table_name);
namespace
.mark_persisted(
&table_name,
&partition_info.partition.partition_key,
iox_metadata.max_sequence_number,
)
.await;
debug!(
?partition_id,
%table_name,
partition_key=%partition_info.partition.partition_key,
max_sequence_number=%iox_metadata.max_sequence_number.get(),
"marked partition as persisted"
);
}
async fn update_min_unpersisted_sequence_number(
@ -475,172 +475,24 @@ impl Persister for IngesterData {
}
}
/// Stream of snapshots.
///
/// Every snapshot is a dedicated [`SendableRecordBatchStream`].
pub(crate) type SnapshotStream =
Pin<Box<dyn Stream<Item = Result<SendableRecordBatchStream, ArrowError>> + Send>>;
/// Response data for a single partition.
pub(crate) struct IngesterQueryPartition {
/// Stream of snapshots.
snapshots: SnapshotStream,
/// Partition ID.
id: PartitionId,
/// Partition persistence status.
status: PartitionStatus,
}
impl std::fmt::Debug for IngesterQueryPartition {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("IngesterQueryPartition")
.field("snapshots", &"<SNAPSHOT STREAM>")
.field("id", &self.id)
.field("status", &self.status)
.finish()
}
}
impl IngesterQueryPartition {
pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self {
Self {
snapshots,
id,
status,
}
}
}
/// Stream of partitions in this response.
pub(crate) type IngesterQueryPartitionStream =
Pin<Box<dyn Stream<Item = Result<IngesterQueryPartition, ArrowError>> + Send>>;
/// Response streams for querier<>ingester requests.
///
/// The data structure is constructed to allow lazy/streaming data generation. For easier
/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method.
pub struct IngesterQueryResponse {
/// Stream of partitions.
partitions: IngesterQueryPartitionStream,
}
impl std::fmt::Debug for IngesterQueryResponse {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("IngesterQueryResponse")
.field("partitions", &"<PARTITION STREAM>")
.finish()
}
}
impl IngesterQueryResponse {
/// Make a response
pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self {
Self { partitions }
}
/// Flattens the data according to the wire protocol.
pub fn flatten(self) -> FlatIngesterQueryResponseStream {
self.partitions
.flat_map(|partition_res| match partition_res {
Ok(partition) => {
let head = futures::stream::once(async move {
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: partition.id,
status: partition.status,
})
});
let tail = partition
.snapshots
.flat_map(|snapshot_res| match snapshot_res {
Ok(snapshot) => {
let schema = Arc::new(optimize_schema(&snapshot.schema()));
let schema_captured = Arc::clone(&schema);
let head = futures::stream::once(async {
Ok(FlatIngesterQueryResponse::StartSnapshot {
schema: schema_captured,
})
});
let tail = snapshot.map(move |batch_res| match batch_res {
Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch {
batch: optimize_record_batch(&batch, Arc::clone(&schema))?,
}),
Err(e) => Err(e),
});
head.chain(tail).boxed()
}
Err(e) => futures::stream::once(async { Err(e) }).boxed(),
});
head.chain(tail).boxed()
}
Err(e) => futures::stream::once(async { Err(e) }).boxed(),
})
.boxed()
}
}
/// Flattened version of [`IngesterQueryResponse`].
pub(crate) type FlatIngesterQueryResponseStream =
Pin<Box<dyn Stream<Item = Result<FlatIngesterQueryResponse, ArrowError>> + Send>>;
/// Element within the flat wire protocol.
#[derive(Debug, PartialEq)]
pub enum FlatIngesterQueryResponse {
/// Start a new partition.
StartPartition {
/// Partition ID.
partition_id: PartitionId,
/// Partition persistence status.
status: PartitionStatus,
},
/// Start a new snapshot.
///
/// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition)
/// message.
StartSnapshot {
/// Snapshot schema.
schema: Arc<arrow::datatypes::Schema>,
},
/// Add a record batch to the snapshot that was announced by the last
/// [`StartSnapshot`](Self::StartSnapshot) message.
RecordBatch {
/// Record batch.
batch: RecordBatch,
},
}
#[cfg(test)]
mod tests {
use std::{
ops::DerefMut,
sync::Arc,
task::{Context, Poll},
time::Duration,
};
use std::{ops::DerefMut, sync::Arc, time::Duration};
use arrow::datatypes::SchemaRef;
use assert_matches::assert_matches;
use data_types::{
ColumnId, ColumnSet, CompactionLevel, DeletePredicate, NamespaceSchema, NonEmptyString,
ParquetFileParams, Sequence, Timestamp, TimestampRange,
};
use datafusion::physical_plan::RecordBatchStream;
use dml::{DmlDelete, DmlMeta, DmlWrite};
use futures::TryStreamExt;
use iox_catalog::{mem::MemCatalog, validate_or_insert_schema};
use iox_time::Time;
use metric::{MetricObserver, Observation};
use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch};
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
use schema::selection::Selection;
use uuid::Uuid;
use super::*;
@ -804,17 +656,20 @@ mod tests {
// limits)
assert!(!should_pause);
let partition_id = {
let (table_id, partition_id) = {
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let mem_table = n.table_data("mem").unwrap();
assert!(n.table_data("mem").is_some());
let n = sd.namespace(&"foo".into()).unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
assert!(n.table_data(&"mem".into()).is_some());
let mem_table = mem_table.write().await;
let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap();
p.id()
let p = mem_table
.get_partition_by_key(&"1970-01-01".into())
.unwrap();
(mem_table.table_id(), p.partition_id())
};
data.persist(partition_id).await;
data.persist(shard1.id, namespace.id, table_id, partition_id)
.await;
// verify that a file got put into object store
let file_paths: Vec<_> = object_store
@ -945,17 +800,20 @@ mod tests {
assert_progress(&data, shard_index, expected_progress).await;
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let n = sd.namespace(&"foo".into()).unwrap();
let partition_id;
let table_id;
{
let mem_table = n.table_data("mem").unwrap();
assert!(n.table_data("cpu").is_some());
let mem_table = mem_table.write().await;
let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
assert!(n.table_data(&"cpu".into()).is_some());
let mem_table = mem_table.write().await;
table_id = mem_table.table_id();
partition_id = p.id();
let p = mem_table
.get_partition_by_key(&"1970-01-01".into())
.unwrap();
partition_id = p.partition_id();
}
{
// verify the partition doesn't have a sort key before any data has been persisted
@ -969,7 +827,8 @@ mod tests {
assert!(partition_info.partition.sort_key.is_empty());
}
data.persist(partition_id).await;
data.persist(shard1.id, namespace.id, table_id, partition_id)
.await;
// verify that a file got put into object store
let file_paths: Vec<_> = object_store
@ -1061,7 +920,7 @@ mod tests {
.unwrap();
assert_eq!(partition_info.partition.sort_key, vec!["time"]);
let mem_table = n.table_data("mem").unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
let mem_table = mem_table.read().await;
// verify that the parquet_max_sequence_number got updated
@ -1177,7 +1036,7 @@ mod tests {
// Get the namespace
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let n = sd.namespace(&"foo".into()).unwrap();
let expected_progress = ShardProgress::new().with_buffered(SequenceNumber::new(1));
assert_progress(&data, shard_index, expected_progress).await;
@ -1336,23 +1195,28 @@ mod tests {
Arc::clone(&metrics),
Arc::new(SystemProvider::new()),
);
let exec = Executor::new(1);
let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog)));
let data = NamespaceData::new(namespace.id, shard.id, partition_provider, &*metrics);
let data = NamespaceData::new(
namespace.id,
"foo".into(),
shard.id,
partition_provider,
&*metrics,
);
// w1 should be ignored because the per-partition replay offset is set
// to 1 already, so it shouldn't be buffered and the buffer should
// remain empty.
let should_pause = data
.buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle(), &exec)
.buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle())
.await
.unwrap();
{
let table_data = data.table_data("mem").unwrap();
let table_data = data.table_data(&"mem".into()).unwrap();
let table = table_data.read().await;
let p = table.partition_data.get(&"1970-01-01".into()).unwrap();
let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap();
assert_eq!(
p.max_persisted_sequence_number(),
Some(SequenceNumber::new(1))
@ -1362,13 +1226,13 @@ mod tests {
assert!(!should_pause);
// w2 should be in the buffer
data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle(), &exec)
data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle())
.await
.unwrap();
let table_data = data.table_data("mem").unwrap();
let table_data = data.table_data(&"mem".into()).unwrap();
let table = table_data.read().await;
let partition = table.partition_data.get(&"1970-01-01".into()).unwrap();
let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap();
assert_eq!(
partition.data.buffer.as_ref().unwrap().min_sequence_number,
SequenceNumber::new(2)
@ -1454,19 +1318,6 @@ mod tests {
.await
.unwrap();
assert_eq!(
data.shard(shard1.id)
.unwrap()
.namespace(&namespace.name)
.unwrap()
.table_data("mem")
.unwrap()
.read()
.await
.tombstone_max_sequence_number(),
None,
);
let predicate = DeletePredicate {
range: TimestampRange::new(1, 2),
exprs: vec![],
@ -1485,19 +1336,6 @@ mod tests {
data.buffer_operation(shard1.id, DmlOperation::Delete(d1), &manager.handle())
.await
.unwrap();
assert_eq!(
data.shard(shard1.id)
.unwrap()
.namespace(&namespace.name)
.unwrap()
.table_data("mem")
.unwrap()
.read()
.await
.tombstone_max_sequence_number(),
Some(SequenceNumber::new(2)),
);
}
/// Verifies that the progress in data is the same as expected_progress
@ -1513,132 +1351,4 @@ mod tests {
assert_eq!(progresses, expected_progresses);
}
#[tokio::test]
async fn test_ingester_query_response_flatten() {
let batch_1_1 = lp_to_batch("table x=1 0");
let batch_1_2 = lp_to_batch("table x=2 1");
let batch_2 = lp_to_batch("table y=1 10");
let batch_3 = lp_to_batch("table z=1 10");
let schema_1 = batch_1_1.schema();
let schema_2 = batch_2.schema();
let schema_3 = batch_3.schema();
let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([
Ok(IngesterQueryPartition::new(
Box::pin(futures::stream::iter([
Ok(Box::pin(TestRecordBatchStream::new(
vec![
Ok(batch_1_1.clone()),
Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
Ok(batch_1_2.clone()),
],
Arc::clone(&schema_1),
)) as _),
Err(ArrowError::InvalidArgumentError("invalid arg".into())),
Ok(Box::pin(TestRecordBatchStream::new(
vec![Ok(batch_2.clone())],
Arc::clone(&schema_2),
)) as _),
Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _),
])),
PartitionId::new(2),
PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: Some(SequenceNumber::new(1)),
},
)),
Err(ArrowError::IoError("some io error".into())),
Ok(IngesterQueryPartition::new(
Box::pin(futures::stream::iter([])),
PartitionId::new(1),
PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
)),
])));
let actual: Vec<_> = response.flatten().collect().await;
let expected = vec![
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: PartitionId::new(2),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: Some(SequenceNumber::new(1)),
},
}),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }),
Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }),
Err(ArrowError::InvalidArgumentError("invalid arg".into())),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }),
Err(ArrowError::IoError("some io error".into())),
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
];
assert_eq!(actual.len(), expected.len());
for (actual, expected) in actual.into_iter().zip(expected) {
match (actual, expected) {
(Ok(actual), Ok(expected)) => {
assert_eq!(actual, expected);
}
(Err(_), Err(_)) => {
// cannot compare `ArrowError`, but it's unlikely that someone changed the error
}
(Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"),
(Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"),
}
}
}
fn lp_to_batch(lp: &str) -> RecordBatch {
lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap()
}
pub struct TestRecordBatchStream {
schema: SchemaRef,
batches: Vec<Result<RecordBatch, ArrowError>>,
}
impl TestRecordBatchStream {
pub fn new(batches: Vec<Result<RecordBatch, ArrowError>>, schema: SchemaRef) -> Self {
Self { schema, batches }
}
}
impl RecordBatchStream for TestRecordBatchStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
}
impl futures::Stream for TestRecordBatchStream {
type Item = Result<RecordBatch, ArrowError>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.batches.is_empty() {
Poll::Ready(None)
} else {
Poll::Ready(Some(self.batches.remove(0)))
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.batches.len(), Some(self.batches.len()))
}
}
}

View File

@ -1,36 +1,91 @@
//! Namespace level data buffer structures.
use std::{
collections::{btree_map::Entry, BTreeMap},
sync::Arc,
};
use std::{collections::HashMap, sync::Arc};
use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId};
use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId};
use dml::DmlOperation;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use metric::U64Counter;
use observability_deps::tracing::warn;
use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use write_summary::ShardProgress;
#[cfg(test)]
use super::triggers::TestTriggers;
use super::{
partition::{resolver::PartitionProvider, PersistingBatch},
table::TableData,
partition::resolver::PartitionProvider,
table::{TableData, TableName},
};
use crate::lifecycle::LifecycleHandle;
/// A double-referenced map where [`TableData`] can be looked up by name, or ID.
#[derive(Debug, Default)]
struct DoubleRef {
// TODO(4880): this can be removed when IDs are sent over the wire.
by_name: HashMap<TableName, Arc<tokio::sync::RwLock<TableData>>>,
by_id: HashMap<TableId, Arc<tokio::sync::RwLock<TableData>>>,
}
impl DoubleRef {
fn insert(&mut self, t: TableData) -> Arc<tokio::sync::RwLock<TableData>> {
let name = t.table_name().clone();
let id = t.table_id();
let t = Arc::new(tokio::sync::RwLock::new(t));
self.by_name.insert(name, Arc::clone(&t));
self.by_id.insert(id, Arc::clone(&t));
t
}
fn by_name(&self, name: &TableName) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
self.by_name.get(name).map(Arc::clone)
}
fn by_id(&self, id: TableId) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
self.by_id.get(&id).map(Arc::clone)
}
}
/// The string name / identifier of a Namespace.
///
/// A reference-counted, cheap clone-able string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) struct NamespaceName(Arc<str>);
impl<T> From<T> for NamespaceName
where
T: AsRef<str>,
{
fn from(v: T) -> Self {
Self(Arc::from(v.as_ref()))
}
}
impl std::ops::Deref for NamespaceName {
type Target = str;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl std::fmt::Display for NamespaceName {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Data of a Namespace that belongs to a given Shard
#[derive(Debug)]
pub(crate) struct NamespaceData {
namespace_id: NamespaceId,
namespace_name: NamespaceName,
/// The catalog ID of the shard this namespace is being populated from.
shard_id: ShardId,
tables: RwLock<BTreeMap<String, Arc<tokio::sync::RwLock<TableData>>>>,
tables: RwLock<DoubleRef>,
table_count: U64Counter,
/// The resolver of `(shard_id, table_id, partition_key)` to
@ -87,8 +142,9 @@ pub(crate) struct NamespaceData {
impl NamespaceData {
/// Initialize new tables with default partition template of daily
pub fn new(
pub(super) fn new(
namespace_id: NamespaceId,
namespace_name: NamespaceName,
shard_id: ShardId,
partition_provider: Arc<dyn PartitionProvider>,
metrics: &metric::Registry,
@ -102,6 +158,7 @@ impl NamespaceData {
Self {
namespace_id,
namespace_name,
shard_id,
tables: Default::default(),
table_count,
@ -120,7 +177,6 @@ impl NamespaceData {
dml_operation: DmlOperation,
catalog: &Arc<dyn Catalog>,
lifecycle_handle: &dyn LifecycleHandle,
executor: &Executor,
) -> Result<bool, super::Error> {
let sequence_number = dml_operation
.meta()
@ -146,6 +202,7 @@ impl NamespaceData {
.clone();
for (t, b) in write.into_tables() {
let t = TableName::from(t);
let table_data = match self.table_data(&t) {
Some(t) => t,
None => self.insert_table(&t, catalog).await?,
@ -171,19 +228,17 @@ impl NamespaceData {
Ok(pause_writes)
}
DmlOperation::Delete(delete) => {
let table_name = delete.table_name().context(super::TableNotPresentSnafu)?;
let table_data = match self.table_data(table_name) {
Some(t) => t,
None => self.insert_table(table_name, catalog).await?,
};
// Deprecated delete support:
// https://github.com/influxdata/influxdb_iox/issues/5825
warn!(
shard_id=%self.shard_id,
namespace_name=%self.namespace_name,
namespace_id=%self.namespace_id,
table_name=?delete.table_name(),
sequence_number=?delete.meta().sequence(),
"discarding unsupported delete op"
);
let mut table_data = table_data.write().await;
table_data
.buffer_delete(delete.predicate(), sequence_number, &**catalog, executor)
.await?;
// don't pause writes since deletes don't count towards memory limits
Ok(false)
}
}
@ -194,16 +249,16 @@ impl NamespaceData {
#[cfg(test)] // Only used in tests
pub(crate) async fn snapshot(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
) -> Option<(
Vec<Arc<super::partition::SnapshotBatch>>,
Option<Arc<PersistingBatch>>,
Option<Arc<super::partition::PersistingBatch>>,
)> {
if let Some(t) = self.table_data(table_name) {
let mut t = t.write().await;
return t.partition_data.get_mut(partition_key).map(|p| {
return t.get_partition_by_key_mut(partition_key).map(|p| {
p.data
.generate_snapshot()
.expect("snapshot on mutable batch should never fail");
@ -217,17 +272,17 @@ impl NamespaceData {
/// Snapshots the mutable buffer for the partition, which clears it out and then moves all
/// snapshots over to a persisting batch, which is returned. If there is no data to snapshot
/// or persist, None will be returned.
#[cfg(test)] // Only used in tests
pub(crate) async fn snapshot_to_persisting(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
) -> Option<Arc<PersistingBatch>> {
) -> Option<Arc<super::partition::PersistingBatch>> {
if let Some(table_data) = self.table_data(table_name) {
let mut table_data = table_data.write().await;
return table_data
.partition_data
.get_mut(partition_key)
.get_partition_by_key_mut(partition_key)
.and_then(|partition_data| partition_data.snapshot_to_persisting_batch());
}
@ -237,45 +292,55 @@ impl NamespaceData {
/// Gets the buffered table data
pub(crate) fn table_data(
&self,
table_name: &str,
table_name: &TableName,
) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
let t = self.tables.read();
t.get(table_name).cloned()
t.by_name(table_name)
}
/// Return the table data by ID.
pub(crate) fn table_id(
    &self,
    table_id: TableId,
) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
    // The map read guard is a temporary, dropped at the end of this
    // expression; the returned Arc (if any) is an owned clone.
    self.tables.read().by_id(table_id)
}
/// Inserts the table or returns it if it happens to be inserted by some other thread
async fn insert_table(
&self,
table_name: &str,
table_name: &TableName,
catalog: &Arc<dyn Catalog>,
) -> Result<Arc<tokio::sync::RwLock<TableData>>, super::Error> {
let mut repos = catalog.repositories().await;
let info = repos
.tables()
.get_table_persist_info(self.shard_id, self.namespace_id, table_name)
.await
.context(super::CatalogSnafu)?
.context(super::TableNotFoundSnafu { table_name })?;
.ok_or_else(|| super::Error::TableNotFound {
table_name: table_name.to_string(),
})?;
let mut t = self.tables.write();
let data = match t.entry(table_name.to_string()) {
Entry::Vacant(v) => {
let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new(
Ok(match t.by_name(table_name) {
Some(v) => v,
None => {
self.table_count.inc(1);
// Insert the table and then return a ref to it.
t.insert(TableData::new(
info.table_id,
table_name,
table_name.clone(),
self.shard_id,
self.namespace_id,
info.tombstone_max_sequence_number,
Arc::clone(&self.partition_provider),
))));
self.table_count.inc(1);
Arc::clone(v)
))
}
Entry::Occupied(v) => Arc::clone(v.get()),
};
Ok(data)
})
}
/// Walks down the table and partition and clears the persisting batch. The sequence number is
@ -283,13 +348,13 @@ impl NamespaceData {
/// data buffer.
pub(super) async fn mark_persisted(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
sequence_number: SequenceNumber,
) {
if let Some(t) = self.table_data(table_name) {
let mut t = t.write().await;
let partition = t.partition_data.get_mut(partition_key);
let partition = t.get_partition_by_key_mut(partition_key);
if let Some(p) = partition {
p.mark_persisted(sequence_number);
@ -299,7 +364,7 @@ impl NamespaceData {
/// Return progress from this Namespace
pub(super) async fn progress(&self) -> ShardProgress {
let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect();
let tables: Vec<_> = self.tables.read().by_id.values().map(Arc::clone).collect();
// Consolidate progress across partitions.
let mut progress = ShardProgress::new()
@ -323,6 +388,12 @@ impl NamespaceData {
/// Return a reference to the counter metric tracking the number of tables
/// buffered in this namespace.
pub(super) fn table_count(&self) -> &U64Counter {
    &self.table_count
}
/// Returns the [`NamespaceName`] for this namespace.
///
/// Compiled only in test builds (`#[cfg(test)]`).
#[cfg(test)]
pub(crate) fn namespace_name(&self) -> &NamespaceName {
    &self.namespace_name
}
}
/// RAAI struct that sets buffering sequence number on creation and clears it on free
@ -357,3 +428,92 @@ impl<'a> Drop for ScopedSequenceNumber<'a> {
*buffering_sequence_number = None;
}
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use data_types::{PartitionId, ShardIndex};
    use metric::{Attributes, Metric};

    use crate::{
        data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
        lifecycle::mock_handle::MockLifecycleHandle,
        test_util::{make_write_op, populate_catalog},
    };

    use super::*;

    // Fixed identifiers shared by the tests below.
    const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
    const TABLE_NAME: &str = "bananas";
    const NAMESPACE_NAME: &str = "platanos";

    /// Assert that a table buffered in a [`NamespaceData`] becomes reachable
    /// both by name and by catalog ID, and that the table-count metric is
    /// incremented exactly once for it.
    #[tokio::test]
    async fn test_namespace_double_ref() {
        let metrics = Arc::new(metric::Registry::default());
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        // Populate the catalog with the shard / namespace / table
        let (shard_id, ns_id, table_id) =
            populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

        // Configure the mock partition provider to return a partition for this
        // table ID.
        let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
            PartitionData::new(
                PartitionId::new(0),
                PartitionKey::from("banana-split"),
                shard_id,
                ns_id,
                table_id,
                TABLE_NAME.into(),
                // Sort key is known (and absent) - no deferred catalog fetch.
                SortKeyState::Provided(None),
                None,
            ),
        ));

        let ns = NamespaceData::new(
            ns_id,
            NAMESPACE_NAME.into(),
            shard_id,
            partition_provider,
            &*metrics,
        );

        // Assert the namespace name was stored
        assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME);

        // Assert the namespace does not contain the test data
        assert!(ns.table_data(&TABLE_NAME.into()).is_none());
        assert!(ns.table_id(table_id).is_none());

        // Write some test data
        ns.buffer_operation(
            DmlOperation::Write(make_write_op(
                &PartitionKey::from("banana-split"),
                SHARD_INDEX,
                NAMESPACE_NAME,
                0,
                r#"bananas,city=Medford day="sun",temp=55 22"#,
            )),
            &catalog,
            &MockLifecycleHandle::default(),
        )
        .await
        .expect("buffer op should succeed");

        // Both forms of referencing the table should succeed
        assert!(ns.table_data(&TABLE_NAME.into()).is_some());
        assert!(ns.table_id(table_id).is_some());

        // And the table counter metric should increase
        let tables = metrics
            .get_instrument::<Metric<U64Counter>>("ingester_tables_total")
            .expect("failed to read metric")
            .get_observer(&Attributes::from([]))
            .expect("failed to get observer")
            .fetch();
        assert_eq!(tables, 1);
    }
}

View File

@ -3,18 +3,21 @@
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use data_types::{
NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, Tombstone,
};
use iox_query::exec::Executor;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use observability_deps::tracing::*;
use schema::{selection::Selection, sort::SortKey};
use snafu::ResultExt;
use uuid::Uuid;
use write_summary::ShardProgress;
use self::buffer::{BufferBatch, DataBuffer};
use crate::{data::query_dedup::query, query::QueryableBatch};
use self::{
buffer::{BufferBatch, DataBuffer},
resolver::DeferredSortKey,
};
use crate::{querier_handler::PartitionStatus, query::QueryableBatch};
use super::table::TableName;
mod buffer;
pub mod resolver;
@ -28,20 +31,6 @@ pub(crate) struct UnpersistedPartitionData {
pub(crate) partition_status: PartitionStatus,
}
/// Status of a partition that has unpersisted data.
///
/// Note that this structure is specific to a partition (which itself is bound to a table and
/// shard)!
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_copy_implementations)]
pub struct PartitionStatus {
/// Max sequence number persisted
pub parquet_max_sequence_number: Option<SequenceNumber>,
/// Max sequence number for a tombstone
pub tombstone_max_sequence_number: Option<SequenceNumber>,
}
/// PersistingBatch contains all needed info and data for creating
/// a parquet file for given set of SnapshotBatches
#[derive(Debug, PartialEq, Clone)]
@ -132,7 +121,28 @@ impl SnapshotBatch {
}
}
/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard
/// The load state of the [`SortKey`] for a given partition.
#[derive(Debug)]
pub(crate) enum SortKeyState {
    /// The [`SortKey`] has not yet been fetched from the catalog, and will be
    /// lazy loaded (or loaded in the background) by a call to
    /// [`DeferredSortKey::get()`].
    Deferred(DeferredSortKey),
    /// The sort key is known and specified, or known to be absent (`None`).
    Provided(Option<SortKey>),
}
impl SortKeyState {
    /// Resolve the sort key, waiting on the lazy / background catalog fetch
    /// when in the [`Self::Deferred`] state.
    async fn get(&self) -> Option<SortKey> {
        match self {
            Self::Provided(key) => key.clone(),
            Self::Deferred(deferred) => deferred.get().await,
        }
    }
}
/// Data of an IOx Partition of a given Table of a Namespace that belongs to a
/// given Shard
#[derive(Debug)]
pub struct PartitionData {
/// The catalog ID of the partition this buffer is for.
@ -140,12 +150,23 @@ pub struct PartitionData {
/// The string partition key for this partition.
partition_key: PartitionKey,
/// The sort key of this partition.
///
/// This can known, in which case this field will contain a
/// [`SortKeyState::Provided`] with the [`SortKey`], or unknown with a value
/// of [`SortKeyState::Deferred`] causing it to be loaded from the catalog
/// (potentially) in the background or at read time.
///
/// Callers should use [`Self::sort_key()`] to be abstracted away from these
/// fetch details.
sort_key: SortKeyState,
/// The shard, namespace & table IDs for this partition.
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
/// The name of the table this partition is part of.
table_name: Arc<str>,
table_name: TableName,
pub(super) data: DataBuffer,
@ -156,18 +177,21 @@ pub struct PartitionData {
impl PartitionData {
/// Initialize a new partition data buffer
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
id: PartitionId,
partition_key: PartitionKey,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
sort_key: SortKeyState,
max_persisted_sequence_number: Option<SequenceNumber>,
) -> Self {
Self {
id,
partition_key,
sort_key,
shard_id,
namespace_id,
table_id,
@ -209,100 +233,36 @@ impl PartitionData {
sequence_number: SequenceNumber,
mb: MutableBatch,
) -> Result<(), super::Error> {
match &mut self.data.buffer {
let (min_sequence_number, max_sequence_number) = match &mut self.data.buffer {
Some(buf) => {
buf.max_sequence_number = sequence_number.max(buf.max_sequence_number);
buf.data.extend_from(&mb).context(super::BufferWriteSnafu)?;
(buf.min_sequence_number, buf.max_sequence_number)
}
None => {
self.data.buffer = Some(BufferBatch {
min_sequence_number: sequence_number,
max_sequence_number: sequence_number,
data: mb,
})
});
(sequence_number, sequence_number)
}
}
};
trace!(
min_sequence_number=?min_sequence_number,
max_sequence_number=?max_sequence_number,
"buffered write"
);
Ok(())
}
/// Buffers a new tombstone:
/// . All the data in the `buffer` and `snapshots` will be replaced with one
/// tombstone-applied snapshot
/// . The tombstone is only added in the `deletes_during_persisting` if the `persisting`
/// exists
pub(super) async fn buffer_tombstone(&mut self, executor: &Executor, tombstone: Tombstone) {
self.data.add_tombstone(tombstone.clone());
// ----------------------------------------------------------
// First apply the tombstone on all in-memory & non-persisting data
// Make a QueryableBatch for all buffer + snapshots + the given tombstone
let max_sequence_number = tombstone.sequence_number;
let query_batch = match self.data.snapshot_to_queryable_batch(
&self.table_name,
self.id,
Some(tombstone.clone()),
) {
Some(query_batch) if !query_batch.is_empty() => query_batch,
_ => {
// No need to proceed further
return;
}
};
let (min_sequence_number, _) = query_batch.min_max_sequence_numbers();
assert!(min_sequence_number <= max_sequence_number);
// Run query on the QueryableBatch to apply the tombstone.
let stream = match query(executor, Arc::new(query_batch)).await {
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to apply tombstones on snapshots: {:?}", e);
}
Ok(stream) => stream,
};
let record_batches = match datafusion::physical_plan::common::collect(stream).await {
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to collect record batches: {:?}", e);
}
Ok(batches) => batches,
};
// Merge all result record batches into one record batch
// and make a snapshot for it
let snapshot = if !record_batches.is_empty() {
let record_batch =
arrow::compute::concat_batches(&record_batches[0].schema(), &record_batches)
.unwrap_or_else(|e| {
panic!("unable to concat record batches: {:?}", e);
});
let snapshot = SnapshotBatch {
min_sequence_number,
max_sequence_number,
data: Arc::new(record_batch),
};
Some(Arc::new(snapshot))
} else {
None
};
// ----------------------------------------------------------
// Add the tombstone-applied data back in as one snapshot
if let Some(snapshot) = snapshot {
self.data.snapshots.push(snapshot);
}
}
/// Return the progress from this Partition
pub(super) fn progress(&self) -> ShardProgress {
self.data.progress()
}
pub(super) fn id(&self) -> PartitionId {
pub(super) fn partition_id(&self) -> PartitionId {
self.id
}
@ -347,6 +307,13 @@ impl PartitionData {
pub fn namespace_id(&self) -> NamespaceId {
self.namespace_id
}
/// Return the [`SortKey`] for this partition.
///
/// NOTE: this MAY involve querying the catalog with unbounded retries when
/// the key is in the [`SortKeyState::Deferred`] state.
pub async fn sort_key(&self) -> Option<SortKey> {
    self.sort_key.get().await
}
}
#[cfg(test)]
@ -355,7 +322,6 @@ mod tests {
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use super::*;
use crate::test_util::create_tombstone;
#[test]
fn snapshot_buffer_different_but_compatible_schemas() {
@ -366,6 +332,7 @@ mod tests {
NamespaceId::new(42),
TableId::new(1),
"foo".into(),
SortKeyState::Provided(None),
None,
);
@ -401,7 +368,7 @@ mod tests {
// Test writes on a single partition
#[tokio::test]
async fn writes_and_deletes() {
async fn writes() {
// Make a partition with empty DataBuffer
let s_id = 1;
let t_id = 1;
@ -413,9 +380,9 @@ mod tests {
NamespaceId::new(42),
TableId::new(t_id),
"restaurant".into(),
SortKeyState::Provided(None),
None,
);
let exec = Executor::new(1);
// ------------------------------------------
// Fill `buffer`
@ -438,42 +405,8 @@ mod tests {
SequenceNumber::new(2)
);
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// ------------------------------------------
// Delete
// --- seq_num: 3
let ts = create_tombstone(
1, // tombstone id
t_id, // table id
s_id, // shard id
3, // delete's seq_number
0, // min time of data to get deleted
20, // max time of data to get deleted
"day=thu", // delete predicate
);
// one row will get deleted, the other is moved to snapshot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// snapshot only has one row since the other one got deleted
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+--------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+--------+-----+------+--------------------------------+",
"| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |",
"+--------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 3);
// ------------------------------------------
// Fill `buffer`
// --- seq_num: 4
@ -493,50 +426,15 @@ mod tests {
// verify data
assert_eq!(
p.data.buffer.as_ref().unwrap().min_sequence_number,
SequenceNumber::new(4)
SequenceNumber::new(1)
);
assert_eq!(
p.data.buffer.as_ref().unwrap().max_sequence_number,
SequenceNumber::new(5)
);
assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.persisting, None);
// ------------------------------------------
// Delete
// --- seq_num: 6
let ts = create_tombstone(
2, // tombstone id
t_id, // table id
s_id, // shard id
6, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"city=Boston", // delete predicate
);
// two rows will get deleted, one from existing snapshot, one from the buffer being moved
// to snpashot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // one snpashot
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// snapshot only has two rows since the other 2 rows with city=Boston have got deleted
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+---------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+---------+-----+------+--------------------------------+",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"+---------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 6);
assert!(p.data.buffer.is_some());
// ------------------------------------------
// Persisting
@ -545,32 +443,12 @@ mod tests {
// verify data
assert!(p.data.buffer.is_none()); // always empty after issuing persit
assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit
assert_eq!(p.data.deletes_during_persisting().len(), 0); // deletes not happen yet
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
// Delete
// --- seq_num: 7
let ts = create_tombstone(
3, // tombstone id
t_id, // table id
s_id, // shard id
7, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"temp=55", // delete predicate
);
// if a query come while persisting, the row with temp=55 will be deleted before
// data is sent back to Querier
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
// no snapshots because the buffer has no data yet and the
// snapshot was empty too
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.deletes_during_persisting().len(), 1); // tombstone added since data is
// persisting
assert!(p.data.buffer.is_none());
assert_eq!(p.data.snapshots.len(), 0); // no snpashots becasue buffer has not data yet and the
// snapshot was empty too
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
@ -591,7 +469,6 @@ mod tests {
SequenceNumber::new(8)
); // 1 newly added mutable batch of 3 rows of data
assert_eq!(p.data.snapshots.len(), 0); // still empty
assert_eq!(p.data.deletes_during_persisting().len(), 1);
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
@ -600,7 +477,6 @@ mod tests {
// verify data
assert!(p.data.buffer.is_none()); // empty after snapshot
assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer
assert_eq!(p.data.deletes_during_persisting().len(), 1);
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// snapshot has three rows moved from buffer
let data = (*p.data.snapshots[0].data).clone();
@ -616,41 +492,5 @@ mod tests {
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 8);
// ------------------------------------------
// Delete
// --- seq_num: 9
let ts = create_tombstone(
4, // tombstone id
t_id, // table id
s_id, // shard id
9, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"temp=60", // delete predicate
);
// the row with temp=60 will be removed from the snapshot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied
assert_eq!(p.data.deletes_during_persisting().len(), 2); // one more tombstone added make it 2
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// snapshot has only 2 rows because the row with tem=60 was removed
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |",
"| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |",
"+------------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 9);
exec.join().await;
}
}

View File

@ -2,13 +2,15 @@
use std::sync::Arc;
use data_types::{PartitionId, SequenceNumber, ShardId, TableId, Tombstone};
use data_types::{PartitionId, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use snafu::ResultExt;
use uuid::Uuid;
use write_summary::ShardProgress;
use crate::data::table::TableName;
use super::{PersistingBatch, QueryableBatch, SnapshotBatch};
/// Data of an IOx partition split into batches
@ -38,14 +40,6 @@ pub(crate) struct DataBuffer {
/// Buffer of incoming writes
pub(crate) buffer: Option<BufferBatch>,
/// Buffer of tombstones whose time range may overlap with this partition.
/// All tombstones were already applied to corresponding snapshots. This list
/// only keep the ones that come during persisting. The reason
/// we keep them becasue if a query comes, we need to apply these tombstones
/// on the persiting data before sending it to the Querier
/// When the `persiting` is done and removed, this list will get empty, too
deletes_during_persisting: Vec<Tombstone>,
/// Data in `buffer` will be moved to a `snapshot` when one of these happens:
/// . A background persist is called
/// . A read request from Querier
@ -70,14 +64,6 @@ pub(crate) struct DataBuffer {
}
impl DataBuffer {
/// Add a new tombstones into the [`DataBuffer`].
pub(super) fn add_tombstone(&mut self, tombstone: Tombstone) {
// Only keep this tombstone if some data is being persisted
if self.persisting.is_some() {
self.deletes_during_persisting.push(tombstone);
}
}
/// If a [`BufferBatch`] exists, convert it to a [`SnapshotBatch`] and add
/// it to the list of snapshots.
///
@ -109,9 +95,8 @@ impl DataBuffer {
/// Both buffer and snapshots will be empty after this
pub(super) fn snapshot_to_queryable_batch(
&mut self,
table_name: &Arc<str>,
table_name: &TableName,
partition_id: PartitionId,
tombstone: Option<Tombstone>,
) -> Option<QueryableBatch> {
self.generate_snapshot()
.expect("This mutable batch snapshot error should be impossible.");
@ -119,21 +104,11 @@ impl DataBuffer {
let mut data = vec![];
std::mem::swap(&mut data, &mut self.snapshots);
let mut tombstones = vec![];
if let Some(tombstone) = tombstone {
tombstones.push(tombstone);
}
// only produce batch if there is any data
if data.is_empty() {
None
} else {
Some(QueryableBatch::new(
Arc::clone(table_name),
partition_id,
data,
tombstones,
))
Some(QueryableBatch::new(table_name.clone(), partition_id, data))
}
}
@ -164,15 +139,13 @@ impl DataBuffer {
shard_id: ShardId,
table_id: TableId,
partition_id: PartitionId,
table_name: &Arc<str>,
table_name: &TableName,
) -> Option<Arc<PersistingBatch>> {
if self.persisting.is_some() {
panic!("Unable to snapshot while persisting. This is an unexpected state.")
}
if let Some(queryable_batch) =
self.snapshot_to_queryable_batch(table_name, partition_id, None)
{
if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, partition_id) {
let persisting_batch = Arc::new(PersistingBatch {
shard_id,
table_id,
@ -197,12 +170,7 @@ impl DataBuffer {
};
// persisting data
let mut queryable_batch = (*persisting.data).clone();
// Add new tombstones if any
queryable_batch.add_tombstones(&self.deletes_during_persisting);
Some(queryable_batch)
Some((*persisting.data).clone())
}
/// Return the progress in this DataBuffer
@ -239,12 +207,6 @@ impl DataBuffer {
pub(crate) fn mark_persisted(&mut self) {
self.persisting = None;
self.deletes_during_persisting.clear()
}
#[cfg(test)]
pub(super) fn deletes_during_persisting(&self) -> &[Tombstone] {
self.deletes_during_persisting.as_ref()
}
}

View File

@ -1,13 +1,18 @@
use std::{collections::HashMap, sync::Arc};
use std::{collections::HashMap, sync::Arc, time::Duration};
use async_trait::async_trait;
use backoff::BackoffConfig;
use data_types::{
NamespaceId, Partition, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId,
};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use crate::data::partition::PartitionData;
use crate::data::{
partition::{resolver::DeferredSortKey, PartitionData, SortKeyState},
table::TableName,
};
use super::r#trait::PartitionProvider;
@ -43,6 +48,18 @@ struct Entry {
/// Each cache hit _removes_ the entry from the cache - this eliminates the
/// memory overhead for items that were hit. This is the expected (only valid!)
/// usage pattern.
///
/// # Deferred Sort Key Loading
///
/// This cache does NOT cache the [`SortKey`] for each [`PartitionData`], as the
/// sort key can be large and is likely unique per table, and thus not
/// share-able across instances / prohibitively expensive to cache.
///
/// Instead cached instances are returned with a deferred sort key resolver
/// which attempts to fetch the sort key in the background some time after
/// construction.
///
/// [`SortKey`]: schema::sort::SortKey
#[derive(Debug)]
pub(crate) struct PartitionCache<T> {
// The inner delegate called for a cache miss.
@ -59,13 +76,31 @@ pub(crate) struct PartitionCache<T> {
/// a faster search for cache misses.
#[allow(clippy::type_complexity)]
entries: Mutex<HashMap<PartitionKey, HashMap<ShardId, HashMap<TableId, Entry>>>>,
/// Data needed to construct the [`DeferredSortKey`] for cached entries.
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
/// The maximum amount of time a [`DeferredSortKey`] may wait until
/// pre-fetching the sort key in the background.
max_smear: Duration,
}
impl<T> PartitionCache<T> {
/// Initialise a [`PartitionCache`] containing the specified partitions.
///
/// Any cache miss is passed through to `inner`.
pub(crate) fn new<P>(inner: T, partitions: P) -> Self
///
/// Any cache hit returns a [`PartitionData`] configured with a
/// [`SortKeyState::Deferred`] for deferred key loading in the background.
/// The [`DeferredSortKey`] is initialised with the given `catalog`,
/// `backoff_config`, and `max_smear` maximal load wait duration.
pub(crate) fn new<P>(
inner: T,
partitions: P,
max_smear: Duration,
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
) -> Self
where
P: IntoIterator<Item = Partition>,
{
@ -97,6 +132,9 @@ impl<T> PartitionCache<T> {
Self {
entries: Mutex::new(entries),
inner,
catalog,
backoff_config,
max_smear,
}
}
@ -154,7 +192,7 @@ where
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
// Use the cached PartitionKey instead of the caller's partition_key,
// instead preferring to reuse the already-shared Arc<str> in the cache.
@ -171,6 +209,12 @@ where
namespace_id,
table_id,
table_name,
SortKeyState::Deferred(DeferredSortKey::new(
cached.partition_id,
self.max_smear,
Arc::clone(&__self.catalog),
self.backoff_config.clone(),
)),
cached.max_sequence_number,
);
}
@ -186,6 +230,8 @@ where
#[cfg(test)]
mod tests {
use iox_catalog::mem::MemCatalog;
use crate::data::partition::resolver::MockPartitionProvider;
use super::*;
@ -197,6 +243,22 @@ mod tests {
const TABLE_ID: TableId = TableId::new(3);
const TABLE_NAME: &str = "platanos";
/// Construct a [`PartitionCache`] wrapping `inner`, pre-populated with
/// `partitions`.
fn new_cache<P>(
    inner: MockPartitionProvider,
    partitions: P,
) -> PartitionCache<MockPartitionProvider>
where
    P: IntoIterator<Item = Partition>,
{
    PartitionCache::new(
        inner,
        partitions,
        // A max_smear far larger than any test run, so the deferred sort key
        // pre-fetch never fires during the tests.
        Duration::from_secs(10_000_000),
        Arc::new(MemCatalog::new(Arc::new(metric::Registry::default()))),
        BackoffConfig::default(),
    )
}
#[tokio::test]
async fn test_miss() {
let data = PartitionData::new(
@ -206,11 +268,12 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
);
let inner = MockPartitionProvider::default().with_partition(data);
let cache = PartitionCache::new(inner, []);
let cache = new_cache(inner, []);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -221,7 +284,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -238,11 +301,11 @@ mod tests {
shard_id: SHARD_ID,
table_id: TABLE_ID,
partition_key: stored_partition_key.clone(),
sort_key: Default::default(),
sort_key: vec!["dos".to_string(), "bananas".to_string()],
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let callers_partition_key = PartitionKey::from(PARTITION_KEY);
let got = cache
@ -255,7 +318,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -274,7 +337,7 @@ mod tests {
}
#[tokio::test]
async fn test_miss_partition_jey() {
async fn test_miss_partition_key() {
let other_key = PartitionKey::from("test");
let other_key_id = PartitionId::new(99);
let inner = MockPartitionProvider::default().with_partition(PartitionData::new(
@ -284,6 +347,7 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -296,7 +360,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
other_key.clone(),
@ -307,7 +371,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), other_key_id);
assert_eq!(got.partition_id(), other_key_id);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -323,6 +387,7 @@ mod tests {
NAMESPACE_ID,
other_table,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -335,7 +400,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -346,7 +411,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), other_table);
assert_eq!(got.table_name(), TABLE_NAME);
@ -362,6 +427,7 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -374,7 +440,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -385,7 +451,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), other_shard);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);

View File

@ -9,7 +9,10 @@ use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use crate::data::partition::PartitionData;
use crate::data::{
partition::{PartitionData, SortKeyState},
table::TableName,
};
use super::r#trait::PartitionProvider;
@ -55,7 +58,7 @@ impl PartitionProvider for CatalogPartitionResolver {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
debug!(
%partition_key,
@ -78,6 +81,7 @@ impl PartitionProvider for CatalogPartitionResolver {
namespace_id,
table_id,
table_name,
SortKeyState::Provided(p.sort_key()),
p.persisted_sequence_number,
)
}
@ -131,7 +135,7 @@ mod tests {
};
let callers_partition_key = PartitionKey::from(PARTITION_KEY);
let table_name = TABLE_NAME.into();
let table_name = TableName::from(TABLE_NAME);
let resolver = CatalogPartitionResolver::new(Arc::clone(&catalog));
let got = resolver
.get_partition(
@ -139,11 +143,12 @@ mod tests {
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
table_name.clone(),
)
.await;
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(*got.table_name(), *table_name);
assert_eq!(got.sort_key().await, None);
assert_eq!(got.max_persisted_sequence_number(), None);
assert!(got.partition_key.ptr_eq(&callers_partition_key));

View File

@ -1,12 +1,12 @@
//! A mock [`PartitionProvider`] to inject [`PartitionData`] for tests.
use std::{collections::HashMap, sync::Arc};
use std::collections::HashMap;
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use parking_lot::Mutex;
use crate::data::partition::PartitionData;
use crate::data::{partition::PartitionData, table::TableName};
use super::r#trait::PartitionProvider;
@ -58,7 +58,7 @@ impl PartitionProvider for MockPartitionProvider {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
let p = self
.partitions

View File

@ -11,6 +11,9 @@ pub use r#trait::*;
mod catalog;
pub use catalog::*;
mod sort_key;
pub(crate) use sort_key::*;
#[cfg(test)]
mod mock;
#[cfg(test)]

View File

@ -0,0 +1,331 @@
//! An optimised resolver of a partition [`SortKey`].
use std::{sync::Arc, time::Duration};
use backoff::{Backoff, BackoffConfig};
use data_types::PartitionId;
use iox_catalog::interface::Catalog;
use parking_lot::Mutex;
use rand::Rng;
use schema::sort::SortKey;
use tokio::task::JoinHandle;
/// The states of a [`DeferredSortKey`] instance.
#[derive(Debug)]
enum State {
    /// The value has not yet been fetched by the background task.
    Unresolved,
    /// The value was fetched by the background task and is ready to be consumed.
    Resolved(Option<SortKey>),
}
/// A resolver of [`SortKey`] from the catalog for a given partition.
///
/// This implementation combines lazy / deferred loading of the [`SortKey`] from
/// the [`Catalog`], and a background timer that pre-fetches the [`SortKey`]
/// after some random duration of time. Combined, these behaviours smear the
/// [`SortKey`] queries across the allowable time range, avoiding a large number
/// of queries from executing when multiple [`SortKey`] are needed in the system
/// at one point in time.
///
/// If the [`DeferredSortKey`] is dropped and the background task is still
/// incomplete (sleeping / actively fetching the [`SortKey`]) it is aborted
/// immediately. The background task exits once it has successfully fetched the
/// [`SortKey`].
///
/// # Stale Cached Values
///
/// This is effectively a cache that is pre-warmed in the background - this
/// necessitates that the caller can tolerate, or determine, stale values.
#[derive(Debug)]
pub(crate) struct DeferredSortKey {
    /// Shared state container, written by the background task (or an eager
    /// `get()` call) and read by [`DeferredSortKey::get()`].
    value: Arc<Mutex<State>>,
    /// The partition whose sort key is resolved.
    partition_id: PartitionId,
    /// Handle to the background pre-fetch task; aborted on drop.
    handle: JoinHandle<()>,
    /// Retry policy applied when querying the catalog.
    backoff_config: BackoffConfig,
    /// The catalog the sort key is read from.
    catalog: Arc<dyn Catalog>,
}
impl DeferredSortKey {
    /// Construct a [`DeferredSortKey`] instance that fetches the [`SortKey`]
    /// for the specified `partition_id`.
    ///
    /// The background task will wait a uniformly random duration of time
    /// between `[0, max_smear)` before attempting to pre-fetch the [`SortKey`]
    /// from `catalog`.
    ///
    /// NOTE(review): `gen_range` panics when given an empty range, so
    /// `max_smear` must be non-zero - confirm all callers uphold this.
    pub(crate) fn new(
        partition_id: PartitionId,
        max_smear: Duration,
        catalog: Arc<dyn Catalog>,
        backoff_config: BackoffConfig,
    ) -> Self {
        // Init the value container the background thread populates.
        let value = Arc::new(Mutex::new(State::Unresolved));

        // Select random duration from a uniform distribution, up to the
        // configured maximum.
        let wait_for = rand::thread_rng().gen_range(Duration::ZERO..max_smear);

        // Spawn the background task, sleeping for the random duration of time
        // before fetching the sort key.
        let handle = tokio::spawn({
            let value = Arc::clone(&value);
            let catalog = Arc::clone(&catalog);
            let backoff_config = backoff_config.clone();
            async move {
                // Sleep for the random duration
                tokio::time::sleep(wait_for).await;

                // Fetch the sort key from the catalog
                let v = fetch(partition_id, &*catalog, &backoff_config).await;

                // And attempt to update the value container, if it hasn't
                // already resolved (an eager get() call may have raced this
                // task and stored a value first - in that case, keep it).
                let mut state = value.lock();
                *state = match *state {
                    State::Unresolved => State::Resolved(v),
                    // Already resolved elsewhere - do not overwrite.
                    State::Resolved(_) => return,
                };
            }
        });

        Self {
            value,
            partition_id,
            handle,
            backoff_config,
            catalog,
        }
    }

    /// Read the [`SortKey`] for the partition.
    ///
    /// If the [`SortKey`] was pre-fetched in the background, it is returned
    /// immediately. If the [`SortKey`] has not yet been resolved, this call
    /// blocks while it is read from the [`Catalog`].
    ///
    /// # Concurrency
    ///
    /// If this method requires resolving the [`SortKey`], N concurrent callers
    /// will cause N queries against the catalog.
    ///
    /// # Await Safety
    ///
    /// Cancelling the future returned by calling [`Self::get()`] before
    /// completion will leave [`Self`] without a background task. The next call
    /// to [`Self::get()`] will incur a catalog query (see concurrency above).
    pub(crate) async fn get(&self) -> Option<SortKey> {
        // Scope the lock guard so it is released before the catalog fetch is
        // awaited below.
        {
            let state = self.value.lock();
            // If there is a resolved value, return it.
            if let State::Resolved(v) = &*state {
                return v.clone();
            }
        }

        // Otherwise resolve the value immediately, aborting the background
        // task.
        self.handle.abort();
        let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await;

        // Cache the freshly-fetched value for subsequent get() calls.
        {
            let mut state = self.value.lock();
            *state = State::Resolved(sort_key.clone());
        }

        sort_key
    }
}
impl Drop for DeferredSortKey {
    fn drop(&mut self) {
        // Attempt to abort the background task, regardless of it having
        // completed or not (aborting an already-finished task is harmless).
        self.handle.abort()
    }
}
/// Fetch the [`SortKey`] from the [`Catalog`] for `partition_id`, retrying
/// endlessly when errors occur.
async fn fetch(
partition_id: PartitionId,
catalog: &dyn Catalog,
backoff_config: &BackoffConfig,
) -> Option<SortKey> {
Backoff::new(backoff_config)
.retry_all_errors("fetch partition sort key", || async {
let s = catalog
.repositories()
.await
.partitions()
.get_by_id(partition_id)
.await?
.expect("resolving sort key for non-existent partition")
.sort_key();
Result::<_, iox_catalog::interface::Error>::Ok(s)
})
.await
.expect("retry forever")
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use data_types::ShardIndex;
    use test_helpers::timeout::FutureTimeout;

    use crate::test_util::populate_catalog;

    use super::*;

    const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
    const TABLE_NAME: &str = "bananas";
    const NAMESPACE_NAME: &str = "platanos";
    const PARTITION_KEY: &str = "platanos";

    /// A duration long enough that the background pre-fetch task (almost
    /// certainly) does not run within the lifetime of a test.
    const LONG_LONG_TIME: Duration = Duration::from_secs(10_000_000);

    /// Populate `catalog` with the shard / namespace / table / partition
    /// fixture, returning the ID of the created partition.
    async fn init_partition(catalog: &Arc<dyn Catalog>) -> PartitionId {
        let (shard_id, _ns_id, table_id) =
            populate_catalog(&**catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

        catalog
            .repositories()
            .await
            .partitions()
            .create_or_get(PARTITION_KEY.into(), shard_id, table_id)
            .await
            .expect("should create")
            .id
    }

    /// Spin until the background pre-fetch task of `d` has completed,
    /// panicking if it takes longer than 5 seconds.
    async fn wait_for_prefetch(d: &DeferredSortKey) {
        async {
            loop {
                if d.handle.is_finished() {
                    return;
                }
                tokio::task::yield_now().await
            }
        }
        .with_timeout_panic(Duration::from_secs(5))
        .await;
    }

    // A test that (most likely) exercises the "read on demand" code path.
    //
    // The background task is configured to run some time between now, and
    // LONG_LONG_TIME (10,000,000 seconds) in the future - it most likely
    // doesn't get to complete before the get() call is issued.
    //
    // If this test flakes, it is POSSIBLE but UNLIKELY that the background task
    // has completed and the get() call reads a pre-fetched value.
    #[tokio::test]
    async fn test_read_demand() {
        let metrics = Arc::new(metric::Registry::default());
        let backoff_config = BackoffConfig::default();
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        let partition_id = init_partition(&catalog).await;

        // Read the just-created sort key (None)
        let fetched = DeferredSortKey::new(
            partition_id,
            LONG_LONG_TIME,
            Arc::clone(&catalog),
            backoff_config.clone(),
        )
        .get()
        .await;
        assert!(fetched.is_none());

        // Set the sort key
        let catalog_state = catalog
            .repositories()
            .await
            .partitions()
            .update_sort_key(partition_id, &["uno", "dos", "bananas"])
            .await
            .expect("should update existing partition key");

        // Read the updated sort key
        let fetched = DeferredSortKey::new(
            partition_id,
            LONG_LONG_TIME,
            Arc::clone(&catalog),
            backoff_config,
        )
        .get()
        .await;
        assert!(fetched.is_some());
        assert_eq!(fetched, catalog_state.sort_key());
    }

    // A test that deterministically exercises the "background pre-fetch" code
    // path by waiting for the (near-immediate) background task to finish
    // before calling get().
    #[tokio::test]
    async fn test_read_pre_fetched() {
        let metrics = Arc::new(metric::Registry::default());
        let backoff_config = BackoffConfig::default();
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        let partition_id = init_partition(&catalog).await;

        // Read the just-created sort key (None)
        let fetcher = DeferredSortKey::new(
            partition_id,
            Duration::from_nanos(1),
            Arc::clone(&catalog),
            backoff_config.clone(),
        );
        wait_for_prefetch(&fetcher).await;
        assert!(fetcher.get().await.is_none());

        // Set the sort key
        let catalog_state = catalog
            .repositories()
            .await
            .partitions()
            .update_sort_key(partition_id, &["uno", "dos", "bananas"])
            .await
            .expect("should update existing partition key");

        // Read the updated sort key
        let fetcher = DeferredSortKey::new(
            partition_id,
            Duration::from_nanos(1),
            Arc::clone(&catalog),
            backoff_config,
        );
        wait_for_prefetch(&fetcher).await;

        let fetched = fetcher.get().await;
        assert!(fetched.is_some());
        assert_eq!(fetched, catalog_state.sort_key());
    }
}

View File

@ -3,7 +3,7 @@ use std::{fmt::Debug, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use crate::data::partition::PartitionData;
use crate::data::{partition::PartitionData, table::TableName};
/// An infallible resolver of [`PartitionData`] for the specified shard, table,
/// and partition key, returning an initialised [`PartitionData`] buffer for it.
@ -20,7 +20,7 @@ pub trait PartitionProvider: Send + Sync + Debug {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData;
}
@ -35,7 +35,7 @@ where
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
(**self)
.get_partition(partition_key, shard_id, namespace_id, table_id, table_name)
@ -49,7 +49,7 @@ mod tests {
use data_types::PartitionId;
use crate::data::partition::resolver::MockPartitionProvider;
use crate::data::partition::{resolver::MockPartitionProvider, SortKeyState};
use super::*;
@ -59,7 +59,7 @@ mod tests {
let shard_id = ShardId::new(42);
let namespace_id = NamespaceId::new(1234);
let table_id = TableId::new(24);
let table_name = "platanos".into();
let table_name = TableName::from("platanos");
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
@ -67,22 +67,17 @@ mod tests {
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
table_name.clone(),
SortKeyState::Provided(None),
None,
);
let mock = Arc::new(MockPartitionProvider::default().with_partition(data));
let got = mock
.get_partition(
key,
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
)
.get_partition(key, shard_id, namespace_id, table_id, table_name.clone())
.await;
assert_eq!(got.id(), partition);
assert_eq!(got.partition_id(), partition);
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(*got.table_name(), *table_name);
}

View File

@ -1,159 +0,0 @@
use std::sync::Arc;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
exec::{Executor, ExecutorType},
QueryChunk, QueryChunkMeta, ScanPlanBuilder,
};
use observability_deps::tracing::debug;
use snafu::{ResultExt, Snafu};
use crate::query::QueryableBatch;
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
#[snafu(display("Error creating plan for querying Ingester data to send to Querier"))]
Frontend {
source: iox_query::frontend::common::Error,
},
#[snafu(display("Error building logical plan for querying Ingester data to send to Querier"))]
LogicalPlan { source: DataFusionError },
#[snafu(display(
"Error building physical plan for querying Ingester data to send to Querier: {}",
source
))]
PhysicalPlan { source: DataFusionError },
#[snafu(display(
"Error executing the query for getting Ingester data to send to Querier: {}",
source
))]
ExecutePlan { source: DataFusionError },
}
/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Query a given Queryable Batch, applying selection and filters as appropriate
/// Return stream of record batches
pub(crate) async fn query(
executor: &Executor,
data: Arc<QueryableBatch>,
) -> Result<SendableRecordBatchStream> {
// Build logical plan for filtering data
// Note that this query will also apply the delete predicates that go with the QueryableBatch
// TODO: Since we have different type of servers (router,
// ingester, compactor, and querier), we may want to add more
// types into the ExecutorType to have better log and resource
// managment
let ctx = executor.new_context(ExecutorType::Query);
// Creates an execution plan for a scan and filter data of a single chunk
let schema = data.schema();
let table_name = data.table_name().to_string();
debug!(%table_name, "Creating single chunk scan plan");
let logical_plan = ScanPlanBuilder::new(schema, ctx.child_ctx("scan_and_filter planning"))
.with_chunks([data as _])
.build()
.context(FrontendSnafu)?
.plan_builder
.build()
.context(LogicalPlanSnafu)?;
debug!(%table_name, plan=%logical_plan.display_indent_schema(),
"created single chunk scan plan");
// Build physical plan
let physical_plan = ctx
.create_physical_plan(&logical_plan)
.await
.context(PhysicalPlanSnafu {})?;
// Execute the plan and return the filtered stream
let output_stream = ctx
.execute_stream(physical_plan)
.await
.context(ExecutePlanSnafu {})?;
Ok(output_stream)
}
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use super::*;
use crate::test_util::{
create_one_record_batch_with_influxtype_no_duplicates, create_tombstone,
make_queryable_batch, make_queryable_batch_with_deletes,
};
#[tokio::test]
async fn test_query() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
// build queryable batch from the input batches
let batch = make_queryable_batch("test_table", 0, 1, batches);
// query without filters
let exc = Executor::new(1);
let stream = query(&exc, batch).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify data: all rows and columns should be returned
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 70 | UT | 1970-01-01T00:00:00.000020Z |",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
exc.join().await;
}
#[tokio::test]
async fn test_query_with_delete() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")];
// build queryable batch from the input batches
let batch = make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let exc = Executor::new(1);
let stream = query(&exc, batch).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify data:
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
exc.join().await;
}
}

View File

@ -1,22 +1,49 @@
//! Shard level data buffer structures.
use std::{
collections::{btree_map::Entry, BTreeMap},
sync::Arc,
};
use std::{collections::HashMap, sync::Arc};
use data_types::{ShardId, ShardIndex};
use data_types::{NamespaceId, ShardId, ShardIndex};
use dml::DmlOperation;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use metric::U64Counter;
use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt};
use write_summary::ShardProgress;
use super::{namespace::NamespaceData, partition::resolver::PartitionProvider};
use super::{
namespace::{NamespaceData, NamespaceName},
partition::resolver::PartitionProvider,
};
use crate::lifecycle::LifecycleHandle;
/// A double-referenced map where [`NamespaceData`] can be looked up by name, or
/// ID.
#[derive(Debug, Default)]
struct DoubleRef {
    // TODO(4880): this can be removed when IDs are sent over the wire.
    by_name: HashMap<NamespaceName, Arc<NamespaceData>>,
    by_id: HashMap<NamespaceId, Arc<NamespaceData>>,
}

impl DoubleRef {
    /// Add `ns` to the map under both `name` and its namespace ID, returning
    /// the shared handle.
    fn insert(&mut self, name: NamespaceName, ns: NamespaceData) -> Arc<NamespaceData> {
        let ns_id = ns.namespace_id();

        let shared = Arc::new(ns);
        self.by_name.insert(name, Arc::clone(&shared));
        self.by_id.insert(ns_id, Arc::clone(&shared));
        shared
    }

    /// Look up the namespace data for `name`, if any.
    fn by_name(&self, name: &NamespaceName) -> Option<Arc<NamespaceData>> {
        self.by_name.get(name).cloned()
    }

    /// Look up the namespace data for `id`, if any.
    fn by_id(&self, id: NamespaceId) -> Option<Arc<NamespaceData>> {
        self.by_id.get(&id).cloned()
    }
}
/// Data of a Shard
#[derive(Debug)]
pub(crate) struct ShardData {
@ -32,7 +59,7 @@ pub(crate) struct ShardData {
partition_provider: Arc<dyn PartitionProvider>,
// New namespaces can come in at any time so we need to be able to add new ones
namespaces: RwLock<BTreeMap<String, Arc<NamespaceData>>>,
namespaces: RwLock<DoubleRef>,
metrics: Arc<metric::Registry>,
namespace_count: U64Counter,
@ -72,9 +99,8 @@ impl ShardData {
dml_operation: DmlOperation,
catalog: &Arc<dyn Catalog>,
lifecycle_handle: &dyn LifecycleHandle,
executor: &Executor,
) -> Result<bool, super::Error> {
let namespace_data = match self.namespace(dml_operation.namespace()) {
let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) {
Some(d) => d,
None => {
self.insert_namespace(dml_operation.namespace(), &**catalog)
@ -83,14 +109,24 @@ impl ShardData {
};
namespace_data
.buffer_operation(dml_operation, catalog, lifecycle_handle, executor)
.buffer_operation(dml_operation, catalog, lifecycle_handle)
.await
}
/// Gets the namespace data out of the map
pub(crate) fn namespace(&self, namespace: &str) -> Option<Arc<NamespaceData>> {
pub(crate) fn namespace(&self, namespace: &NamespaceName) -> Option<Arc<NamespaceData>> {
let n = self.namespaces.read();
n.get(namespace).cloned()
n.by_name(namespace)
}
/// Gets the namespace data out of the map
pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option<Arc<NamespaceData>> {
// TODO: this should be the default once IDs are pushed over the wire.
//
// At which point the map should be indexed by IDs, instead of namespace
// names.
let n = self.namespaces.read();
n.by_id(namespace_id)
}
/// Retrieves the namespace from the catalog and initializes an empty buffer, or
@ -101,6 +137,8 @@ impl ShardData {
catalog: &dyn Catalog,
) -> Result<Arc<NamespaceData>, super::Error> {
let mut repos = catalog.repositories().await;
let ns_name = NamespaceName::from(namespace);
let namespace = repos
.namespaces()
.get_by_name(namespace)
@ -110,26 +148,35 @@ impl ShardData {
let mut n = self.namespaces.write();
let data = match n.entry(namespace.name) {
Entry::Vacant(v) => {
let v = v.insert(Arc::new(NamespaceData::new(
namespace.id,
self.shard_id,
Arc::clone(&self.partition_provider),
&*self.metrics,
)));
Ok(match n.by_name(&ns_name) {
Some(v) => v,
None => {
self.namespace_count.inc(1);
Arc::clone(v)
}
Entry::Occupied(v) => Arc::clone(v.get()),
};
Ok(data)
// Insert the table and then return a ref to it.
n.insert(
ns_name.clone(),
NamespaceData::new(
namespace.id,
ns_name,
self.shard_id,
Arc::clone(&self.partition_provider),
&*self.metrics,
),
)
}
})
}
/// Return the progress of this shard
pub(super) async fn progress(&self) -> ShardProgress {
let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect();
let namespaces: Vec<_> = self
.namespaces
.read()
.by_id
.values()
.map(Arc::clone)
.collect();
let mut progress = ShardProgress::new();
@ -144,3 +191,89 @@ impl ShardData {
self.shard_index
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use data_types::{PartitionId, PartitionKey, ShardIndex};
use metric::{Attributes, Metric};
use crate::{
data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
lifecycle::mock_handle::MockLifecycleHandle,
test_util::{make_write_op, populate_catalog},
};
use super::*;
const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
const TABLE_NAME: &str = "bananas";
const NAMESPACE_NAME: &str = "platanos";
#[tokio::test]
async fn test_shard_double_ref() {
let metrics = Arc::new(metric::Registry::default());
let catalog: Arc<dyn Catalog> =
Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));
// Populate the catalog with the shard / namespace / table
let (shard_id, ns_id, table_id) =
populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;
// Configure the mock partition provider to return a partition for this
// table ID.
let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
PartitionData::new(
PartitionId::new(0),
PartitionKey::from("banana-split"),
shard_id,
ns_id,
table_id,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
),
));
let shard = ShardData::new(
SHARD_INDEX,
shard_id,
partition_provider,
Arc::clone(&metrics),
);
// Assert the namespace does not contain the test data
assert!(shard.namespace(&NAMESPACE_NAME.into()).is_none());
assert!(shard.namespace_by_id(ns_id).is_none());
// Write some test data
shard
.buffer_operation(
DmlOperation::Write(make_write_op(
&PartitionKey::from("banana-split"),
SHARD_INDEX,
NAMESPACE_NAME,
0,
r#"bananas,city=Medford day="sun",temp=55 22"#,
)),
&catalog,
&MockLifecycleHandle::default(),
)
.await
.expect("buffer op should succeed");
// Both forms of referencing the table should succeed
assert!(shard.namespace(&NAMESPACE_NAME.into()).is_some());
assert!(shard.namespace_by_id(ns_id).is_some());
// And the table counter metric should increase
let tables = metrics
.get_instrument::<Metric<U64Counter>>("ingester_namespaces_total")
.expect("failed to read metric")
.get_observer(&Attributes::from([]))
.expect("failed to get observer")
.fetch();
assert_eq!(tables, 1);
}
}

View File

@ -1,41 +1,94 @@
//! Table level data buffer structures.
use std::{collections::BTreeMap, sync::Arc};
use std::{collections::HashMap, sync::Arc};
use data_types::{
DeletePredicate, NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId, Timestamp,
};
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use snafu::ResultExt;
use observability_deps::tracing::*;
use write_summary::ShardProgress;
use super::partition::{
resolver::PartitionProvider, PartitionData, PartitionStatus, UnpersistedPartitionData,
};
use crate::lifecycle::LifecycleHandle;
use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData};
use crate::{lifecycle::LifecycleHandle, querier_handler::PartitionStatus};
/// A double-referenced map where [`PartitionData`] can be looked up by
/// [`PartitionKey`], or ID.
#[derive(Debug, Default)]
struct DoubleRef {
    // TODO(4880): this can be removed when IDs are sent over the wire.
    by_key: HashMap<PartitionKey, PartitionData>,
    by_id: HashMap<PartitionId, PartitionKey>,
}

impl DoubleRef {
    /// Add `p` to the map, indexed by both its partition key and partition ID.
    ///
    /// # Panics
    ///
    /// Panics if an entry already exists for either the key or the ID - each
    /// partition may be inserted at most once.
    fn insert(&mut self, p: PartitionData) {
        let partition_id = p.partition_id();
        let partition_key = p.partition_key().clone();

        assert!(self.by_key.insert(partition_key.clone(), p).is_none());
        assert!(self.by_id.insert(partition_id, partition_key).is_none());
    }

    /// Look up the [`PartitionData`] for `key`.
    #[cfg(test)]
    fn by_key(&self, key: &PartitionKey) -> Option<&PartitionData> {
        self.by_key.get(key)
    }

    /// Look up a mutable reference to the [`PartitionData`] for `key`.
    fn by_key_mut(&mut self, key: &PartitionKey) -> Option<&mut PartitionData> {
        self.by_key.get_mut(key)
    }

    /// Look up a mutable reference to the [`PartitionData`] with the given
    /// `id`, translating through the ID -> key index.
    fn by_id_mut(&mut self, id: PartitionId) -> Option<&mut PartitionData> {
        let key = self.by_id.get(&id)?.clone();
        self.by_key_mut(&key)
    }
}
/// The string name / identifier of a Table.
///
/// A reference-counted, cheap clone-able string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TableName(Arc<str>);

impl<T> From<T> for TableName
where
    T: AsRef<str>,
{
    fn from(v: T) -> Self {
        // Copy the borrowed string content into a shared, immutable Arc<str>.
        TableName(v.as_ref().into())
    }
}

impl std::fmt::Display for TableName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate straight to the inner string's Display impl.
        std::fmt::Display::fmt(&self.0, f)
    }
}

impl std::ops::Deref for TableName {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
/// Data of a Table in a given Namespace that belongs to a given Shard
#[derive(Debug)]
pub(crate) struct TableData {
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
/// The catalog ID of the shard & namespace this table is being populated
/// from.
shard_id: ShardId,
namespace_id: NamespaceId,
// the max sequence number for a tombstone associated with this table
tombstone_max_sequence_number: Option<SequenceNumber>,
/// An abstract constructor of [`PartitionData`] instances for a given
/// `(key, shard, table)` triplet.
partition_provider: Arc<dyn PartitionProvider>,
// Map pf partition key to its data
pub(super) partition_data: BTreeMap<PartitionKey, PartitionData>,
// Map of partition key to its data
partition_data: DoubleRef,
}
impl TableData {
@ -51,18 +104,16 @@ impl TableData {
/// for the first time.
pub(super) fn new(
table_id: TableId,
table_name: &str,
table_name: TableName,
shard_id: ShardId,
namespace_id: NamespaceId,
tombstone_max_sequence_number: Option<SequenceNumber>,
partition_provider: Arc<dyn PartitionProvider>,
) -> Self {
Self {
table_id,
table_name: table_name.into(),
table_name,
shard_id,
namespace_id,
tombstone_max_sequence_number,
partition_data: Default::default(),
partition_provider,
}
@ -71,18 +122,13 @@ impl TableData {
/// Return parquet_max_sequence_number
pub(super) fn parquet_max_sequence_number(&self) -> Option<SequenceNumber> {
self.partition_data
.by_key
.values()
.map(|p| p.max_persisted_sequence_number())
.max()
.flatten()
}
/// Return tombstone_max_sequence_number
#[allow(dead_code)] // Used in tests
pub(super) fn tombstone_max_sequence_number(&self) -> Option<SequenceNumber> {
self.tombstone_max_sequence_number
}
// buffers the table write and returns true if the lifecycle manager indicates that
// ingest should be paused.
pub(super) async fn buffer_table_write(
@ -92,7 +138,7 @@ impl TableData {
partition_key: PartitionKey,
lifecycle_handle: &dyn LifecycleHandle,
) -> Result<bool, super::Error> {
let partition_data = match self.partition_data.get_mut(&partition_key) {
let partition_data = match self.partition_data.by_key.get_mut(&partition_key) {
Some(p) => p,
None => {
let p = self
@ -102,86 +148,87 @@ impl TableData {
self.shard_id,
self.namespace_id,
self.table_id,
Arc::clone(&self.table_name),
self.table_name.clone(),
)
.await;
// Add the partition to the map.
assert!(self
.partition_data
.insert(partition_key.clone(), p)
.is_none());
self.partition_data.get_mut(&partition_key).unwrap()
// Add the double-referenced partition to the map.
self.partition_data.insert(p);
self.partition_data.by_key_mut(&partition_key).unwrap()
}
};
// skip the write if it has already been persisted
if let Some(max) = partition_data.max_persisted_sequence_number() {
if max >= sequence_number {
trace!(
shard_id=%self.shard_id,
op_sequence_number=?sequence_number,
"skipping already-persisted write"
);
return Ok(false);
}
}
let size = batch.size();
let rows = batch.rows();
partition_data.buffer_write(sequence_number, batch)?;
// Record the write as having been buffered.
//
// This should happen AFTER the write is applied, because buffering the
// op may fail which would lead to a write being recorded, but not
// applied.
let should_pause = lifecycle_handle.log_write(
partition_data.id(),
partition_data.partition_id(),
self.shard_id,
self.namespace_id,
self.table_id,
sequence_number,
batch.size(),
batch.rows(),
size,
rows,
);
partition_data.buffer_write(sequence_number, batch)?;
Ok(should_pause)
}
pub(super) async fn buffer_delete(
/// Return the [`PartitionData`] for the specified ID.
#[allow(unused)]
pub(crate) fn get_partition(
&mut self,
predicate: &DeletePredicate,
sequence_number: SequenceNumber,
catalog: &dyn Catalog,
executor: &Executor,
) -> Result<(), super::Error> {
let min_time = Timestamp::new(predicate.range.start());
let max_time = Timestamp::new(predicate.range.end());
partition_id: PartitionId,
) -> Option<&mut PartitionData> {
self.partition_data.by_id_mut(partition_id)
}
let mut repos = catalog.repositories().await;
let tombstone = repos
.tombstones()
.create_or_get(
self.table_id,
self.shard_id,
sequence_number,
min_time,
max_time,
&predicate.expr_sql_string(),
)
.await
.context(super::CatalogSnafu)?;
/// Return the [`PartitionData`] for the specified partition key.
#[cfg(test)]
pub(crate) fn get_partition_by_key(
&self,
partition_key: &PartitionKey,
) -> Option<&PartitionData> {
self.partition_data.by_key(partition_key)
}
// remember "persisted" state
self.tombstone_max_sequence_number = Some(sequence_number);
// modify one partition at a time
for data in self.partition_data.values_mut() {
data.buffer_tombstone(executor, tombstone.clone()).await;
}
Ok(())
/// Return the [`PartitionData`] for the specified partition key.
pub(crate) fn get_partition_by_key_mut(
&mut self,
partition_key: &PartitionKey,
) -> Option<&mut PartitionData> {
self.partition_data.by_key_mut(partition_key)
}
pub(crate) fn unpersisted_partition_data(&self) -> Vec<UnpersistedPartitionData> {
self.partition_data
.by_key
.values()
.map(|p| UnpersistedPartitionData {
partition_id: p.id(),
partition_id: p.partition_id(),
non_persisted: p
.get_non_persisting_data()
.expect("get_non_persisting should always work"),
persisting: p.get_persisting_data(),
partition_status: PartitionStatus {
parquet_max_sequence_number: p.max_persisted_sequence_number(),
tombstone_max_sequence_number: self.tombstone_max_sequence_number,
},
})
.collect()
@ -196,14 +243,223 @@ impl TableData {
};
self.partition_data
.by_key
.values()
.fold(progress, |progress, partition_data| {
progress.combine(partition_data.progress())
})
}
#[cfg(test)]
/// Returns the catalog ID of this table (NOT of a partition — the original
/// comment's "for this partition" wording was inaccurate).
pub(super) fn table_id(&self) -> TableId {
    self.table_id
}
/// Returns the name of the table this instance buffers writes for.
pub(crate) fn table_name(&self) -> &TableName {
    &self.table_name
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use data_types::{PartitionId, ShardIndex};
use iox_catalog::interface::Catalog;
use mutable_batch::writer;
use mutable_batch_lp::lines_to_batches;
use schema::{InfluxColumnType, InfluxFieldType};
use crate::{
data::{
partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
Error,
},
lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle},
test_util::populate_catalog,
};
use super::*;
const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
const TABLE_NAME: &str = "bananas";
const NAMESPACE_NAME: &str = "platanos";
const PARTITION_KEY: &str = "platanos";
const PARTITION_ID: PartitionId = PartitionId::new(0);
/// Ensure a partition initialised by a buffered write is reachable through
/// BOTH lookup paths (by partition key and by partition ID).
#[tokio::test]
async fn test_partition_double_ref() {
    let metrics = Arc::new(metric::Registry::default());
    let catalog: Arc<dyn Catalog> =
        Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

    // Populate the catalog with the shard / namespace / table
    let (shard_id, ns_id, table_id) =
        populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

    // Configure the mock partition provider to return a partition for this
    // table ID.
    let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
        PartitionData::new(
            PARTITION_ID,
            PARTITION_KEY.into(),
            shard_id,
            ns_id,
            table_id,
            TABLE_NAME.into(),
            SortKeyState::Provided(None),
            None,
        ),
    ));

    let mut table = TableData::new(
        table_id,
        TABLE_NAME.into(),
        shard_id,
        ns_id,
        partition_provider,
    );

    // A single line-protocol row targeting the table under test.
    let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    // Assert the table does not contain the test partition
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none());
    assert!(table.partition_data.by_id_mut(PARTITION_ID).is_none());

    // Write some test data
    let pause = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &MockLifecycleHandle::default(),
        )
        .await
        .expect("buffer op should succeed");
    // The mock lifecycle handle never requests an ingest pause.
    assert!(!pause);

    // Referencing the partition should succeed
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some());
    assert!(table.partition_data.by_id_mut(PARTITION_ID).is_some());
}
/// Ensure that a write which fails to buffer (schema/type conflict) does NOT
/// report bytes to the lifecycle manager — only the successful first write
/// must be accounted for.
#[tokio::test]
async fn test_bad_write_memory_counting() {
    let metrics = Arc::new(metric::Registry::default());
    let catalog: Arc<dyn Catalog> =
        Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

    // Populate the catalog with the shard / namespace / table
    let (shard_id, ns_id, table_id) =
        populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

    // Configure the mock partition provider to return a partition for this
    // table ID.
    let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
        PartitionData::new(
            PARTITION_ID,
            PARTITION_KEY.into(),
            shard_id,
            ns_id,
            table_id,
            TABLE_NAME.into(),
            SortKeyState::Provided(None),
            None,
        ),
    ));

    let mut table = TableData::new(
        table_id,
        TABLE_NAME.into(),
        shard_id,
        ns_id,
        partition_provider,
    );

    // First write: a numeric (float) "value" field.
    let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    // Initialise the mock lifecycle handle and use it to inspect the calls
    // made to the lifecycle manager during buffering.
    let handle = MockLifecycleHandle::default();

    // Assert the table does not contain the test partition
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none());

    // Write some test data
    let pause = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &handle,
        )
        .await
        .expect("buffer op should succeed");
    assert!(!pause);

    // Referencing the partition should succeed
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some());

    // And the lifecycle handle was called with the expected values
    assert_eq!(
        handle.get_log_calls(),
        &[MockLifecycleCall {
            partition_id: PARTITION_ID,
            shard_id,
            namespace_id: ns_id,
            table_id,
            sequence_number: SequenceNumber::new(42),
            // NOTE: byte count is implementation-defined; update if the
            // buffer's size accounting changes.
            bytes_written: 1131,
            rows_written: 1,
        }]
    );

    // Attempt to buffer the second op that contains a type conflict - this
    // should return an error, and not make a call to the lifecycle handle
    // (as no data was buffered)
    //
    // Note the type of value was numeric previously, and here it is a string.
    let batch = lines_to_batches(r#"bananas,bat=man value="platanos" 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    let err = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &handle,
        )
        .await
        .expect_err("type conflict should error");

    // The buffer op should return a column type error
    assert_matches!(
        err,
        Error::BufferWrite {
            source: mutable_batch::Error::WriterError {
                source: writer::Error::TypeMismatch {
                    existing: InfluxColumnType::Field(InfluxFieldType::Float),
                    inserted: InfluxColumnType::Field(InfluxFieldType::String),
                    column: col_name,
                }
            },
        } => { assert_eq!(col_name, "value") }
    );

    // And the lifecycle handle should not be called.
    //
    // It still contains the first call, so the desired length is 1
    // indicating no second call was made.
    assert_eq!(handle.get_log_calls().len(), 1);
}
}

View File

@ -30,17 +30,24 @@ use crate::{
data::{
partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider},
shard::ShardData,
IngesterData, IngesterQueryResponse,
IngesterData,
},
lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager},
poison::PoisonCabinet,
querier_handler::prepare_data_to_querier,
querier_handler::{prepare_data_to_querier, IngesterQueryResponse},
stream_handler::{
handler::SequencedStreamHandler, sink_adaptor::IngestSinkAdaptor,
sink_instrumentation::SinkInstrumentation, PeriodicWatermarkFetcher,
},
};
/// The maximum duration of time between creating a [`PartitionData`] and its
/// [`SortKey`] being fetched from the catalog.
///
/// [`PartitionData`]: crate::data::partition::PartitionData
/// [`SortKey`]: schema::sort::SortKey
const SORT_KEY_PRE_FETCH: Duration = Duration::from_secs(30);
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
@ -160,7 +167,13 @@ impl IngestHandlerImpl {
// Build the partition provider.
let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog));
let partition_provider = PartitionCache::new(partition_provider, recent_partitions);
let partition_provider = PartitionCache::new(
partition_provider,
recent_partitions,
SORT_KEY_PRE_FETCH,
Arc::clone(&catalog),
BackoffConfig::default(),
);
let partition_provider: Arc<dyn PartitionProvider> = Arc::new(partition_provider);
// build the initial ingester data state
@ -432,7 +445,7 @@ mod tests {
use write_buffer::mock::{MockBufferForReading, MockBufferSharedState};
use super::*;
use crate::data::partition::SnapshotBatch;
use crate::data::{partition::SnapshotBatch, table::TableName};
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
@ -499,13 +512,16 @@ mod tests {
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there from both writes.
tokio::time::timeout(Duration::from_secs(2), async {
let ns_name = ingester.namespace.name.into();
let table_name = TableName::from("a");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) {
if let Some(data) = data.namespace(&ingester.namespace.name) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await {
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
if b.data.num_rows() > 0 {
has_measurement = true;
@ -740,13 +756,16 @@ mod tests {
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there
tokio::time::timeout(Duration::from_secs(1), async move {
let ns_name = namespace.name.into();
let table_name = TableName::from("cpu");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.data.shard(shard.id) {
if let Some(data) = data.namespace(&namespace.name) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await {
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
custom_batch_verification(b);

View File

@ -12,7 +12,7 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration};
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId};
use iox_time::{Time, TimeProvider};
use metric::{Metric, U64Counter};
use observability_deps::tracing::{error, info, warn};
use observability_deps::tracing::{error, info, trace, warn};
use parking_lot::Mutex;
use tokio_util::sync::CancellationToken;
use tracker::TrackedFutureExt;
@ -97,6 +97,18 @@ impl LifecycleHandle for LifecycleHandleImpl {
stats.last_write = now;
stats.rows_written += rows_written;
trace!(
shard_id=%stats.shard_id,
partition_id=%stats.partition_id,
namespace_id=%stats.namespace_id,
table_id=%stats.table_id,
first_write=%stats.first_write,
last_write=%stats.last_write,
bytes_written=%stats.bytes_written,
first_sequence_number=?stats.first_sequence_number,
"logged write"
);
s.total_bytes += bytes_written;
// Pause if the server has exceeded the configured memory limit.
@ -234,7 +246,7 @@ struct LifecycleStats {
}
/// The stats for a partition
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
struct PartitionLifecycleStats {
/// The shard this partition is under
shard_id: ShardId,
@ -469,6 +481,18 @@ impl LifecycleManager {
let persist_tasks: Vec<_> = to_persist
.into_iter()
.map(|s| {
// BUG: TOCTOU: memory usage released may be incorrect.
//
// Here the amount of memory to be reduced is acquired, but this
// code does not prevent continued writes adding more data to
// the partition in another thread.
//
// This may lead to more actual data being persisted than the
// call below returns to the server pool - this would slowly
// starve the ingester of memory it thinks it has.
//
// See https://github.com/influxdata/influxdb_iox/issues/5777
// Mark this partition as being persisted, and remember the
// memory allocation it had accumulated.
let partition_memory_usage = self
@ -483,7 +507,9 @@ impl LifecycleManager {
let state = Arc::clone(&self.state);
tokio::task::spawn(async move {
persister.persist(s.partition_id).await;
persister
.persist(s.shard_id, s.namespace_id, s.table_id, s.partition_id)
.await;
// Now the data has been uploaded and the memory it was
// using has been freed, released the memory capacity back
// the ingester.
@ -524,6 +550,12 @@ impl LifecycleManager {
.map(|s| s.first_sequence_number)
.min()
.unwrap_or(sequence_number);
trace!(
min_unpersisted_sequence_number=?min,
shard_id=%shard_id,
sequence_number=?sequence_number,
"updated min_unpersisted_sequence_number for persisted shard"
);
persister
.update_min_unpersisted_sequence_number(shard_id, min)
.await;
@ -602,7 +634,13 @@ mod tests {
#[async_trait]
impl Persister for TestPersister {
async fn persist(&self, partition_id: PartitionId) {
async fn persist(
&self,
_shard_id: ShardId,
_namespace_id: NamespaceId,
_table_id: TableId,
partition_id: PartitionId,
) {
let mut p = self.persist_called.lock();
p.insert(partition_id);
}
@ -662,8 +700,16 @@ mod tests {
#[async_trait]
impl Persister for PausablePersister {
async fn persist(&self, partition_id: PartitionId) {
self.inner.persist(partition_id).await;
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
) {
self.inner
.persist(shard_id, namespace_id, table_id, partition_id)
.await;
if let Some(event) = self.event(partition_id) {
event.before.wait().await;
event.after.wait().await;

View File

@ -1,26 +1,66 @@
//! A mock [`LifecycleHandle`] impl for testing.
use std::sync::Arc;
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId};
use parking_lot::Mutex;
use super::LifecycleHandle;
/// Special [`LifecycleHandle`] that never persists and always accepts more data.
///
/// This is useful to control persists manually.
#[derive(Debug, Default, Clone, Copy)]
pub struct NoopLifecycleHandle;
/// A set of arguments captured from a call to
/// [`MockLifecycleHandle::log_write()`].
// Derives `PartialEq`/`Eq` so tests can assert directly on captured calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(missing_docs)]
pub struct MockLifecycleCall {
    pub partition_id: PartitionId,
    pub shard_id: ShardId,
    pub namespace_id: NamespaceId,
    pub table_id: TableId,
    pub sequence_number: SequenceNumber,
    pub bytes_written: usize,
    pub rows_written: usize,
}
impl LifecycleHandle for NoopLifecycleHandle {
/// A mock [`LifecycleHandle`] implementation that records calls made to
/// [`Self::log_write()`] and never blocks ingest, always accepting more data.
///
/// # Cloning
///
/// Cloning a [`MockLifecycleHandle`] will clone the inner state - calls to all
/// cloned instances are reported in a call to [`Self::get_log_calls()`].
#[derive(Debug, Default, Clone)]
pub struct MockLifecycleHandle {
    // Shared, mutex-guarded call log; the `Arc` is what makes every clone
    // observe the same recorded calls.
    log_calls: Arc<Mutex<Vec<MockLifecycleCall>>>,
}
impl MockLifecycleHandle {
    /// Returns a copy of the ordered [`Self::log_write()`] calls recorded by
    /// this mock (including calls made through any of its clones).
    pub fn get_log_calls(&self) -> Vec<MockLifecycleCall> {
        let guard = self.log_calls.lock();
        (*guard).clone()
    }
}
impl LifecycleHandle for MockLifecycleHandle {
fn log_write(
&self,
_partition_id: PartitionId,
_shard_id: ShardId,
_namespace_id: NamespaceId,
_table_id: TableId,
_sequence_number: SequenceNumber,
_bytes_written: usize,
_rows_written: usize,
partition_id: PartitionId,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
sequence_number: SequenceNumber,
bytes_written: usize,
rows_written: usize,
) -> bool {
self.log_calls.lock().push(MockLifecycleCall {
partition_id,
shard_id,
namespace_id,
table_id,
sequence_number,
bytes_written,
rows_written,
});
// do NOT pause ingest
false
}

View File

@ -1,10 +1,13 @@
//! Handle all requests from Querier
use std::sync::Arc;
use std::{pin::Pin, sync::Arc};
use arrow::{error::ArrowError, record_batch::RecordBatch};
use arrow_util::optimize::{optimize_record_batch, optimize_schema};
use data_types::{PartitionId, SequenceNumber};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
use futures::StreamExt;
use futures::{Stream, StreamExt};
use generated_types::ingester::IngesterQueryRequest;
use observability_deps::tracing::debug;
use schema::selection::Selection;
@ -12,8 +15,8 @@ use snafu::{ensure, Snafu};
use crate::{
data::{
partition::UnpersistedPartitionData, IngesterData, IngesterQueryPartition,
IngesterQueryResponse,
namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName,
IngesterData,
},
query::QueryableBatch,
};
@ -47,6 +50,159 @@ pub enum Error {
/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Stream of snapshots.
///
/// Every snapshot is a dedicated [`SendableRecordBatchStream`].
pub(crate) type SnapshotStream =
Pin<Box<dyn Stream<Item = Result<SendableRecordBatchStream, ArrowError>> + Send>>;
/// Status of a partition that has unpersisted data.
///
/// Note that this structure is specific to a partition (which itself is bound to a table and
/// shard)!
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_copy_implementations)]
pub struct PartitionStatus {
    /// Max sequence number persisted.
    ///
    /// `None` presumably means nothing has been persisted for this partition
    /// yet — TODO(review): confirm against the producer of this value.
    pub parquet_max_sequence_number: Option<SequenceNumber>,
}
/// Response data for a single partition.
///
/// One element of the partition stream carried by `IngesterQueryResponse`;
/// flattened onto the wire by `IngesterQueryResponse::flatten()`.
pub(crate) struct IngesterQueryPartition {
    /// Stream of snapshots.
    snapshots: SnapshotStream,

    /// Partition ID.
    id: PartitionId,

    /// Partition persistence status.
    status: PartitionStatus,
}
impl std::fmt::Debug for IngesterQueryPartition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The snapshot stream cannot implement `Debug`, so a fixed
        // placeholder is rendered in its place; remaining fields are shown
        // as-is.
        let mut builder = f.debug_struct("IngesterQueryPartition");
        builder.field("snapshots", &"<SNAPSHOT STREAM>");
        builder.field("id", &self.id);
        builder.field("status", &self.status);
        builder.finish()
    }
}
impl IngesterQueryPartition {
    /// Construct a partition response from a stream of `snapshots`, the
    /// catalog `id` of the partition, and its persistence `status`.
    pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self {
        Self {
            snapshots,
            id,
            status,
        }
    }
}
/// Stream of partitions in this response.
pub(crate) type IngesterQueryPartitionStream =
Pin<Box<dyn Stream<Item = Result<IngesterQueryPartition, ArrowError>> + Send>>;
/// Response streams for querier<>ingester requests.
///
/// The data structure is constructed to allow lazy/streaming data generation. For easier
/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method.
pub struct IngesterQueryResponse {
    /// Stream of partitions.
    partitions: IngesterQueryPartitionStream,
}
impl std::fmt::Debug for IngesterQueryResponse {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The partition stream cannot implement `Debug`; render a fixed
        // placeholder instead.
        let mut builder = f.debug_struct("IngesterQueryResponse");
        builder.field("partitions", &"<PARTITION STREAM>");
        builder.finish()
    }
}
impl IngesterQueryResponse {
    /// Make a response from the given stream of partitions.
    pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self {
        Self { partitions }
    }

    /// Flattens the data according to the wire protocol.
    ///
    /// Each partition becomes a `StartPartition` marker followed by its
    /// snapshots; each snapshot becomes a `StartSnapshot` marker (carrying
    /// the optimised schema) followed by its record batches. Errors at any
    /// nesting level are passed through in stream order.
    pub fn flatten(self) -> FlatIngesterQueryResponseStream {
        self.partitions
            .flat_map(|partition_res| match partition_res {
                Ok(partition) => {
                    // Announce the partition before any of its snapshot data.
                    let head = futures::stream::once(async move {
                        Ok(FlatIngesterQueryResponse::StartPartition {
                            partition_id: partition.id,
                            status: partition.status,
                        })
                    });
                    let tail = partition
                        .snapshots
                        .flat_map(|snapshot_res| match snapshot_res {
                            Ok(snapshot) => {
                                // Optimise the schema once per snapshot and
                                // share it with every batch of that snapshot.
                                let schema = Arc::new(optimize_schema(&snapshot.schema()));
                                let schema_captured = Arc::clone(&schema);
                                let head = futures::stream::once(async {
                                    Ok(FlatIngesterQueryResponse::StartSnapshot {
                                        schema: schema_captured,
                                    })
                                });
                                let tail = snapshot.map(move |batch_res| match batch_res {
                                    Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch {
                                        batch: optimize_record_batch(&batch, Arc::clone(&schema))?,
                                    }),
                                    Err(e) => Err(e),
                                });
                                head.chain(tail).boxed()
                            }
                            // A failed snapshot yields its error in-place.
                            Err(e) => futures::stream::once(async { Err(e) }).boxed(),
                        });
                    head.chain(tail).boxed()
                }
                // A failed partition yields its error in-place.
                Err(e) => futures::stream::once(async { Err(e) }).boxed(),
            })
            .boxed()
    }
}
/// Flattened version of [`IngesterQueryResponse`].
pub(crate) type FlatIngesterQueryResponseStream =
Pin<Box<dyn Stream<Item = Result<FlatIngesterQueryResponse, ArrowError>> + Send>>;
/// Element within the flat wire protocol.
///
/// Produced by [`IngesterQueryResponse::flatten()`]. Consumers must track the
/// most recent `StartPartition`/`StartSnapshot` marker to associate each
/// subsequent message with its partition and schema.
#[derive(Debug, PartialEq)]
pub enum FlatIngesterQueryResponse {
    /// Start a new partition.
    StartPartition {
        /// Partition ID.
        partition_id: PartitionId,

        /// Partition persistence status.
        status: PartitionStatus,
    },

    /// Start a new snapshot.
    ///
    /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition)
    /// message.
    StartSnapshot {
        /// Snapshot schema.
        schema: Arc<arrow::datatypes::Schema>,
    },

    /// Add a record batch to the snapshot that was announced by the last
    /// [`StartSnapshot`](Self::StartSnapshot) message.
    RecordBatch {
        /// Record batch.
        batch: RecordBatch,
    },
}
/// Return data to send as a response back to the Querier per its request
pub async fn prepare_data_to_querier(
ingest_data: &Arc<IngesterData>,
@ -57,7 +213,8 @@ pub async fn prepare_data_to_querier(
let mut found_namespace = false;
for (shard_id, shard_data) in ingest_data.shards() {
debug!(shard_id=%shard_id.get());
let namespace_data = match shard_data.namespace(&request.namespace) {
let namespace_name = NamespaceName::from(&request.namespace);
let namespace_data = match shard_data.namespace(&namespace_name) {
Some(namespace_data) => {
debug!(namespace=%request.namespace, "found namespace");
found_namespace = true;
@ -68,7 +225,8 @@ pub async fn prepare_data_to_querier(
}
};
let table_data = match namespace_data.table_data(&request.table) {
let table_name = TableName::from(&request.table);
let table_data = match namespace_data.table_data(&table_name) {
Some(table_data) => {
debug!(table_name=%request.table, "found table");
table_data
@ -153,7 +311,6 @@ fn prepare_data_to_querier_for_partition(
request.table.clone().into(),
unpersisted_partition_data.partition_id,
vec![],
vec![],
)
})
.with_data(unpersisted_partition_data.non_persisted);
@ -188,22 +345,106 @@ fn prepare_data_to_querier_for_partition(
#[cfg(test)]
mod tests {
use arrow::{array::new_null_array, record_batch::RecordBatch};
use std::task::{Context, Poll};
use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch};
use arrow_util::assert_batches_sorted_eq;
use assert_matches::assert_matches;
use datafusion::logical_plan::{col, lit};
use datafusion::{
logical_plan::{col, lit},
physical_plan::RecordBatchStream,
};
use futures::TryStreamExt;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use predicate::Predicate;
use schema::merge::SchemaMerger;
use super::*;
use crate::{
data::FlatIngesterQueryResponse,
test_util::{
make_ingester_data, make_ingester_data_with_tombstones, DataLocation, TEST_NAMESPACE,
TEST_TABLE,
},
};
use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE};
/// Exercise `IngesterQueryResponse::flatten()` against nested streams that
/// interleave successful partitions/snapshots/batches with errors at every
/// level, and verify the flattened message order.
#[tokio::test]
async fn test_ingester_query_response_flatten() {
    let batch_1_1 = lp_to_batch("table x=1 0");
    let batch_1_2 = lp_to_batch("table x=2 1");
    let batch_2 = lp_to_batch("table y=1 10");
    let batch_3 = lp_to_batch("table z=1 10");

    let schema_1 = batch_1_1.schema();
    let schema_2 = batch_2.schema();
    let schema_3 = batch_3.schema();

    // Partition 2 carries: a snapshot with an embedded batch error, a
    // snapshot-level error, a one-batch snapshot, and an empty snapshot.
    // A partition-level error follows, then an empty partition 1.
    let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([
        Ok(IngesterQueryPartition::new(
            Box::pin(futures::stream::iter([
                Ok(Box::pin(TestRecordBatchStream::new(
                    vec![
                        Ok(batch_1_1.clone()),
                        Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
                        Ok(batch_1_2.clone()),
                    ],
                    Arc::clone(&schema_1),
                )) as _),
                Err(ArrowError::InvalidArgumentError("invalid arg".into())),
                Ok(Box::pin(TestRecordBatchStream::new(
                    vec![Ok(batch_2.clone())],
                    Arc::clone(&schema_2),
                )) as _),
                Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _),
            ])),
            PartitionId::new(2),
            PartitionStatus {
                parquet_max_sequence_number: None,
            },
        )),
        Err(ArrowError::IoError("some io error".into())),
        Ok(IngesterQueryPartition::new(
            Box::pin(futures::stream::iter([])),
            PartitionId::new(1),
            PartitionStatus {
                parquet_max_sequence_number: None,
            },
        )),
    ])));

    let actual: Vec<_> = response.flatten().collect().await;
    let expected = vec![
        Ok(FlatIngesterQueryResponse::StartPartition {
            partition_id: PartitionId::new(2),
            status: PartitionStatus {
                parquet_max_sequence_number: None,
            },
        }),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }),
        Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }),
        Err(ArrowError::InvalidArgumentError("invalid arg".into())),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }),
        Err(ArrowError::IoError("some io error".into())),
        Ok(FlatIngesterQueryResponse::StartPartition {
            partition_id: PartitionId::new(1),
            status: PartitionStatus {
                parquet_max_sequence_number: None,
            },
        }),
    ];

    // Compare element-wise: Ok values must match exactly; errors only need
    // to line up positionally because `ArrowError` is not comparable.
    assert_eq!(actual.len(), expected.len());
    for (actual, expected) in actual.into_iter().zip(expected) {
        match (actual, expected) {
            (Ok(actual), Ok(expected)) => {
                assert_eq!(actual, expected);
            }
            (Err(_), Err(_)) => {
                // cannot compare `ArrowError`, but it's unlikely that someone changed the error
            }
            (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"),
            (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"),
        }
    }
}
#[tokio::test]
async fn test_prepare_data_to_querier() {
@ -360,180 +601,44 @@ mod tests {
}
}
#[tokio::test]
async fn test_prepare_data_to_querier_with_tombstones() {
test_helpers::maybe_start_logging();
/// A test double implementing `RecordBatchStream` that yields a fixed,
/// pre-scripted sequence of results.
pub struct TestRecordBatchStream {
    // Schema reported by `schema()`, supplied independently of the batches.
    schema: SchemaRef,
    // Remaining items to yield, in order; entries may be errors.
    batches: Vec<Result<RecordBatch, ArrowError>>,
}
// make 7 scenarios for ingester data with tombstones
let mut scenarios = vec![];
for loc in &[
DataLocation::BUFFER,
DataLocation::BUFFER_SNAPSHOT,
DataLocation::BUFFER_PERSISTING,
DataLocation::BUFFER_SNAPSHOT_PERSISTING,
DataLocation::SNAPSHOT,
DataLocation::SNAPSHOT_PERSISTING,
DataLocation::PERSISTING,
] {
let scenario = Arc::new(make_ingester_data_with_tombstones(*loc).await);
scenarios.push((loc, scenario));
impl TestRecordBatchStream {
    /// Construct a stream that yields `batches` in order and reports `schema`
    /// as the stream schema.
    pub fn new(batches: Vec<Result<RecordBatch, ArrowError>>, schema: SchemaRef) -> Self {
        Self { schema, batches }
    }
}
// read data from all scenarios without any filters
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec![],
None,
));
let expected_not_persisting = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Andover | mon | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Medford | wed | | 1970-01-01T00:00:00.000000026Z |",
"| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |",
"+------------+-----+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Andover | mon | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | mon | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Medford | wed | | 1970-01-01T00:00:00.000000026Z |",
"| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |",
"+------------+-----+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
impl RecordBatchStream for TestRecordBatchStream {
    /// Returns (a clone of) the schema supplied at construction time.
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }
}
impl futures::Stream for TestRecordBatchStream {
type Item = Result<RecordBatch, ArrowError>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.batches.is_empty() {
Poll::Ready(None)
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
Poll::Ready(Some(self.batches.remove(0)))
}
}
// read data from all scenarios and filter out column day
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec!["city".to_string(), "temp".to_string(), "time".to_string()],
None,
));
let expected_not_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
fn size_hint(&self) -> (usize, Option<usize>) {
(self.batches.len(), Some(self.batches.len()))
}
}
// read data from all scenarios, filter out column day, city Medford, time outside range [0, 42)
let expr = col("city").not_eq(lit("Medford"));
let pred = Predicate::default().with_expr(expr).with_range(0, 42);
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec!["city".to_string(), "temp".to_string(), "time".to_string()],
Some(pred),
));
// predicates and de-dup are NOT applied!, otherwise this would look like this:
// let expected = vec![
// "+------------+------+--------------------------------+",
// "| city | temp | time |",
// "+------------+------+--------------------------------+",
// "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
// "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
// "| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
// "+------------+------+--------------------------------+",
// ];
let expected_not_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
}
/// Convert a line-protocol string into an Arrow [`RecordBatch`] containing
/// all of its columns.
fn lp_to_batch(lp: &str) -> RecordBatch {
    let (_measurement, mutable) = lp_to_mutable_batch(lp);
    mutable.to_arrow(Selection::All).unwrap()
}
/// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es.

View File

@ -6,26 +6,26 @@ use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary,
TimestampMinMax, Tombstone,
TimestampMinMax,
};
use datafusion::physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
SendableRecordBatchStream,
use datafusion::{
error::DataFusionError,
physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
SendableRecordBatchStream,
},
};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use predicate::{
delete_predicate::{tombstones_to_delete_predicates, tombstones_to_delete_predicates_iter},
Predicate,
};
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use crate::data::partition::SnapshotBatch;
use crate::data::{partition::SnapshotBatch, table::TableName};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
@ -53,11 +53,8 @@ pub(crate) struct QueryableBatch {
/// data
pub(crate) data: Vec<Arc<SnapshotBatch>>,
/// Delete predicates of the tombstones
pub(crate) delete_predicates: Vec<Arc<DeletePredicate>>,
/// This is needed to return a reference for a trait function
pub(crate) table_name: Arc<str>,
pub(crate) table_name: TableName,
/// Partition ID
pub(crate) partition_id: PartitionId,
@ -66,15 +63,12 @@ pub(crate) struct QueryableBatch {
impl QueryableBatch {
/// Initilaize a QueryableBatch
pub(crate) fn new(
table_name: Arc<str>,
table_name: TableName,
partition_id: PartitionId,
data: Vec<Arc<SnapshotBatch>>,
deletes: Vec<Tombstone>,
) -> Self {
let delete_predicates = tombstones_to_delete_predicates(&deletes);
Self {
data,
delete_predicates,
table_name,
partition_id,
}
@ -86,12 +80,6 @@ impl QueryableBatch {
self
}
/// Add more tombstones
pub(crate) fn add_tombstones(&mut self, deletes: &[Tombstone]) {
let delete_predicates = tombstones_to_delete_predicates_iter(deletes);
self.delete_predicates.extend(delete_predicates);
}
/// return min and max of all the snapshots
pub(crate) fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) {
let min = self
@ -110,11 +98,6 @@ impl QueryableBatch {
(min, max)
}
/// return true if it has no data
pub(crate) fn is_empty(&self) -> bool {
self.data.is_empty()
}
}
impl QueryChunkMeta for QueryableBatch {
@ -144,16 +127,16 @@ impl QueryChunkMeta for QueryableBatch {
None // Ingester data is not sorted
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
self.delete_predicates.as_ref()
}
fn timestamp_min_max(&self) -> Option<TimestampMinMax> {
// Note: we need to consider which option we want to go with
// . Return None here and avoid taking time to compute time's min max of RecordBacthes (current choice)
// . Compute time's min max here and avoid compacting non-overlapped QueryableBatches in the Ingester
None
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
&[]
}
}
impl QueryChunk for QueryableBatch {
@ -185,7 +168,7 @@ impl QueryChunk for QueryableBatch {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -199,7 +182,7 @@ impl QueryChunk for QueryableBatch {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -210,12 +193,16 @@ impl QueryChunk for QueryableBatch {
mut ctx: IOxSessionContext,
_predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "ingester");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
let schema = self.schema().select(selection).context(SchemaSnafu)?;
let schema = self
.schema()
.select(selection)
.context(SchemaSnafu)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Get all record batches from their snapshots
let batches = self
@ -234,7 +221,8 @@ impl QueryChunk for QueryableBatch {
.map(Arc::new);
Some(batch)
})
.collect::<Result<Vec<_>, _>>()?;
.collect::<Result<Vec<_>, _>>()
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Return stream of data
let dummy_metrics = ExecutionPlanMetricsSet::new();
@ -257,165 +245,3 @@ impl QueryChunk for QueryableBatch {
self
}
}
#[cfg(test)]
mod tests {
use arrow::{
array::{
ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray,
TimestampNanosecondArray, UInt64Array,
},
datatypes::{DataType, Int32Type, TimeUnit},
};
use data_types::{DeleteExpr, Op, Scalar, TimestampRange};
use super::*;
use crate::test_util::create_tombstone;
#[tokio::test]
async fn test_merge_batch_schema() {
// Merge schema of the batches
// The fields in the schema are sorted by column name
let batches = create_batches();
let merged_schema = (*merge_record_batch_schemas(&batches)).clone();
// Expected Arrow schema
let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![
arrow::datatypes::Field::new(
"dict",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
true,
),
arrow::datatypes::Field::new("int64", DataType::Int64, true),
arrow::datatypes::Field::new("string", DataType::Utf8, true),
arrow::datatypes::Field::new("bool", DataType::Boolean, true),
arrow::datatypes::Field::new(
"time",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false,
),
arrow::datatypes::Field::new("uint64", DataType::UInt64, false),
arrow::datatypes::Field::new("float64", DataType::Float64, true),
]));
let expected_schema = Schema::try_from(arrow_schema)
.unwrap()
.sort_fields_by_name();
assert_eq!(
expected_schema, merged_schema,
"\nExpected:\n{:#?}\nActual:\n{:#?}",
expected_schema, merged_schema
);
}
#[tokio::test]
async fn test_tombstones_to_delete_predicates() {
// create tombstones
let tombstones = vec![
create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"),
create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"),
];
// This new queryable batch will convert tombstone to delete predicates
let query_batch =
QueryableBatch::new("test_table".into(), PartitionId::new(0), vec![], tombstones);
let predicates = query_batch.delete_predicates();
let expected = vec![
Arc::new(DeletePredicate {
range: TimestampRange::new(100, 200),
exprs: vec![DeleteExpr {
column: String::from("temp"),
op: Op::Eq,
scalar: Scalar::I64(10),
}],
}),
Arc::new(DeletePredicate {
range: TimestampRange::new(100, 350),
exprs: vec![
DeleteExpr {
column: String::from("temp"),
op: Op::Ne,
scalar: Scalar::I64(10),
},
DeleteExpr {
column: String::from("city"),
op: Op::Eq,
scalar: Scalar::String(String::from(r#"Boston"#)),
},
],
}),
];
assert_eq!(expected, predicates);
}
// ----------------------------------------------------------------------------------------------
// Data for testing
// Create pure RecordBatches without knowledge of Influx datatype
fn create_batches() -> Vec<Arc<RecordBatch>> {
// Batch 1: <dict, i64, str, bool, time> & 3 rows
let dict_array: ArrayRef = Arc::new(
vec![Some("a"), None, Some("b")]
.into_iter()
.collect::<DictionaryArray<Int32Type>>(),
);
let int64_array: ArrayRef =
Arc::new([Some(-1), None, Some(2)].iter().collect::<Int64Array>());
let string_array: ArrayRef = Arc::new(
vec![Some("foo"), Some("and"), Some("bar")]
.into_iter()
.collect::<StringArray>(),
);
let bool_array: ArrayRef = Arc::new(
[Some(true), None, Some(false)]
.iter()
.collect::<BooleanArray>(),
);
let ts_array: ArrayRef = Arc::new(
[Some(150), Some(200), Some(1526823730000000000)]
.iter()
.collect::<TimestampNanosecondArray>(),
);
let batch1 = RecordBatch::try_from_iter_with_nullable(vec![
("dict", dict_array, true),
("int64", int64_array, true),
("string", string_array, true),
("bool", bool_array, true),
("time", ts_array, false), // not null
])
.unwrap();
// Batch 2: <dict, u64, f64, str, bool, time> & 2 rows
let dict_array: ArrayRef = Arc::new(
vec![None, Some("d")]
.into_iter()
.collect::<DictionaryArray<Int32Type>>(),
);
let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::<UInt64Array>()); // not null
let float64_array: ArrayRef =
Arc::new([Some(1.0), Some(2.0)].iter().collect::<Float64Array>());
let string_array: ArrayRef = Arc::new(
vec![Some("foo"), Some("bar")]
.into_iter()
.collect::<StringArray>(),
);
let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::<BooleanArray>());
let ts_array: ArrayRef = Arc::new(
[Some(100), Some(1626823730000000000)] // not null
.iter()
.collect::<TimestampNanosecondArray>(),
);
let batch2 = RecordBatch::try_from_iter_with_nullable(vec![
("dict", dict_array, true),
("uint64", uint64_array, false), // not null
("float64", float64_array, true),
("string", string_array, true),
("bool", bool_array, true),
("time", ts_array, false), // not null
])
.unwrap();
vec![Arc::new(batch1), Arc::new(batch2)]
}
}

View File

@ -30,8 +30,8 @@ use trace::ctx::SpanContext;
use write_summary::WriteSummary;
use crate::{
data::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream},
handler::IngestHandler,
querier_handler::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream},
};
/// This type is responsible for managing all gRPC services exposed by
@ -410,9 +410,6 @@ impl Stream for GetStream {
parquet_max_sequence_number: status
.parquet_max_sequence_number
.map(|x| x.get()),
tombstone_max_sequence_number: status
.tombstone_max_sequence_number
.map(|x| x.get()),
}),
};
prost::Message::encode(&app_metadata, &mut bytes)
@ -467,8 +464,9 @@ mod tests {
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::selection::Selection;
use crate::querier_handler::PartitionStatus;
use super::*;
use crate::data::partition::PartitionStatus;
#[tokio::test]
async fn test_get_stream_empty() {
@ -489,7 +487,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema }),
@ -502,7 +499,6 @@ mod tests {
partition_id: 1,
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
}),
@ -527,7 +523,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
Err(ArrowError::IoError("foo".into())),
@ -535,7 +530,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
],
@ -546,7 +540,6 @@ mod tests {
partition_id: 1,
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
}),

View File

@ -396,6 +396,12 @@ something clever.",
if let Some(delta) = duration_since_production {
// Update the TTBR metric before potentially sleeping.
self.time_to_be_readable.set(delta);
trace!(
kafka_topic=%self.topic_name,
shard_index=%self.shard_index,
delta=%delta.as_millis(),
"reporting TTBR for shard (ms)"
);
}
if should_pause {
@ -939,7 +945,7 @@ mod tests {
Ok(DmlOperation::Write(make_write("good_op", 2)))
]],
sink_rets = [
Err(crate::data::Error::TableNotPresent),
Err(crate::data::Error::NamespaceNotFound{namespace: "bananas".to_string() }),
Ok(true),
],
want_ttbr = 2,

View File

@ -17,7 +17,7 @@
//! [`LifecycleManager`]: crate::lifecycle::LifecycleManager
//! [`LifecycleHandle::can_resume_ingest()`]: crate::lifecycle::LifecycleHandle::can_resume_ingest()
pub mod handler;
pub(crate) mod handler;
mod periodic_watermark_fetcher;
mod sink;
@ -25,8 +25,8 @@ mod sink;
pub mod mock_sink;
#[cfg(test)]
pub mod mock_watermark_fetcher;
pub mod sink_adaptor;
pub mod sink_instrumentation;
pub(crate) mod sink_adaptor;
pub(crate) mod sink_instrumentation;
pub use periodic_watermark_fetcher::*;
pub use sink::*;
pub(crate) use periodic_watermark_fetcher::*;
pub(crate) use sink::*;

View File

@ -24,7 +24,7 @@ use super::sink_instrumentation::WatermarkFetcher;
/// Emits an error metric named `write_buffer_watermark_fetch_errors` that
/// increments once per fetch error.
#[derive(Debug)]
pub struct PeriodicWatermarkFetcher {
pub(crate) struct PeriodicWatermarkFetcher {
last_watermark: Arc<AtomicI64>,
poll_handle: JoinHandle<()>,
}

View File

@ -5,7 +5,7 @@ use dml::DmlOperation;
/// A [`DmlSink`] handles [`DmlOperation`] instances read from a shard.
#[async_trait]
pub trait DmlSink: Debug + Send + Sync {
pub(crate) trait DmlSink: Debug + Send + Sync {
/// Apply `op` read from a shard, returning `Ok(true)` if ingest should
/// be paused.
async fn apply(&self, op: DmlOperation) -> Result<bool, crate::data::Error>;

View File

@ -414,11 +414,13 @@ mod tests {
let got = test(
op,
&metrics,
Err(crate::data::Error::TableNotPresent),
Err(crate::data::Error::NamespaceNotFound {
namespace: "bananas".to_string(),
}),
Some(12345),
)
.await;
assert_matches!(got, Err(crate::data::Error::TableNotPresent));
assert_matches!(got, Err(crate::data::Error::NamespaceNotFound { .. }));
// Validate the various write buffer metrics
assert_matches!(

View File

@ -9,17 +9,16 @@ use arrow::record_batch::RecordBatch;
use arrow_util::assert_batches_eq;
use bitflags::bitflags;
use data_types::{
CompactionLevel, NamespaceId, NonEmptyString, PartitionId, PartitionKey, Sequence,
SequenceNumber, ShardId, ShardIndex, TableId, Timestamp, Tombstone, TombstoneId,
CompactionLevel, NamespaceId, PartitionId, PartitionKey, Sequence, SequenceNumber, ShardId,
ShardIndex, TableId,
};
use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite};
use dml::{DmlMeta, DmlOperation, DmlWrite};
use iox_catalog::{interface::Catalog, mem::MemCatalog};
use iox_query::test::{raw_data, TestChunk};
use iox_time::{SystemProvider, Time};
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
use parquet_file::metadata::IoxMetadata;
use predicate::delete_predicate::parse_delete_predicate;
use schema::sort::SortKey;
use uuid::Uuid;
@ -28,31 +27,10 @@ use crate::{
partition::{resolver::CatalogPartitionResolver, PersistingBatch, SnapshotBatch},
IngesterData,
},
lifecycle::{LifecycleConfig, LifecycleHandle, LifecycleManager},
lifecycle::{LifecycleConfig, LifecycleManager},
query::QueryableBatch,
};
/// Create tombstone for testing
pub(crate) fn create_tombstone(
id: i64,
table_id: i64,
shard_id: i64,
seq_num: i64,
min_time: i64,
max_time: i64,
predicate: &str,
) -> Tombstone {
Tombstone {
id: TombstoneId::new(id),
table_id: TableId::new(table_id),
shard_id: ShardId::new(shard_id),
sequence_number: SequenceNumber::new(seq_num),
min_time: Timestamp::new(min_time),
max_time: Timestamp::new(max_time),
serialized_predicate: predicate.to_string(),
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn make_meta(
object_store_id: Uuid,
@ -93,15 +71,8 @@ pub(crate) fn make_persisting_batch(
partition_id: i64,
object_store_id: Uuid,
batches: Vec<Arc<RecordBatch>>,
tombstones: Vec<Tombstone>,
) -> Arc<PersistingBatch> {
let queryable_batch = make_queryable_batch_with_deletes(
table_name,
partition_id,
seq_num_start,
batches,
tombstones,
);
let queryable_batch = make_queryable_batch(table_name, partition_id, seq_num_start, batches);
Arc::new(PersistingBatch {
shard_id: ShardId::new(shard_id),
table_id: TableId::new(table_id),
@ -116,16 +87,6 @@ pub(crate) fn make_queryable_batch(
partition_id: i64,
seq_num_start: i64,
batches: Vec<Arc<RecordBatch>>,
) -> Arc<QueryableBatch> {
make_queryable_batch_with_deletes(table_name, partition_id, seq_num_start, batches, vec![])
}
pub(crate) fn make_queryable_batch_with_deletes(
table_name: &str,
partition_id: i64,
seq_num_start: i64,
batches: Vec<Arc<RecordBatch>>,
tombstones: Vec<Tombstone>,
) -> Arc<QueryableBatch> {
// make snapshots for the batches
let mut snapshots = vec![];
@ -140,7 +101,6 @@ pub(crate) fn make_queryable_batch_with_deletes(
table_name.into(),
PartitionId::new(partition_id),
snapshots,
tombstones,
))
}
@ -655,65 +615,24 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation)
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.namespace(&TEST_NAMESPACE.into())
.unwrap()
.snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1))
.await;
} else if loc.contains(DataLocation::SNAPSHOT) {
// move partition 1 data to snapshot
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.namespace(&TEST_NAMESPACE.into())
.unwrap()
.snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1))
.await;
}
ingester
}
pub(crate) async fn make_ingester_data_with_tombstones(loc: DataLocation) -> IngesterData {
// Whatever data because they won't be used in the tests
let metrics: Arc<metric::Registry> = Default::default();
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
let object_store = Arc::new(InMemory::new());
let exec = Arc::new(iox_query::exec::Executor::new(1));
let lifecycle = LifecycleManager::new(
LifecycleConfig::new(
200_000_000,
100_000_000,
100_000_000,
Duration::from_secs(100_000_000),
Duration::from_secs(100_000_000),
100_000_000,
),
Arc::clone(&metrics),
Arc::new(SystemProvider::default()),
);
// Make data for one shard and two tables
let shard_index = ShardIndex::new(0);
let (shard_id, _, _) =
populate_catalog(&*catalog, shard_index, TEST_NAMESPACE, TEST_TABLE).await;
let ingester = IngesterData::new(
object_store,
Arc::clone(&catalog),
[(shard_id, shard_index)],
exec,
Arc::new(CatalogPartitionResolver::new(catalog)),
backoff::BackoffConfig::default(),
metrics,
);
// Make partitions per requested
make_one_partition_with_tombstones(&ingester, &lifecycle.handle(), loc, shard_index, shard_id)
.await;
ingester
}
/// Make data for one or two partitions per requested
pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> Vec<DmlOperation> {
// In-memory data includes these rows but split between 4 groups go into
@ -783,133 +702,6 @@ pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) ->
ops
}
/// Make data for one partition with tombstones
async fn make_one_partition_with_tombstones(
ingester: &IngesterData,
lifecycle_handle: &dyn LifecycleHandle,
loc: DataLocation,
shard_index: ShardIndex,
shard_id: ShardId,
) {
// In-memory data includes these rows but split between 4 groups go into
// different batches of parittion 1 or partittion 2 as requeted
// let expected = vec![
// "+------------+-----+------+--------------------------------+",
// "| city | day | temp | time |",
// "+------------+-----+------+--------------------------------+",
// "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", // in group 1 - seq_num: 2
// "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", // in group 2 - seq_num: 3
// "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", // in group 1 - seq_num: 1 --> will get deleted
// "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", // in group 3 - seq_num: 5 --> will get deleted
// "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 8 (after the tombstone's seq num)
// "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4
// "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 9
// "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6
// "+------------+-----+------+--------------------------------+",
// ];
let (ops, seq_num) =
make_first_partition_data(&PartitionKey::from(TEST_PARTITION_1), shard_index);
// Apply all ops
for op in ops {
ingester
.buffer_operation(shard_id, op, lifecycle_handle)
.await
.unwrap();
}
if loc.contains(DataLocation::PERSISTING) {
// Move partition 1 data to persisting
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.unwrap()
.snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.await;
} else if loc.contains(DataLocation::SNAPSHOT) {
// move partition 1 data to snapshot
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.unwrap()
.snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.await;
}
// Add tombstones
// Depending on where the existing data is, they (buffer & snapshot) will be either moved to a new snapshot after
// applying the tombstone or (persisting) stay where they are and the tombstones is kept to get applied later
// ------------------------------------------
// Delete
let mut seq_num = seq_num.get();
seq_num += 1;
let delete = parse_delete_predicate(
"1970-01-01T00:00:00.000000010Z",
"1970-01-01T00:00:00.000000050Z",
"city=Boston",
)
.unwrap();
ingester
.buffer_operation(
shard_id,
DmlOperation::Delete(DmlDelete::new(
TEST_NAMESPACE.to_string(),
delete,
NonEmptyString::new(TEST_TABLE),
DmlMeta::sequenced(
Sequence {
shard_index,
sequence_number: SequenceNumber::new(seq_num),
},
Time::MIN,
None,
42,
),
)),
lifecycle_handle,
)
.await
.unwrap();
// Group 4: in buffer of p1 after the tombstone
ingester
.buffer_operation(
shard_id,
DmlOperation::Write(make_write_op(
&PartitionKey::from(TEST_PARTITION_1),
shard_index,
TEST_NAMESPACE,
seq_num,
r#"test_table,city=Medford day="sun",temp=55 22"#,
)),
lifecycle_handle,
)
.await
.unwrap();
seq_num += 1;
ingester
.buffer_operation(
shard_id,
DmlOperation::Write(make_write_op(
&PartitionKey::from(TEST_PARTITION_1),
shard_index,
TEST_NAMESPACE,
seq_num,
r#"test_table,city=Reading day="mon",temp=58 40"#,
)),
lifecycle_handle,
)
.await
.unwrap();
}
pub(crate) fn make_write_op(
partition_key: &PartitionKey,
shard_index: ShardIndex,

View File

@ -463,7 +463,10 @@ pub trait PartitionRepo: Send + Sync {
partition_id: PartitionId,
) -> Result<Option<PartitionInfo>>;
/// Update the sort key for the partition
/// Update the sort key for the partition.
///
/// NOTE: it is expected that ONLY the ingesters update sort keys for
/// existing partitions.
async fn update_sort_key(
&mut self,
partition_id: PartitionId,

View File

@ -1878,7 +1878,7 @@ LIMIT $4;
sqlx::query_as::<_, PartitionParam>(
r#"
SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id,
parquet_file.table_id,
parquet_file.table_id,
count(case when to_delete is null then 1 end) total_count,
max(case when compaction_level= $4 then parquet_file.created_at end)
FROM parquet_file

View File

@ -11,7 +11,7 @@ chrono = { version = "0.4", default-features = false }
chrono-english = "0.1.4"
clap = { version = "4", features = ["derive", "env", "cargo"] }
futures = "0.3"
handlebars = "4.3.4"
handlebars = "4.3.5"
humantime = "2.1.0"
influxdb2_client = { path = "../influxdb2_client" }
itertools = "0.10.5"
@ -22,7 +22,7 @@ rand = { version = "0.8.3", features = ["small_rng"] }
regex = "1.6"
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
toml = "0.5.9"

View File

@ -762,7 +762,7 @@ mod tests {
.unwrap();
// Input has one row that has no value (NULL value) for tag_b, which is its own series
let input = stream_from_batch(batch);
let input = stream_from_batch(batch.schema(), batch);
let table_name = "foo";
let tag_columns = ["tag_a", "tag_b"];
@ -873,7 +873,8 @@ mod tests {
.collect();
// stream from those batches
stream_from_batches(batches)
assert!(!batches.is_empty());
stream_from_batches(batches[0].schema(), batches)
})
.collect()
}

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@ use async_trait::async_trait;
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use exec::{stringset::StringSet, IOxSessionContext};
use hashbrown::HashMap;
use observability_deps::tracing::{debug, trace};
@ -141,9 +141,6 @@ impl Drop for QueryCompletedToken {
/// This avoids storing potentially large strings
pub type QueryText = Box<dyn std::fmt::Display + Send + Sync>;
/// Error type for [`QueryDatabase`] operations.
pub type QueryDatabaseError = Box<dyn std::error::Error + Send + Sync + 'static>;
/// A `Database` is the main trait implemented by the IOx subsystems
/// that store actual data.
///
@ -154,12 +151,15 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
/// Returns a set of chunks within the partition with data that may match
/// the provided predicate. If possible, chunks which have no rows that can
/// possibly match the predicate may be omitted.
/// If projection is None, returned chunks will include all columns of its original data. Otherwise,
/// returned chunks will includs PK columns (tags and time) and columns specified in the projection.
async fn chunks(
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError>;
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError>;
/// Record that particular type of query was run / planned
fn record_query(
@ -175,9 +175,6 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
fn as_meta(&self) -> &dyn QueryDatabaseMeta;
}
/// Error type for [`QueryChunk`] operations.
pub type QueryChunkError = Box<dyn std::error::Error + Send + Sync + 'static>;
/// Collection of data that shares the same partition key
pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
/// returns the Id of this chunk. Ids are unique within a
@ -200,7 +197,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
fn apply_predicate_to_metadata(
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, QueryChunkError> {
) -> Result<PredicateMatch, DataFusionError> {
Ok(self
.summary()
.map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow()))
@ -216,7 +213,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError>;
) -> Result<Option<StringSet>, DataFusionError>;
/// Return a set of Strings containing the distinct values in the
/// specified columns. If the predicate can be evaluated entirely
@ -228,7 +225,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError>;
) -> Result<Option<StringSet>, DataFusionError>;
/// Provides access to raw `QueryChunk` data as an
/// asynchronous stream of `RecordBatch`es filtered by a *required*
@ -248,7 +245,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError>;
) -> Result<SendableRecordBatchStream, DataFusionError>;
/// Returns chunk type. Useful in tests and debug logs.
fn chunk_type(&self) -> &str;

View File

@ -262,7 +262,7 @@ mod tests {
let batch = make_batch();
let output_schema = batch.schema();
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -291,7 +291,7 @@ mod tests {
Field::new("c", DataType::Utf8, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -321,7 +321,7 @@ mod tests {
Field::new("d", DataType::Float32, true),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -349,7 +349,7 @@ mod tests {
Field::new("c", DataType::Utf8, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics());
assert_contains!(
@ -368,7 +368,7 @@ mod tests {
Field::new("b", DataType::Int32, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics());
assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which had type 'Float32'");

View File

@ -8,8 +8,8 @@ use crate::{
stringset::{StringSet, StringSetRef},
ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext,
},
Predicate, PredicateMatch, QueryChunk, QueryChunkError, QueryChunkMeta, QueryCompletedToken,
QueryDatabase, QueryDatabaseError, QueryText,
Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase,
QueryText,
};
use arrow::{
array::{
@ -24,7 +24,7 @@ use data_types::{
ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues,
Statistics, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion_util::stream_from_batches;
use futures::StreamExt;
use hashbrown::HashSet;
@ -108,18 +108,54 @@ impl QueryDatabase for TestDatabase {
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
_ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError> {
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
// save last predicate
*self.chunks_predicate.lock() = predicate.clone();
let partitions = self.partitions.lock();
Ok(partitions
let partitions = self.partitions.lock().clone();
let chunks = partitions
.values()
.flat_map(|x| x.values())
.filter(|x| x.table_name == table_name)
.map(|x| Arc::clone(x) as _)
.collect())
.map(|x| Arc::clone(x) as Arc<dyn QueryChunk>)
.collect::<Vec<_>>();
// Return chunks with fewer columns if a projection is specified
let mut new_chunks = Vec::with_capacity(chunks.len());
for c in chunks {
let schema = c.schema();
let cols = schema.select_given_and_pk_columns(projection);
let cols = cols.iter().map(|c| c.as_str()).collect::<Vec<_>>();
let selection = Selection::Some(&cols);
let read_result =
c.read_filter(IOxSessionContext::with_testing(), predicate, selection);
if read_result.is_err() {
return Err(read_result.err().unwrap());
}
let mut stream = read_result.unwrap();
let mut new_chunk = TestChunk::new(c.table_name());
while let Some(b) = stream.next().await {
let b = b.expect("Error in stream");
new_chunk.table_data.push(Arc::new(b));
}
let new_chunk = if !new_chunk.table_data.is_empty() {
let new_schema = Schema::try_from(new_chunk.table_data[0].schema()).unwrap();
let new_chunk = new_chunk.add_schema_to_table(new_schema, true, None);
Arc::new(new_chunk) as _
} else {
// No data, return the original empty chunk with the original schema
c
};
new_chunks.push(new_chunk);
}
Ok(new_chunks)
}
fn record_query(
@ -327,9 +363,9 @@ impl TestChunk {
}
/// Checks the saved error, and returns it if any, otherwise returns OK
fn check_error(&self) -> Result<(), QueryChunkError> {
fn check_error(&self) -> Result<(), DataFusionError> {
if let Some(message) = self.saved_error.as_ref() {
Err(message.clone().into())
Err(DataFusionError::External(message.clone().into()))
} else {
Ok(())
}
@ -509,12 +545,8 @@ impl TestChunk {
mut self,
new_column_schema: Schema,
add_column_summary: bool,
stats: Option<Statistics>,
input_stats: Option<Statistics>,
) -> Self {
// assume the new schema has exactly a single table
assert_eq!(new_column_schema.len(), 1);
let (col_type, new_field) = new_column_schema.field(0);
let mut merger = SchemaMerger::new();
merger = merger.merge(&new_column_schema).unwrap();
merger = merger
@ -522,34 +554,38 @@ impl TestChunk {
.expect("merging was successful");
self.schema = merger.build();
if add_column_summary {
let influxdb_type = col_type.map(|t| match t {
InfluxColumnType::Tag => InfluxDbType::Tag,
InfluxColumnType::Field(_) => InfluxDbType::Field,
InfluxColumnType::Timestamp => InfluxDbType::Timestamp,
});
for i in 0..new_column_schema.len() {
let (col_type, new_field) = new_column_schema.field(i);
if add_column_summary {
let influxdb_type = col_type.map(|t| match t {
InfluxColumnType::Tag => InfluxDbType::Tag,
InfluxColumnType::Field(_) => InfluxDbType::Field,
InfluxColumnType::Timestamp => InfluxDbType::Timestamp,
});
let stats = stats.unwrap_or_else(|| match new_field.data_type() {
DataType::Boolean => Statistics::Bool(StatValues::default()),
DataType::Int64 => Statistics::I64(StatValues::default()),
DataType::UInt64 => Statistics::U64(StatValues::default()),
DataType::Utf8 => Statistics::String(StatValues::default()),
DataType::Dictionary(_, value_type) => {
assert!(matches!(**value_type, DataType::Utf8));
Statistics::String(StatValues::default())
}
DataType::Float64 => Statistics::F64(StatValues::default()),
DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()),
_ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()),
});
let stats = input_stats.clone();
let stats = stats.unwrap_or_else(|| match new_field.data_type() {
DataType::Boolean => Statistics::Bool(StatValues::default()),
DataType::Int64 => Statistics::I64(StatValues::default()),
DataType::UInt64 => Statistics::U64(StatValues::default()),
DataType::Utf8 => Statistics::String(StatValues::default()),
DataType::Dictionary(_, value_type) => {
assert!(matches!(**value_type, DataType::Utf8));
Statistics::String(StatValues::default())
}
DataType::Float64 => Statistics::F64(StatValues::default()),
DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()),
_ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()),
});
let column_summary = ColumnSummary {
name: new_field.name().clone(),
influxdb_type,
stats,
};
let column_summary = ColumnSummary {
name: new_field.name().clone(),
influxdb_type,
stats,
};
self.table_summary.columns.push(column_summary);
self.table_summary.columns.push(column_summary);
}
}
self
@ -921,13 +957,17 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
self.check_error()?;
// save the predicate
self.predicates.lock().push(predicate.clone());
let batches = match self.schema.df_projection(selection)? {
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.table_data.clone(),
Some(projection) => self
.table_data
@ -938,7 +978,8 @@ impl QueryChunk for TestChunk {
})
.collect::<std::result::Result<Vec<_>, ArrowError>>()?,
};
Ok(stream_from_batches(batches))
Ok(stream_from_batches(self.schema().as_arrow(), batches))
}
fn chunk_type(&self) -> &str {
@ -948,7 +989,7 @@ impl QueryChunk for TestChunk {
fn apply_predicate_to_metadata(
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, QueryChunkError> {
) -> Result<PredicateMatch, DataFusionError> {
self.check_error()?;
// save the predicate
@ -967,7 +1008,7 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// Model not being able to get column values from metadata
Ok(None)
}
@ -977,7 +1018,7 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
self.check_error()?;
// save the predicate

View File

@ -14,7 +14,7 @@ iox_catalog = { path = "../iox_catalog" }
iox_time = { path = "../iox_time" }
metric = { path = "../metric" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
once_cell = { version = "1.15.0", features = ["parking_lot"] }
parquet_file = { path = "../parquet_file" }

View File

@ -40,7 +40,7 @@ log = "0.4"
parking_lot = "0.12"
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
serde_urlencoded = "0.7.0"
snafu = "0.7"
tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }

View File

@ -15,7 +15,7 @@ iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
iox_query = { path = "../iox_query" }
object_store = "0.5.0"
object_store = "0.5.1"
iox_time = { path = "../iox_time" }
trace = { path = "../trace" }

View File

@ -11,7 +11,7 @@ ingester = { path = "../ingester" }
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
iox_query = { path = "../iox_query" }
trace = { path = "../trace" }
write_buffer = { path = "../write_buffer" }

View File

@ -11,7 +11,7 @@ generated_types = { path = "../generated_types" }
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
querier = { path = "../querier" }
iox_query = { path = "../iox_query" }
router = { path = "../router" }

View File

@ -11,7 +11,7 @@ iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
mutable_batch = { path = "../mutable_batch" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
router = { path = "../router" }
sharder = { path = "../sharder" }

View File

@ -10,7 +10,7 @@ bytes = "1.2"
futures = "0.3"
iox_time = { version = "0.1.0", path = "../iox_time" }
metric = { version = "0.1.0", path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
pin-project = "1.0.12"
tokio = { version = "1.21", features = ["io-util"] }
workspace-hack = { path = "../workspace-hack" }

View File

@ -14,7 +14,7 @@ datafusion_util = { path = "../datafusion_util" }
futures = "0.3"
generated_types = { path = "../generated_types" }
iox_time = { path = "../iox_time" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet = {version = "23.0.0", features = ["experimental"]}

View File

@ -10,7 +10,7 @@ datafusion = { path = "../datafusion" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
futures = {version = "0.3"}
num_cpus = "1.13.1"
object_store = { version = "0.5.0" }
object_store = { version = "0.5.1" }
parquet_file = { path = "../parquet_file" }
schema = { path = "../schema" }
tokio = "1.0"

View File

@ -13,9 +13,9 @@ itertools = "0.10"
observability_deps = { path = "../observability_deps" }
query_functions = { path = "../query_functions"}
schema = { path = "../schema" }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
sqlparser = "0.24.0"
sqlparser = "0.25.0"
workspace-hack = { path = "../workspace-hack"}
[dev-dependencies]

View File

@ -12,7 +12,6 @@
pub mod delete_expr;
pub mod delete_predicate;
pub mod rewrite;
pub mod rpc_predicate;
use arrow::{

View File

@ -1,19 +1,23 @@
mod column_rewrite;
mod field_rewrite;
mod measurement_rewrite;
mod rewrite;
mod value_rewrite;
use crate::{rewrite, Predicate};
use crate::Predicate;
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::lit;
use datafusion::logical_plan::{
Column, Expr, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo,
Column, Expr, ExprRewritable, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo,
};
use observability_deps::tracing::{debug, trace};
use schema::Schema;
use std::collections::BTreeSet;
use std::sync::Arc;
use self::column_rewrite::MissingColumnRewriter;
use self::field_rewrite::FieldProjectionRewriter;
use self::measurement_rewrite::rewrite_measurement_references;
use self::value_rewrite::rewrite_field_value_references;
@ -187,6 +191,7 @@ fn normalize_predicate(
let mut predicate = predicate.clone();
let mut field_projections = FieldProjectionRewriter::new(Arc::clone(&schema));
let mut missing_columums = MissingColumnRewriter::new(Arc::clone(&schema));
let mut field_value_exprs = vec![];
@ -194,24 +199,38 @@ fn normalize_predicate(
.exprs
.into_iter()
.map(|e| {
rewrite_measurement_references(table_name, e)
debug!(?e, "rewriting expr");
let e = rewrite_measurement_references(table_name, e)
.map(|e| log_rewrite(e, "rewrite_measurement_references"))
// Rewrite any references to `_value = some_value` to literal true values.
// Keeps track of these expressions, which can then be used to
// augment field projections with conditions using `CASE` statements.
.and_then(|e| rewrite_field_value_references(&mut field_value_exprs, e))
.map(|e| log_rewrite(e, "rewrite_field_value_references"))
// Rewrite any references to `_field` with a literal
// and keep track of referenced field names to add to
// the field column projection set.
.and_then(|e| field_projections.rewrite_field_exprs(e))
.map(|e| log_rewrite(e, "field_projections"))
// remove references to columns that don't exist in this schema
.and_then(|e| e.rewrite(&mut missing_columums))
.map(|e| log_rewrite(e, "missing_columums"))
// apply IOx specific rewrites (that unlock other simplifications)
.and_then(rewrite::rewrite)
// Call the core DataFusion simplification logic
.map(|e| log_rewrite(e, "rewrite"))
// Call DataFusion simplification logic
.and_then(|e| {
let adapter = SimplifyAdapter::new(schema.as_ref());
// simplify twice to ensure "full" cleanup
e.simplify(&adapter)?.simplify(&adapter)
})
.map(|e| log_rewrite(e, "simplify_expr"))
.and_then(rewrite::simplify_predicate)
.map(|e| log_rewrite(e, "simplify_expr"));
debug!(?e, "rewritten expr");
e
})
// Filter out literal true so is_empty works correctly
.filter(|f| match f {
@ -227,6 +246,11 @@ fn normalize_predicate(
field_projections.add_to_predicate(predicate)
}
fn log_rewrite(expr: Expr, description: &str) -> Expr {
trace!(?expr, %description, "After rewrite");
expr
}
struct SimplifyAdapter<'a> {
schema: &'a Schema,
execution_props: ExecutionProps,
@ -290,9 +314,27 @@ mod tests {
use super::*;
use arrow::datatypes::DataType;
use datafusion::logical_plan::{col, lit};
use datafusion::{
logical_plan::{col, lit},
scalar::ScalarValue,
};
use test_helpers::assert_contains;
#[test]
fn test_normalize_predicate_coerced() {
let schema = schema();
let predicate = normalize_predicate(
"table",
Arc::clone(&schema),
&Predicate::new().with_expr(col("t1").eq(lit("f1"))),
)
.unwrap();
let expected = Predicate::new().with_expr(col("t1").eq(lit("f1")));
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_rewrite() {
let predicate = normalize_predicate(
@ -336,6 +378,20 @@ mod tests {
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_non_tag() {
// A predicate on a column not present in the schema should simplify
// to a NULL boolean literal (see expected value below)
let predicate = normalize_predicate(
"table",
schema(),
&Predicate::new().with_expr(col("not_a_tag").eq(lit("blarg"))),
)
.unwrap();
let expected = Predicate::new().with_expr(lit(ScalarValue::Boolean(None)));
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_rewrite_multi_field_unsupported() {
let err = normalize_predicate(

View File

@ -0,0 +1,99 @@
use std::sync::Arc;
use datafusion::{
error::Result as DataFusionResult, logical_plan::ExprRewriter, prelude::*, scalar::ScalarValue,
};
use schema::Schema;
/// Logic for rewriting expressions from influxrpc that reference non
/// existent columns to NULL.
///
/// Applied via [`ExprRewriter::mutate`]: any `Expr::Column` whose name is not
/// found in `schema` is replaced with a NULL (string) literal.
#[derive(Debug)]
pub(crate) struct MissingColumnRewriter {
    /// The input schema; column references are resolved against it
    schema: Arc<Schema>,
}
impl MissingColumnRewriter {
    /// Create a new [`MissingColumnRewriter`] targeting the given schema
    pub(crate) fn new(schema: Arc<Schema>) -> Self {
        Self { schema }
    }

    /// Returns `true` if `col` names a column present in the target schema.
    ///
    /// Wrapped in `DataFusionResult` so callers can chain with `?`; the lookup
    /// itself never fails, but qualified references are (for now) rejected by
    /// an assertion.
    fn column_exists(&self, col: &Column) -> DataFusionResult<bool> {
        // todo a real error here (rpc_predicates shouldn't have table/relation qualifiers)
        assert!(col.relation.is_none());

        // Map the lookup result directly to a bool rather than the
        // non-idiomatic `if found { Ok(true) } else { Ok(false) }`.
        Ok(self.schema.find_index_of(&col.name).is_some())
    }
}
/// Returns a NULL literal expression used to stand in for missing columns.
///
/// NOTE(review): this is a NULL *string* (`Utf8(None)`); no type inference is
/// performed for the replaced column — see the comment in the tests below.
fn lit_null() -> Expr {
    lit(ScalarValue::Utf8(None))
}
impl ExprRewriter for MissingColumnRewriter {
    /// Replace any reference to a column absent from the schema with a NULL
    /// literal; every other expression passes through unchanged.
    fn mutate(&mut self, expr: Expr) -> DataFusionResult<Expr> {
        if let Expr::Column(col) = &expr {
            if !self.column_exists(col)? {
                return Ok(lit_null());
            }
        }
        Ok(expr)
    }
}
#[cfg(test)]
mod tests {
    use datafusion::{arrow::datatypes::DataType, logical_plan::ExprRewritable};
    use schema::SchemaBuilder;

    use super::*;

    #[test]
    fn all_columns_defined_no_rewrite() {
        // t1 = "foo"
        let expr = col("t1").eq(lit("foo"));
        assert_eq!(rewrite(expr.clone()), expr);

        // f1 > 1.0
        let expr = col("f1").gt(lit(1.0));
        assert_eq!(rewrite(expr.clone()), expr);
    }

    #[test]
    fn all_columns_not_defined() {
        // non_defined = "foo" --> NULL = "foo"
        let expr = col("non_defined").eq(lit("foo"));
        let expected = lit_null().eq(lit("foo"));
        assert_eq!(rewrite(expr), expected);

        // non_defined = 1.4 --> NULL = 1.4
        let expr = col("non_defined").eq(lit(1.4));
        // No type is inferred so this is a literal null string (even though it
        // arguably should be a literal float)
        let expected = lit_null().eq(lit(1.4));
        assert_eq!(rewrite(expr), expected);
    }

    #[test]
    fn some_columns_not_defined() {
        // t1 = "foo" AND non_defined = "bar" --> t1 = "foo" and NULL = "bar"
        let expr = col("t1")
            .eq(lit("foo"))
            .and(col("non_defined").eq(lit("bar")));
        let expected = col("t1").eq(lit("foo")).and(lit_null().eq(lit("bar")));
        assert_eq!(rewrite(expr), expected);
    }

    /// Test helper: applies a [`MissingColumnRewriter`] over a fixed schema
    /// (tag `t1`, i64 field `f1`) and returns the rewritten expression.
    fn rewrite(expr: Expr) -> Expr {
        let schema = SchemaBuilder::new()
            .tag("t1")
            .field("f1", DataType::Int64)
            .build()
            .unwrap();

        let mut rewriter = MissingColumnRewriter::new(Arc::new(schema));
        expr.rewrite(&mut rewriter).unwrap()
    }
}

View File

@ -55,8 +55,8 @@ impl FieldProjectionRewriter {
}
}
// Rewrites the predicate. See the description on
// [`FieldProjectionRewriter`] for more details.
/// Rewrites the predicate. See the description on
/// [`FieldProjectionRewriter`] for more details.
pub(crate) fn rewrite_field_exprs(&mut self, expr: Expr) -> DataFusionResult<Expr> {
// for predicates like `A AND B AND C`
// rewrite `A`, `B` and `C` separately and put them back together

View File

@ -18,7 +18,7 @@ generated_types = { path = "../generated_types" }
influxdb_iox_client = { path = "../influxdb_iox_client" }
iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }

View File

@ -470,9 +470,9 @@ mod tests {
.into_iter()
.map(lp_to_record_batch)
.map(Arc::new)
.collect();
.collect::<Vec<_>>();
let stream = stream_from_batches(batches);
let stream = stream_from_batches(batches[0].schema(), batches);
let metric_registry = metric::Registry::new();

View File

@ -7,13 +7,16 @@ use arrow::{
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::{
stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream,
use datafusion::{
error::DataFusionError,
physical_plan::{
stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream,
},
};
use futures::{Stream, TryStreamExt};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::debug;
use predicate::Predicate;
@ -114,7 +117,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("projection", format!("{}", columns));
ctx.set_metadata("predicate", format!("{}", &predicate));
@ -161,10 +164,10 @@ impl QueryChunk for QuerierChunk {
None
}
Err(other) => {
return Err(Box::new(Error::RBChunk {
return Err(DataFusionError::External(Box::new(Error::RBChunk {
source: other,
chunk_id: self.id(),
}))
})))
}
};
@ -178,7 +181,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("column_name", column_name.to_string());
ctx.set_metadata("predicate", format!("{}", &predicate));
@ -205,11 +208,13 @@ impl QueryChunk for QuerierChunk {
};
ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate));
let mut values = rb_chunk.column_values(
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)?;
let mut values = rb_chunk
.column_values(
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// The InfluxRPC frontend only supports getting column values
// for one column at a time (this is a restriction on the Influx
@ -221,7 +226,8 @@ impl QueryChunk for QuerierChunk {
.context(ColumnNameNotFoundSnafu {
chunk_id: self.id(),
column_name,
})?;
})
.map_err(|e| DataFusionError::External(Box::new(e)))?;
ctx.set_metadata("output_values", values.len() as i64);
Ok(Some(values))
@ -234,7 +240,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
let span_recorder = SpanRecorder::new(
ctx.span()
.map(|span| span.child("QuerierChunk::read_filter")),

View File

@ -11,6 +11,7 @@ use data_types::{
ChunkId, ChunkOrder, IngesterMapping, PartitionId, SequenceNumber, ShardId, ShardIndex,
TableSummary, TimestampMinMax,
};
use datafusion::error::DataFusionError;
use datafusion_util::MemoryStream;
use futures::{stream::FuturesUnordered, TryStreamExt};
use generated_types::{
@ -24,7 +25,7 @@ use influxdb_iox_client::flight::{
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::compute_timenanosecond_min_max,
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use iox_time::{Time, TimeProvider};
use metric::{DurationHistogram, Metric};
@ -612,9 +613,7 @@ impl IngesterStreamDecoder {
partition_id,
shard_id,
status.parquet_max_sequence_number.map(SequenceNumber::new),
status
.tombstone_max_sequence_number
.map(SequenceNumber::new),
None,
partition_sort_key,
);
self.current_partition = Some(partition);
@ -1097,7 +1096,7 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
}
@ -1107,7 +1106,7 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
}
@ -1117,11 +1116,15 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<datafusion::physical_plan::SendableRecordBatchStream, QueryChunkError> {
) -> Result<datafusion::physical_plan::SendableRecordBatchStream, DataFusionError> {
trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data");
// Apply selection to in-memory batch
let batches = match self.schema.df_projection(selection)? {
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.batches.clone(),
Some(projection) => self
.batches
@ -1333,7 +1336,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
))],
@ -1389,7 +1391,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1399,7 +1400,6 @@ mod tests {
partition_id: 2,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1409,7 +1409,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1489,7 +1488,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(11),
tombstone_max_sequence_number: Some(12),
}),
},
)),
@ -1519,7 +1517,6 @@ mod tests {
partition_id: 2,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(21),
tombstone_max_sequence_number: Some(22),
}),
},
)),
@ -1544,7 +1541,6 @@ mod tests {
partition_id: 3,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(31),
tombstone_max_sequence_number: Some(32),
}),
},
)),
@ -1574,10 +1570,7 @@ mod tests {
p1.parquet_max_sequence_number,
Some(SequenceNumber::new(11))
);
assert_eq!(
p1.tombstone_max_sequence_number,
Some(SequenceNumber::new(12))
);
assert_eq!(p1.tombstone_max_sequence_number, None);
assert_eq!(p1.chunks.len(), 2);
assert_eq!(p1.chunks[0].schema().as_arrow(), schema_1_1);
assert_eq!(p1.chunks[0].batches.len(), 2);
@ -1594,10 +1587,7 @@ mod tests {
p2.parquet_max_sequence_number,
Some(SequenceNumber::new(21))
);
assert_eq!(
p2.tombstone_max_sequence_number,
Some(SequenceNumber::new(22))
);
assert_eq!(p2.tombstone_max_sequence_number, None);
assert_eq!(p2.chunks.len(), 1);
assert_eq!(p2.chunks[0].schema().as_arrow(), schema_2_1);
assert_eq!(p2.chunks[0].batches.len(), 1);
@ -1610,10 +1600,7 @@ mod tests {
p3.parquet_max_sequence_number,
Some(SequenceNumber::new(31))
);
assert_eq!(
p3.tombstone_max_sequence_number,
Some(SequenceNumber::new(32))
);
assert_eq!(p3.tombstone_max_sequence_number, None);
assert_eq!(p3.chunks.len(), 1);
assert_eq!(p3.chunks[0].schema().as_arrow(), schema_3_1);
assert_eq!(p3.chunks[0].batches.len(), 1);
@ -1733,7 +1720,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(11),
tombstone_max_sequence_number: Some(12),
}),
},
)),
@ -1773,10 +1759,7 @@ mod tests {
p1.parquet_max_sequence_number,
Some(SequenceNumber::new(11))
);
assert_eq!(
p1.tombstone_max_sequence_number,
Some(SequenceNumber::new(12))
);
assert_eq!(p1.tombstone_max_sequence_number, None);
assert_eq!(p1.chunks.len(), 1);
}

View File

@ -11,10 +11,11 @@ use data_types::NamespaceId;
use datafusion::{
catalog::{catalog::CatalogProvider, schema::SchemaProvider},
datasource::TableProvider,
error::DataFusionError,
};
use iox_query::{
exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext},
QueryChunk, QueryCompletedToken, QueryDatabase, QueryDatabaseError, QueryText, DEFAULT_SCHEMA,
QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA,
};
use observability_deps::tracing::{debug, trace};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};
@ -40,8 +41,9 @@ impl QueryDatabase for QuerierNamespace {
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError> {
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
debug!(%table_name, %predicate, "Finding chunks for table");
// get table metadata
let table = match self.tables.get(table_name).map(Arc::clone) {
@ -57,7 +59,7 @@ impl QueryDatabase for QuerierNamespace {
.chunks(
predicate,
ctx.span().map(|span| span.child("querier table chunks")),
&None, // todo: pushdown projection to chunks
projection,
)
.await?;
@ -627,7 +629,7 @@ mod tests {
.unwrap_err();
assert_eq!(
err.to_string(),
format!("Cannot build plan: External error: Chunk pruning failed: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."),
format!("Cannot build plan: Resources exhausted: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."),
);
}

View File

@ -8,6 +8,7 @@ use crate::{
IngesterConnection,
};
use data_types::{ColumnId, PartitionId, ShardIndex, TableId, TimestampMinMax};
use datafusion::error::DataFusionError;
use futures::{join, StreamExt};
use iox_query::pruning::prune_summaries;
use iox_query::{exec::Executor, provider, provider::ChunkPruner, QueryChunk};
@ -65,6 +66,17 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
impl From<Error> for DataFusionError {
fn from(err: Error) -> Self {
match err {
Error::ChunkPruning {
source: err @ provider::Error::TooMuchData { .. },
} => Self::ResourcesExhausted(err.to_string()),
_ => Self::External(Box::new(err) as _),
}
}
}
/// Args to create a [`QuerierTable`].
pub struct QuerierTableArgs {
pub sharder: Arc<JumpHash<Arc<ShardIndex>>>,

View File

@ -66,8 +66,7 @@ impl TableProvider for QuerierTable {
ctx.child_span("querier table chunks"),
projection,
)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
.await?;
for chunk in chunks {
builder = builder.add_chunk(chunk);

View File

@ -23,6 +23,7 @@ use crate::{
use self::interface::{IngesterPartitionInfo, ParquetFileInfo, TombstoneInfo};
#[derive(Snafu, Debug)]
#[allow(missing_copy_implementations)]
pub enum ReconcileError {
#[snafu(display("Compactor processed file that the querier would need to split apart which is not yet implemented"))]
CompactorConflict,

View File

@ -1,25 +0,0 @@
-- Test Setup: OneDeleteSimpleExprOneChunkDeleteAll
-- SQL: SELECT * from cpu;
++
++
-- SQL: SELECT time from cpu;
++
++
-- SQL: SELECT count(*), count(bar), count(time) from cpu;
+-----------------+----------------+-----------------+
| COUNT(UInt8(1)) | COUNT(cpu.bar) | COUNT(cpu.time) |
+-----------------+----------------+-----------------+
| 0 | 0 | 0 |
+-----------------+----------------+-----------------+
-- SQL: SELECT min(bar), max(bar), min(time), max(time) from cpu;
+--------------+--------------+---------------+---------------+
| MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+--------------+--------------+---------------+---------------+
| | | | |
+--------------+--------------+---------------+---------------+
-- SQL: SELECT max(bar) from cpu;
+--------------+
| MAX(cpu.bar) |
+--------------+
| |
+--------------+

View File

@ -1,17 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteSimpleExprOneChunkDeleteAll
-- select *
SELECT * from cpu;
-- select one specific column
SELECT time from cpu;
-- select aggregate of every column including star
SELECT count(*), count(bar), count(time) from cpu;
-- select aggregate of every column
SELECT min(bar), max(bar), min(time), max(time) from cpu;
-- select aggregate of one column
SELECT max(bar) from cpu;

View File

@ -1,207 +0,0 @@
-- Test Setup: OneDeleteMultiExprsOneChunk
-- SQL: SELECT * from cpu order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu order by time, bar;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000020Z | 2 |
| 1970-01-01T00:00:00.000000040Z | 1 |
+--------------------------------+-----+
-- SQL: SELECT bar from cpu order by bar;
+-----+
| bar |
+-----+
| 1 |
| 2 |
+-----+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 2 | 2 | 2 | 1 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000040Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 2 |
+-----------------+
-- SQL: SELECT count(foo) from cpu;
+----------------+
| COUNT(cpu.foo) |
+----------------+
| 2 |
+----------------+
-- SQL: SELECT count(bar) from cpu;
+----------------+
| COUNT(cpu.bar) |
+----------------+
| 2 |
+----------------+
-- SQL: SELECT count(*) from cpu;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 2 |
+-----------------+
-- SQL: SELECT min(bar) from cpu;
+--------------+
| MIN(cpu.bar) |
+--------------+
| 1 |
+--------------+
-- SQL: SELECT foo from cpu;
-- Results After Sorting
+-----+
| foo |
+-----+
| me |
| you |
+-----+
-- SQL: SELECT min(foo) as min_foo from cpu order by min_foo;
+---------+
| min_foo |
+---------+
| me |
+---------+
-- SQL: SELECT max(foo) as max_foo from cpu order by max_foo;
+---------+
| max_foo |
+---------+
| you |
+---------+
-- SQL: SELECT min(foo) as min_foo from cpu group by time order by min_foo;
+---------+
| min_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo;
+---------+
| max_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo;
+--------------------------------+---------+
| time | max_foo |
+--------------------------------+---------+
| 1970-01-01T00:00:00.000000020Z | you |
| 1970-01-01T00:00:00.000000040Z | me |
+--------------------------------+---------+
-- SQL: SELECT min(foo) as min_foo from cpu group by bar order by min_foo;
+---------+
| min_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo;
+-----+---------+
| bar | max_foo |
+-----+---------+
| 1 | me |
| 2 | you |
+-----+---------+
-- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo;
+---------+
| max_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT min(time) as min_time from cpu order by min_time;
+--------------------------------+
| min_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT min(time) as min_time from cpu group by bar order by min_time;
+--------------------------------+
| min_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time;
+-----+--------------------------------+
| bar | min_time |
+-----+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000040Z |
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu group by foo order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time;
+-----+--------------------------------+
| foo | max_time |
+-----+--------------------------------+
| me | 1970-01-01T00:00:00.000000040Z |
| you | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT time from cpu;
-- Results After Sorting
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT max(bar) from cpu order by 1;
+--------------+
| MAX(cpu.bar) |
+--------------+
| 2 |
+--------------+
-- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT foo from cpu where bar >= 1.0 order by foo;
+-----+
| foo |
+-----+
| me |
| you |
+-----+
-- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000040Z | 1 |
| 1970-01-01T00:00:00.000000020Z | 2 |
+--------------------------------+-----+
-- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma
+----+--------------------------------+
| mi | ma |
+----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+----+--------------------------------+

View File

@ -1,61 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteMultiExprsOneChunk
-- select *
SELECT * from cpu order by bar, foo, time;
SELECT time, bar from cpu order by time, bar;
SELECT bar from cpu order by bar;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
SELECT count(time) from cpu;
SELECT count(foo) from cpu;
SELECT count(bar) from cpu;
SELECT count(*) from cpu;
SELECT min(bar) from cpu;
-- IOX_COMPARE: sorted
SELECT foo from cpu;
SELECT min(foo) as min_foo from cpu order by min_foo;
SELECT max(foo) as max_foo from cpu order by max_foo;
SELECT min(foo) as min_foo from cpu group by time order by min_foo;
SELECT max(foo) as max_foo from cpu group by time order by max_foo;
SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo;
SELECT min(foo) as min_foo from cpu group by bar order by min_foo;
SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo;
SELECT max(foo) as max_foo from cpu group by time order by max_foo;
SELECT min(time) as min_time from cpu order by min_time;
SELECT max(time) as max_time from cpu order by max_time;
SELECT min(time) as min_time from cpu group by bar order by min_time;
SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time;
SELECT max(time) as max_time from cpu group by foo order by max_time;
SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time;
-- IOX_COMPARE: sorted
SELECT time from cpu;
SELECT max(bar) from cpu order by 1;
--------------------------------------------------------
-- With selection predicate
SELECT * from cpu where bar >= 1.0 order by bar, foo, time;
SELECT foo from cpu where bar >= 1.0 order by foo;
SELECT time, bar from cpu where bar >= 1.0 order by bar, time;
SELECT * from cpu where foo = 'you' order by bar, foo, time;
SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma

View File

@ -1,91 +0,0 @@
-- Test Setup: OneDeleteSimpleExprOneChunk
-- SQL: SELECT * from cpu;
+-----+--------------------------------+
| bar | time |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000020Z | 2 |
+--------------------------------+-----+
-- SQL: SELECT min(bar), max(bar) from cpu;
+--------------+--------------+
| MIN(cpu.bar) | MAX(cpu.bar) |
+--------------+--------------+
| 2 | 2 |
+--------------+--------------+
-- SQL: SELECT time from cpu;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT max(time) from cpu;
+--------------------------------+
| MAX(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT min(time) from cpu group by bar;
+--------------------------------+
| MIN(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT bar, min(time) from cpu group by bar;
+-----+--------------------------------+
| bar | MIN(cpu.time) |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT count(time), max(time) from cpu;
+-----------------+--------------------------------+
| COUNT(cpu.time) | MAX(cpu.time) |
+-----------------+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000020Z |
+-----------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 1 |
+-----------------+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT * from cpu where bar = 2.0;
+-----+--------------------------------+
| bar | time |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT * from cpu where bar != 2.0;
++
++
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0;
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
| 0 | 0 | 0 | | | | |
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
-- SQL: SELECT time from cpu where bar=2;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT bar from cpu where bar!= 2;
++
++

View File

@ -1,37 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteSimpleExprOneChunk
-- select *
SELECT * from cpu;
SELECT time, bar from cpu;
SELECT min(bar), max(bar) from cpu;
SELECT time from cpu;
SELECT max(time) from cpu;
SELECT min(time) from cpu group by bar;
SELECT bar, min(time) from cpu group by bar;
SELECT count(time), max(time) from cpu;
SELECT count(time) from cpu;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
----------------------------------------------------------------
-- Now add selection predicate
SELECT * from cpu where bar = 2.0;
SELECT * from cpu where bar != 2.0;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0;
SELECT time from cpu where bar=2;
SELECT bar from cpu where bar!= 2;

View File

@ -1,85 +0,0 @@
-- Test Setup: ThreeDeleteThreeChunks
-- SQL: SELECT * from cpu order by foo, bar, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 1 | me | 1970-01-01T00:00:00.000000042Z |
| 1 | me | 1970-01-01T00:00:00.000000062Z |
| 4 | me | 1970-01-01T00:00:00.000000050Z |
| 5 | me | 1970-01-01T00:00:00.000000060Z |
| 7 | me | 1970-01-01T00:00:00.000000080Z |
| 3 | you | 1970-01-01T00:00:00.000000070Z |
+-----+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu order by bar, time;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000040Z | 1 |
| 1970-01-01T00:00:00.000000042Z | 1 |
| 1970-01-01T00:00:00.000000062Z | 1 |
| 1970-01-01T00:00:00.000000070Z | 3 |
| 1970-01-01T00:00:00.000000050Z | 4 |
| 1970-01-01T00:00:00.000000060Z | 5 |
| 1970-01-01T00:00:00.000000080Z | 7 |
+--------------------------------+-----+
-- SQL: SELECT bar from cpu order by bar;
+-----+
| bar |
+-----+
| 1 |
| 1 |
| 1 |
| 3 |
| 4 |
| 5 |
| 7 |
+-----+
-- SQL: SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat;
+---+---+---+----+--------------------------------+--------------------------------+
| t | c | b | mi | mt | mat |
+---+---+---+----+--------------------------------+--------------------------------+
| 7 | 7 | 7 | 1 | 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z |
+---+---+---+----+--------------------------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 7 |
+-----------------+
-- SQL: SELECT count(foo) from cpu;
+----------------+
| COUNT(cpu.foo) |
+----------------+
| 7 |
+----------------+
-- SQL: SELECT count(bar) from cpu;
+----------------+
| COUNT(cpu.bar) |
+----------------+
| 7 |
+----------------+
-- SQL: SELECT count(*) from cpu;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 7 |
+-----------------+
-- SQL: SELECT min(bar) from cpu;
+--------------+
| MIN(cpu.bar) |
+--------------+
| 1 |
+--------------+
-- SQL: SELECT foo from cpu order by foo;
+-----+
| foo |
+-----+
| me |
| me |
| me |
| me |
| me |
| me |
| you |
+-----+

View File

@ -1,23 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: ThreeDeleteThreeChunks
-- select *
SELECT * from cpu order by foo, bar, time;
SELECT time, bar from cpu order by bar, time;
SELECT bar from cpu order by bar;
SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat;
SELECT count(time) from cpu;
SELECT count(foo) from cpu;
SELECT count(bar) from cpu;
SELECT count(*) from cpu;
SELECT min(bar) from cpu;
SELECT foo from cpu order by foo;

View File

@ -1,77 +0,0 @@
-- Test Setup: ThreeDeleteThreeChunks
-- SQL: SELECT min(foo) from cpu;
+--------------+
| MIN(cpu.foo) |
+--------------+
| me |
+--------------+
-- SQL: SELECT max(foo) from cpu;
+--------------+
| MAX(cpu.foo) |
+--------------+
| you |
+--------------+
-- SQL: SELECT min(time) from cpu;
+--------------------------------+
| MIN(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT max(time) from cpu;
+--------------------------------+
| MAX(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT foo, min(time) from cpu group by foo;
-- Results After Sorting
+-----+--------------------------------+
| foo | MIN(cpu.time) |
+-----+--------------------------------+
| me | 1970-01-01T00:00:00.000000040Z |
| you | 1970-01-01T00:00:00.000000070Z |
+-----+--------------------------------+
-- SQL: SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time;
+-----+--------------------------------+
| bar | max_time |
+-----+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000062Z |
| 3 | 1970-01-01T00:00:00.000000070Z |
| 4 | 1970-01-01T00:00:00.000000050Z |
| 5 | 1970-01-01T00:00:00.000000060Z |
| 7 | 1970-01-01T00:00:00.000000080Z |
+-----+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu group by bar order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000050Z |
| 1970-01-01T00:00:00.000000060Z |
| 1970-01-01T00:00:00.000000062Z |
| 1970-01-01T00:00:00.000000070Z |
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT time from cpu order by time;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
| 1970-01-01T00:00:00.000000042Z |
| 1970-01-01T00:00:00.000000050Z |
| 1970-01-01T00:00:00.000000060Z |
| 1970-01-01T00:00:00.000000062Z |
| 1970-01-01T00:00:00.000000070Z |
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT max(bar) from cpu;
+--------------+
| MAX(cpu.bar) |
+--------------+
| 7 |
+--------------+
-- SQL: SELECT min(time), max(time) from cpu;
+--------------------------------+--------------------------------+
| MIN(cpu.time) | MAX(cpu.time) |
+--------------------------------+--------------------------------+
| 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z |
+--------------------------------+--------------------------------+

View File

@ -1,19 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: ThreeDeleteThreeChunks
SELECT min(foo) from cpu;
SELECT max(foo) from cpu;
SELECT min(time) from cpu;
SELECT max(time) from cpu;
-- IOX_COMPARE: sorted
SELECT foo, min(time) from cpu group by foo;
SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time;
SELECT max(time) as max_time from cpu group by bar order by max_time;
SELECT time from cpu order by time;
SELECT max(bar) from cpu;
SELECT min(time), max(time) from cpu;

Some files were not shown because too many files have changed in this diff Show More