diff --git a/Cargo.lock b/Cargo.lock index aefdbb86b7..92a4115b4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1050,7 +1050,7 @@ dependencies = [ "influxdb_line_protocol", "iox_time", "observability_deps", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "percent-encoding", "schema", "serde", @@ -1094,7 +1094,7 @@ dependencies = [ "log", "num_cpus", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parking_lot 0.12.1", "parquet", "paste", @@ -1116,7 +1116,7 @@ source = "git+https://github.com/apache/arrow-datafusion.git?rev=c7f3a70a79ee840 dependencies = [ "arrow", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parquet", "sqlparser 0.23.0", ] @@ -1163,7 +1163,7 @@ dependencies = [ "hashbrown", "lazy_static", "md-5", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "paste", "rand", "regex", @@ -1741,9 +1741,9 @@ dependencies = [ [[package]] name = "handlebars" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56b224eaa4987c03c30b251de7ef0c15a6a59f34222905850dbc3026dfb24d5f" +checksum = "433e4ab33f1213cdc25b5fa45c76881240cfe79284cf2b395e8b9e312a30a2fd" dependencies = [ "log", "pest", @@ -2061,7 +2061,9 @@ dependencies = [ "data_types", "datafusion 0.1.0", "dotenvy", + "flate2", "futures", + "futures-util", "generated_types", "hashbrown", "http", @@ -2126,12 +2128,13 @@ dependencies = [ "client_util", "futures-util", "generated_types", - "mockito", + "influxdb_line_protocol", "prost 0.11.0", "rand", "reqwest", "thiserror", "tokio", + "tokio-stream", "tonic", ] @@ -2182,7 +2185,7 @@ version = "0.1.0" dependencies = [ "generated_types", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "workspace-hack", ] @@ -2222,6 +2225,7 @@ dependencies = [ "pin-project", "predicate", "prost 0.11.0", + "rand", "schema", "snafu", "test_helpers", @@ -2681,9 +2685,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.134" +version = "0.2.135" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" [[package]] name = "libloading" @@ -3130,9 +3134,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2168fee79ee3e7695905bc3a48777d807f82d956f821186fa7a2601c1295a73e" +checksum = "56ce10a205d9f610ae3532943039c34c145930065ce0c4284134c897fe6073b1" dependencies = [ "async-trait", "base64", @@ -3142,7 +3146,7 @@ dependencies = [ "itertools", "parking_lot 0.12.1", "percent-encoding", - "quick-xml 0.24.1", + "quick-xml 0.25.0", "rand", "reqwest", "ring", @@ -3207,9 +3211,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a" +checksum = "129d36517b53c461acc6e1580aeb919c8ae6708a4b1eae61c4463a615d4f0411" dependencies = [ "num-traits", ] @@ -3581,7 +3585,7 @@ dependencies = [ "schema", "serde_json", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "test_helpers", "workspace-hack", ] @@ -3670,9 +3674,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ "unicode-ident", ] @@ -3942,9 +3946,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.24.1" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37dddbbe9df96afafcb8027fcf263971b726530e12f0787f620a7ba5b4846081" 
+checksum = "58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9" dependencies = [ "memchr", "serde", @@ -4412,9 +4416,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" dependencies = [ "itoa 1.0.3", "ryu", @@ -4669,15 +4673,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2" +checksum = "dd726aec4ebad65756394ff89a9b9598793d4e30121cd71690244c1e497b3aee" dependencies = [ "doc-comment", "snafu-derive", @@ -4685,9 +4689,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5" +checksum = "712529e9b0b014eabaa345b38e06032767e3dc393e8b017e853b1d7247094e74" dependencies = [ "heck", "proc-macro2", @@ -4748,9 +4752,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac9c312566fdfc45a38ecf1924013c82af2a7d5315e46f67b1cc987f12be260" +checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd" dependencies = [ "log", ] @@ -4953,9 +4957,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.101" +version = "1.0.102" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" dependencies = [ "proc-macro2", "quote", @@ -5228,9 +5232,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -5434,9 +5438,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", "log", @@ -5447,9 +5451,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -5458,9 +5462,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index 761e13140b..0104b2eee1 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -11,10 +11,10 @@ humantime = "2.1.0" iox_catalog = { path = 
"../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tempfile = "3.1.0" trace = { path = "../trace" } diff --git a/compactor/Cargo.toml b/compactor/Cargo.toml index 8a366ab903..7cb6a78574 100644 --- a/compactor/Cargo.toml +++ b/compactor/Cargo.toml @@ -14,7 +14,7 @@ datafusion = { path = "../datafusion" } futures = "0.3" iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } predicate = { path = "../predicate" } diff --git a/compactor/src/cold.rs b/compactor/src/cold.rs index 1eb3aad4ee..f4a59dcf65 100644 --- a/compactor/src/cold.rs +++ b/compactor/src/cold.rs @@ -45,7 +45,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::Initial, compact_in_parallel, - false, // no split + true, // split candidates.clone().into(), ) .await; @@ -57,7 +57,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::FileNonOverlapped, compact_in_parallel, - false, // don't split + true, // split candidates.into(), ) .await; @@ -812,24 +812,42 @@ mod tests { compact(compactor, true).await; - // Should have 1 non-soft-deleted file: + // Should have 2 non-soft-deleted file: // - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - the 2 level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 1, "{files:?}"); + assert_eq!(files.len(), 2, "{files:?}"); 
let files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) .collect(); // The initial files are: L0 1-4, L1 5-6. The first step of cold compaction took files 1-5 - // and compacted them into a l-1 file 7. The second step of cold compaction - // took 6 and 7 and combined them all into file 8. - assert_eq!(files_and_levels, vec![(8, CompactionLevel::Final)]); + // and compacted them into two l-1 files 7, 8. The second step of cold compaction + // took 6, 7, and 8 and combined them all into two files 9 and 10. + assert_eq!( + files_and_levels, + vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)] + ); // ------------------------------------------------ // Verify the parquet file content + // first file: + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", + "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // second file let file = files.pop().unwrap(); let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( @@ -847,9 +865,7 @@ mod tests { "| 20 | | VT | 20 | 1970-01-01T00:00:00.000026Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", - "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", - "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", "+-----------+------+------+------+--------------------------------+", ], &batches @@ -1027,14 +1043,14 @@ mod tests { compact(compactor, true).await; - // Should have 3 non-soft-deleted files: + // Should have 4 non-soft-deleted files: // // - pf4, the 
level 1 file untouched because it didn't fit in the memory budget // - pf6, the level 2 file untouched because it doesn't overlap anything - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - two level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 3, "{files:?}"); + assert_eq!(files.len(), 4, "{files:?}"); let files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) @@ -1042,20 +1058,35 @@ mod tests { // File 4 was L1 but didn't fit in the memory budget, so was untouched. // File 6 was already L2 and did not overlap with anything, so was untouched. - // Cold compaction took files 1, 2, 3, 5 and compacted them into file 7. + // Cold compaction took files 1, 2, 3, 5 and compacted them into 2 files 7 and 8. assert_eq!( files_and_levels, vec![ (4, CompactionLevel::FileNonOverlapped), (6, CompactionLevel::Final), (7, CompactionLevel::Final), + (8, CompactionLevel::Final), ] ); // ------------------------------------------------ // Verify the parquet file content - let file1 = files.pop().unwrap(); - let batches = table.read_parquet_file(file1).await; + // newly created L-2 with largest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // newly created L-2 with smallest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ 
"+-----------+------+------+------+--------------------------------+", @@ -1068,15 +1099,14 @@ mod tests { "| 1500 | WA | | | 1970-01-01T00:00:00.000008Z |", "| 1601 | | PA | 15 | 1970-01-01T00:00:00.000000009Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", - "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", "+-----------+------+------+------+--------------------------------+", ], &batches ); - - let file0 = files.pop().unwrap(); - let batches = table.read_parquet_file(file0).await; + // available L2 that does not overlap + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ "+-----------+------+------+-----------------------------+", @@ -1088,6 +1118,20 @@ mod tests { ], &batches ); + // available L1 that did not fit in the memory budget + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+-----------------------------+", + "| field_int | tag2 | tag3 | time |", + "+-----------+------+------+-----------------------------+", + "| 1600 | WA | 10 | 1970-01-01T00:00:00.000028Z |", + "| 20 | VT | 20 | 1970-01-01T00:00:00.000026Z |", + "+-----------+------+------+-----------------------------+", + ], + &batches + ); } struct TestDb { diff --git a/compactor/src/query.rs b/compactor/src/query.rs index ea6e219d4e..20a8d068cc 100644 --- a/compactor/src/query.rs +++ b/compactor/src/query.rs @@ -4,10 +4,10 @@ use data_types::{ ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber, TableSummary, Timestamp, TimestampMinMax, Tombstone, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use 
observability_deps::tracing::trace; use parquet_file::chunk::ParquetChunk; @@ -194,7 +194,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -208,7 +208,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -230,7 +230,7 @@ impl QueryChunk for QueryableParquetChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { ctx.set_metadata("storage", "compactor"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); @@ -238,7 +238,7 @@ impl QueryChunk for QueryableParquetChunk { self.data .read_filter(predicate, selection) .context(ReadParquetSnafu) - .map_err(|e| Box::new(e) as _) + .map_err(|e| DataFusionError::External(Box::new(e))) } /// Returns chunk type diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs index 75fd250dd0..38a9c8cd05 100644 --- a/datafusion_util/src/lib.rs +++ b/datafusion_util/src/lib.rs @@ -15,7 +15,7 @@ use datafusion::execution::context::TaskContext; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::common::SizedRecordBatchStream; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion::{ arrow::{ @@ -236,12 +236,19 @@ where } /// Create a SendableRecordBatchStream a RecordBatch -pub fn stream_from_batch(batch: RecordBatch) -> SendableRecordBatchStream { - stream_from_batches(vec![Arc::new(batch)]) +pub fn stream_from_batch(schema: Arc, batch: RecordBatch) -> SendableRecordBatchStream { + 
stream_from_batches(schema, vec![Arc::new(batch)]) } /// Create a SendableRecordBatchStream from Vec of RecordBatches with the same schema -pub fn stream_from_batches(batches: Vec>) -> SendableRecordBatchStream { +pub fn stream_from_batches( + schema: Arc, + batches: Vec>, +) -> SendableRecordBatchStream { + if batches.is_empty() { + return Box::pin(EmptyRecordBatchStream::new(schema)); + } + let dummy_metrics = ExecutionPlanMetricsSet::new(); let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0); let stream = SizedRecordBatchStream::new(batches[0].schema(), batches, mem_metrics); diff --git a/docs/underground_guide.md b/docs/underground_guide.md index 201dd5e44b..c087bcce88 100644 --- a/docs/underground_guide.md +++ b/docs/underground_guide.md @@ -15,17 +15,25 @@ developers. Build IOx for release with pprof: ```shell +cd influxdb_iox cargo build --release --features=pprof ``` -## Step 2: Start redpanda and postgres +You can also install the `influxdb_iox` command locally via -Now, start up redpanda and postgres locally in docker containers: +```shell +cd influxdb_iox +cargo install --path influxdb_iox +``` + +## Step 2: Start kafka and postgres + +Now, start up kafka and postgres locally in docker containers: ```shell # get rskafka from https://github.com/influxdata/rskafka cd rskafka -# Run redpanda on localhost:9010 -docker-compose -f docker-compose-redpanda.yml up & +# Run kafka on localhost:9010 +docker-compose -f docker-compose-kafka.yml up & # now run postgres docker run -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust postgres & ``` @@ -136,8 +144,8 @@ INFLUXDB_IOX_GRPC_BIND_ADDR=localhost:8084 \ INFLUXDB_IOX_WRITE_BUFFER_TYPE=kafka \ INFLUXDB_IOX_WRITE_BUFFER_ADDR=localhost:9010 \ xINFLUXDB_IOX_WRITE_BUFFER_AUTO_CREATE_TOPICS=10 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_START=0 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_START=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_END=0 \ 
INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES=5000000000 \ INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES=4000000000 \ INFLUXDB_IOX_CATALOG_DSN=postgres://postgres@localhost:5432/postgres \ @@ -151,6 +159,11 @@ LOG_FILTER=info \ # Step 5: Ingest data +You can load data using the influxdb_iox client: +```shell +influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp +``` + Now you can post data to `http://localhost:8080` with your favorite load generating tool My favorite is https://github.com/alamb/low_card @@ -171,3 +184,17 @@ posting fairly large requests (necessitating the # Step 6: Profile See [`profiling.md`](./profiling.md). + + +# Step 7: Clean up local state + +If you find yourself needing to clean up postgres / kafka state use these commands: +```shell +docker ps -a -q | xargs docker stop +docker rm rskafka_proxy_1 +docker rm rskafka_kafka-0_1 +docker rm rskafka_kafka-1_1 +docker rm rskafka_kafka-2_1 +docker rm rskafka_zookeeper_1 +docker volume rm rskafka_kafka_0_data rskafka_kafka_1_data rskafka_kafka_2_data rskafka_zookeeper_data +``` diff --git a/garbage_collector/Cargo.toml b/garbage_collector/Cargo.toml index a3e1362cb8..84bf828604 100644 --- a/garbage_collector/Cargo.toml +++ b/garbage_collector/Cargo.toml @@ -11,7 +11,7 @@ data_types = { path = "../data_types" } futures = "0.3" humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } observability_deps = { path = "../observability_deps" } snafu = "0.7" tokio = { version = "1", features = ["macros", "rt", "sync"] } diff --git a/generated_types/protos/influxdata/iox/ingester/v1/query.proto b/generated_types/protos/influxdata/iox/ingester/v1/query.proto index ff7cc66209..fc0ca483f2 100644 --- a/generated_types/protos/influxdata/iox/ingester/v1/query.proto +++ b/generated_types/protos/influxdata/iox/ingester/v1/query.proto @@ -82,8 +82,9 @@ message PartitionStatus { // Max sequence number persisted 
optional int64 parquet_max_sequence_number = 1; - // Max sequence number for a tombstone associated - optional int64 tombstone_max_sequence_number = 2; + // Deprecated tombstone support in ingester (#5825). + reserved "tombstone_max_sequence_number"; + reserved 2; } // Serialization of `predicate::predicate::Predicate` that contains DataFusion `Expr`s diff --git a/import/Cargo.toml b/import/Cargo.toml index c773711a23..20d0a3cdc3 100644 --- a/import/Cargo.toml +++ b/import/Cargo.toml @@ -13,11 +13,11 @@ futures = "0.3" generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0", features = ["aws"] } +object_store = { version = "0.5.1", features = ["aws"] } observability_deps = { path = "../observability_deps" } schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.82" +serde_json = "1.0.86" thiserror = "1.0.37" tokio = { version = "1.21" } tonic = { version = "0.8" } diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml index 060445779b..b3858aac87 100644 --- a/influxdb2_client/Cargo.toml +++ b/influxdb2_client/Cargo.toml @@ -9,7 +9,7 @@ bytes = "1.2" futures = { version = "0.3", default-features = false } reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" url = "2.3.1" uuid = { version = "1", features = ["v4"] } diff --git a/influxdb_influxql_parser/src/common.rs b/influxdb_influxql_parser/src/common.rs index 51266177d6..a6b245c397 100644 --- a/influxdb_influxql_parser/src/common.rs +++ b/influxdb_influxql_parser/src/common.rs @@ -2,6 +2,7 @@ use crate::expression::conditional::{conditional_expression, ConditionalExpressi use crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use 
crate::literal::unsigned_integer; +use crate::string::{regex, Regex}; use core::fmt; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; @@ -11,73 +12,82 @@ use nom::multi::separated_list1; use nom::sequence::{pair, preceded, terminated}; use std::fmt::{Display, Formatter}; -/// Represents a fully-qualified measurement name. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct MeasurementNameExpression { +/// Represents a measurement name as either an identifier or a regular expression. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MeasurementName { + /// A measurement name expressed as an [`Identifier`]. + Name(Identifier), + + /// A measurement name expressed as a [`Regex`]. + Regex(Regex), +} + +impl Parser for MeasurementName { + /// Parse a measurement name, which may be an identifier or a regular expression. + fn parse(i: &str) -> ParseResult<&str, Self> { + alt(( + map(identifier, MeasurementName::Name), + map(regex, MeasurementName::Regex), + ))(i) + } +} + +impl Display for MeasurementName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Name(ident) => fmt::Display::fmt(ident, f), + Self::Regex(regex) => fmt::Display::fmt(regex, f), + } + } +} + +/// Represents a fully-qualified, 3-part measurement name. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QualifiedMeasurementName { pub database: Option, pub retention_policy: Option, - pub name: Identifier, + pub name: MeasurementName, } -impl MeasurementNameExpression { - /// Constructs a new `MeasurementNameExpression` with the specified `name`. - pub fn new(name: Identifier) -> Self { - Self { - database: None, - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. 
- pub fn new_db(name: Identifier, database: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. - pub fn new_db_rp(name: Identifier, database: Identifier, retention_policy: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: Some(retention_policy), - name, - } - } -} - -impl fmt::Display for MeasurementNameExpression { +impl Display for QualifiedMeasurementName { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self { database: None, retention_policy: None, name, - } => write!(f, "{}", name)?, + } => write!(f, "{}", name), Self { database: Some(db), retention_policy: None, name, - } => write!(f, "{}..{}", db, name)?, + } => write!(f, "{}..{}", db, name), Self { database: None, retention_policy: Some(rp), name, - } => write!(f, "{}.{}", rp, name)?, + } => write!(f, "{}.{}", rp, name), Self { database: Some(db), retention_policy: Some(rp), name, - } => write!(f, "{}.{}.{}", db, rp, name)?, - }; - Ok(()) + } => write!(f, "{}.{}.{}", db, rp, name), + } } } -/// Match a 3-part measurement name expression. -pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementNameExpression> { +/// Match a fully-qualified, 3-part measurement name. +/// +/// ```text +/// qualified_measurement_name ::= measurement_name | +/// ( policy_name "." measurement_name ) | +/// ( db_name "." policy_name? "." measurement_name ) +/// +/// db_name ::= identifier +/// policy_name ::= identifier +/// measurement_name ::= identifier | regex_lit +/// ``` +pub fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> { let (remaining_input, (opt_db_rp, name)) = pair( opt(alt(( // database "." retention_policy "." @@ -93,7 +103,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName // retention_policy "." 
map(terminated(identifier, tag(".")), |rp| (None, Some(rp))), ))), - identifier, + MeasurementName::parse, )(i)?; // Extract possible `database` and / or `retention_policy` @@ -104,7 +114,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName Ok(( remaining_input, - MeasurementNameExpression { + QualifiedMeasurementName { database, retention_policy, name, @@ -290,35 +300,107 @@ mod tests { use crate::assert_expect_error; use nom::character::complete::alphanumeric1; - #[test] - fn test_measurement_name_expression() { - let (_, got) = measurement_name_expression("diskio").unwrap(); - assert_eq!( - got, - MeasurementNameExpression { + impl From<&str> for MeasurementName { + /// Convert a `str` to [`MeasurementName::Name`]. + fn from(s: &str) -> Self { + Self::Name(Identifier(s.into())) + } + } + + impl QualifiedMeasurementName { + /// Constructs a new `MeasurementNameExpression` with the specified `name`. + pub fn new(name: MeasurementName) -> Self { + Self { database: None, retention_policy: None, - name: "diskio".into(), + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. + pub fn new_db(name: MeasurementName, database: Identifier) -> Self { + Self { + database: Some(database), + retention_policy: None, + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. 
+ pub fn new_db_rp( + name: MeasurementName, + database: Identifier, + retention_policy: Identifier, + ) -> Self { + Self { + database: Some(database), + retention_policy: Some(retention_policy), + name, + } + } + } + + #[test] + fn test_qualified_measurement_name() { + use MeasurementName::*; + + let (_, got) = qualified_measurement_name("diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf.autogen.diskio").unwrap(); + let (_, got) = qualified_measurement_name("/diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: Some("autogen".into()), - name: "diskio".into(), + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf..diskio").unwrap(); + let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: None, - name: "diskio".into(), + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Regex("diskio".into()), } ); } diff --git a/influxdb_influxql_parser/src/delete.rs b/influxdb_influxql_parser/src/delete.rs 
index 3613e027ea..6d8a8c7cad 100644 --- a/influxdb_influxql_parser/src/delete.rs +++ b/influxdb_influxql_parser/src/delete.rs @@ -73,9 +73,14 @@ mod test { // Validate via the Display trait, as we don't need to validate the contents of the // FROM and / or WHERE clauses, given they are tested in their on modules. + // Measurement name expressed as an identifier let (_, got) = delete_statement("DELETE FROM foo").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo"); + // Measurement name expressed as a regular expression + let (_, got) = delete_statement("DELETE FROM /foo/").unwrap(); + assert_eq!(format!("{}", got), "DELETE FROM /foo/"); + let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo WHERE time > 10"); diff --git a/influxdb_influxql_parser/src/explain.rs b/influxdb_influxql_parser/src/explain.rs new file mode 100644 index 0000000000..c9576aa3e8 --- /dev/null +++ b/influxdb_influxql_parser/src/explain.rs @@ -0,0 +1,140 @@ +#![allow(dead_code)] // Temporary + +use crate::internal::{expect, ParseResult}; +use crate::select::{select_statement, SelectStatement}; +use nom::branch::alt; +use nom::bytes::complete::tag_no_case; +use nom::character::complete::multispace1; +use nom::combinator::{map, opt, value}; +use nom::sequence::{preceded, tuple}; +use std::fmt::{Display, Formatter}; + +/// Represents various options for an `EXPLAIN` statement. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ExplainOption { + /// `EXPLAIN VERBOSE statement` + Verbose, + /// `EXPLAIN ANALYZE statement` + Analyze, + /// `EXPLAIN ANALYZE VERBOSE statement` + AnalyzeVerbose, +} + +impl Display for ExplainOption { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Verbose => f.write_str("VERBOSE"), + Self::Analyze => f.write_str("ANALYZE"), + Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"), + } + } +} + +/// Represents an `EXPLAIN` statement. 
+/// +/// ```text +/// explain ::= "EXPLAIN" explain_options? select_statement +/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? ) +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct ExplainStatement { + options: Option, + select: Box, +} + +impl Display for ExplainStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str("EXPLAIN ")?; + if let Some(options) = &self.options { + write!(f, "{} ", options)?; + } + Display::fmt(&self.select, f) + } +} + +/// Parse an `EXPLAIN` statement. +pub fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> { + map( + tuple(( + tag_no_case("EXPLAIN"), + opt(preceded( + multispace1, + alt(( + map( + preceded( + tag_no_case("ANALYZE"), + opt(preceded(multispace1, tag_no_case("VERBOSE"))), + ), + |v| match v { + // If the optional combinator is Some, then it matched VERBOSE + Some(_) => ExplainOption::AnalyzeVerbose, + _ => ExplainOption::Analyze, + }, + ), + value(ExplainOption::Verbose, tag_no_case("VERBOSE")), + )), + )), + multispace1, + expect( + "invalid EXPLAIN statement, expected SELECT statement", + select_statement, + ), + )), + |(_, options, _, select)| ExplainStatement { + options, + select: Box::new(select), + }, + )(i) +} + +#[cfg(test)] +mod test { + use crate::assert_expect_error; + use crate::explain::{explain_statement, ExplainOption}; + use assert_matches::assert_matches; + + #[test] + fn test_explain_statement() { + let (remain, got) = explain_statement("EXPLAIN SELECT val from temp").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(format!("{}", got), "EXPLAIN SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(format!("{}", got), "EXPLAIN VERBOSE SELECT val FROM temp"); + + let (remain, got) = 
explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(format!("{}", got), "EXPLAIN ANALYZE SELECT val FROM temp"); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!( + format!("{}", got), + "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp" + ); + + // Fallible cases + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SHOW DATABASES"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + // surfaces statement-specific errors + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"), + "invalid FROM clause, expected identifier, regular expression or subquery" + ); + } +} diff --git a/influxdb_influxql_parser/src/internal.rs b/influxdb_influxql_parser/src/internal.rs index f9a2b2dcdc..a18c6f5a10 100644 --- a/influxdb_influxql_parser/src/internal.rs +++ b/influxdb_influxql_parser/src/internal.rs @@ -22,12 +22,10 @@ impl Display for Error { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Syntax { input: _, message } => { - write!(f, "Syntax error: {}", message)?; + write!(f, "Syntax error: {}", message) } - Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind)?, + Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind), } - - Ok(()) } } diff --git a/influxdb_influxql_parser/src/lib.rs b/influxdb_influxql_parser/src/lib.rs index 32842c0615..231e3fe0e9 100644 --- a/influxdb_influxql_parser/src/lib.rs +++ b/influxdb_influxql_parser/src/lib.rs @@ -29,6 +29,7 @@ mod test_util; mod common; mod delete; mod drop; +mod explain; mod expression; mod 
identifier; mod internal; diff --git a/influxdb_influxql_parser/src/select.rs b/influxdb_influxql_parser/src/select.rs index 111c0c869c..7b9764c182 100644 --- a/influxdb_influxql_parser/src/select.rs +++ b/influxdb_influxql_parser/src/select.rs @@ -1,6 +1,6 @@ use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, order_by_clause, where_clause, - MeasurementNameExpression, OneOrMore, OrderByClause, Parser, + limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause, + OneOrMore, OrderByClause, Parser, QualifiedMeasurementName, }; use crate::expression::arithmetic::Expr::Wildcard; use crate::expression::arithmetic::{ @@ -164,8 +164,7 @@ pub fn select_statement(i: &str) -> ParseResult<&str, SelectStatement> { /// Represents a single measurement selection found in a `FROM` clause. #[derive(Clone, Debug, PartialEq)] pub enum MeasurementSelection { - Name(MeasurementNameExpression), - Regex(Regex), + Name(QualifiedMeasurementName), Subquery(Box), } @@ -173,7 +172,6 @@ impl Display for MeasurementSelection { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self::Name(ref name) => fmt::Display::fmt(name, f), - Self::Regex(ref re) => fmt::Display::fmt(re, f), Self::Subquery(ref subquery) => write!(f, "({})", subquery), } } @@ -182,8 +180,7 @@ impl Display for MeasurementSelection { impl Parser for MeasurementSelection { fn parse(i: &str) -> ParseResult<&str, Self> { alt(( - map(measurement_name_expression, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), + map(qualified_measurement_name, MeasurementSelection::Name), map( delimited( preceded(multispace0, char('(')), @@ -812,7 +809,7 @@ mod test { assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("/regex/").unwrap(); - assert_matches!(got, MeasurementSelection::Regex(_)); + assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("(SELECT foo FROM 
bar)").unwrap(); assert_matches!(got, MeasurementSelection::Subquery(_)); diff --git a/influxdb_influxql_parser/src/show_measurements.rs b/influxdb_influxql_parser/src/show_measurements.rs index 582d562df8..d5277fad9b 100644 --- a/influxdb_influxql_parser/src/show_measurements.rs +++ b/influxdb_influxql_parser/src/show_measurements.rs @@ -2,24 +2,21 @@ //! //! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-schema/#show-measurements +use crate::common::{ + limit_clause, offset_clause, qualified_measurement_name, where_clause, QualifiedMeasurementName, +}; +use crate::expression::conditional::ConditionalExpression; +use crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; -use nom::character::complete::{char, multispace0, multispace1}; +use nom::character::complete::{multispace0, multispace1}; use nom::combinator::{map, opt, value}; use nom::sequence::tuple; use nom::sequence::{pair, preceded, terminated}; use std::fmt; use std::fmt::Formatter; -use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, where_clause, - MeasurementNameExpression, -}; -use crate::expression::conditional::ConditionalExpression; -use crate::identifier::{identifier, Identifier}; -use crate::string::{regex, Regex}; - /// OnExpression represents an InfluxQL database or retention policy name /// or a wildcard. 
#[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -110,18 +107,16 @@ impl fmt::Display for ShowMeasurementsStatement { #[derive(Clone, Debug, Eq, PartialEq)] pub enum MeasurementExpression { - Equals(MeasurementNameExpression), - Regex(Regex), + Equals(QualifiedMeasurementName), + Regex(QualifiedMeasurementName), } impl fmt::Display for MeasurementExpression { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Self::Equals(ref name) => write!(f, "= {}", name)?, - Self::Regex(ref re) => write!(f, "=~ {}", re)?, - }; - - Ok(()) + Self::Equals(ref name) => write!(f, "= {}", name), + Self::Regex(ref re) => write!(f, "=~ {}", re), + } } } @@ -140,23 +135,15 @@ fn with_measurement_clause(i: &str) -> ParseResult<&str, MeasurementExpression> "expected = or =~", alt(( map( - tuple(( - tag("=~"), - multispace0, - expect("expected regular expression literal", regex), - )), - |(_, _, regex)| MeasurementExpression::Regex(regex), + preceded(pair(tag("=~"), multispace0), qualified_measurement_name), + MeasurementExpression::Regex, ), map( - tuple(( - char('='), - multispace0, - expect( - "expected measurement name or wildcard", - measurement_name_expression, - ), - )), - |(_, _, name)| MeasurementExpression::Equals(name), + preceded( + pair(tag("="), multispace0), + expect("expected measurement name", qualified_measurement_name), + ), + MeasurementExpression::Equals, ), )), ), @@ -200,6 +187,7 @@ pub fn show_measurements(i: &str) -> ParseResult<&str, ShowMeasurementsStatement mod test { use super::*; use crate::assert_expect_error; + use crate::common::MeasurementName; use crate::expression::arithmetic::Expr; use assert_matches::assert_matches; @@ -232,7 +220,7 @@ mod test { ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), measurement_expression: Some(MeasurementExpression::Equals( - MeasurementNameExpression { + QualifiedMeasurementName { database: None, retention_policy: None, name: "bar".into(), @@ -255,7 +243,9 @@ mod test 
{ got, ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), - measurement_expression: Some(MeasurementExpression::Regex(Regex("bar".into()))), + measurement_expression: Some(MeasurementExpression::Regex( + QualifiedMeasurementName::new(MeasurementName::Regex("bar".into())) + )), condition: Some(Expr::Literal(true.into()).into()), limit: None, offset: None @@ -343,33 +333,50 @@ mod test { #[test] fn test_with_measurement_clause() { + use crate::common::MeasurementName::*; + let (_, got) = with_measurement_clause("WITH measurement = foo").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: None, - retention_policy: None, - name: "foo".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new(Name("foo".into()))) ); let (_, got) = with_measurement_clause("WITH measurement =~ /foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Expressions are still valid when whitespace is omitted let (_, got) = with_measurement_clause("WITH measurement=foo..bar").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: Some("foo".into()), - retention_policy: None, - name: "bar".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new_db( + Name("bar".into()), + "foo".into() + )) ); let (_, got) = with_measurement_clause("WITH measurement=~/foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); + + // Quirks of InfluxQL per https://github.com/influxdata/influxdb_iox/issues/5662 + + let (_, got) = with_measurement_clause("WITH measurement =~ foo").unwrap(); + assert_eq!( + got, + 
MeasurementExpression::Regex(QualifiedMeasurementName::new(Name("foo".into()))) + ); + + let (_, got) = with_measurement_clause("WITH measurement = /foo/").unwrap(); + assert_eq!( + got, + MeasurementExpression::Equals(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Fallible cases @@ -379,28 +386,16 @@ mod test { "invalid WITH clause, expected MEASUREMENT" ); - // Must have a regex for equal regex operator - assert_expect_error!( - with_measurement_clause("WITH measurement =~ foo"), - "expected regular expression literal" - ); - // Unsupported regex not equal operator assert_expect_error!( with_measurement_clause("WITH measurement !~ foo"), "expected = or =~" ); - // Must have an identifier for equal operator - assert_expect_error!( - with_measurement_clause("WITH measurement = /foo/"), - "expected measurement name or wildcard" - ); - // Must have an identifier assert_expect_error!( with_measurement_clause("WITH measurement = 1"), - "expected measurement name or wildcard" + "expected measurement name" ); } } diff --git a/influxdb_influxql_parser/src/simple_from_clause.rs b/influxdb_influxql_parser/src/simple_from_clause.rs index f3d7ab0481..07528a9fc2 100644 --- a/influxdb_influxql_parser/src/simple_from_clause.rs +++ b/influxdb_influxql_parser/src/simple_from_clause.rs @@ -1,41 +1,12 @@ -use crate::common::{measurement_name_expression, MeasurementNameExpression, OneOrMore, Parser}; +use crate::common::{ + qualified_measurement_name, MeasurementName, OneOrMore, Parser, QualifiedMeasurementName, +}; use crate::identifier::{identifier, Identifier}; use crate::internal::ParseResult; -use crate::string::{regex, Regex}; -use nom::branch::alt; use nom::bytes::complete::tag_no_case; use nom::character::complete::multispace1; -use nom::combinator::map; use nom::sequence::{pair, preceded}; use std::fmt; -use std::fmt::Formatter; - -/// Represents a single measurement selection found in a `FROM` measurement clause. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum MeasurementSelection { - Name(T), - Regex(Regex), -} - -impl Parser for MeasurementSelection { - fn parse(i: &str) -> ParseResult<&str, Self> { - alt(( - map(T::parse, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), - ))(i) - } -} - -impl fmt::Display for MeasurementSelection { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match self { - Self::Name(ref name) => fmt::Display::fmt(name, f)?, - Self::Regex(ref re) => fmt::Display::fmt(re, f)?, - }; - - Ok(()) - } -} /// Represents a `FROM` clause of a `DELETE` or `SHOW` statement. /// @@ -43,7 +14,7 @@ impl fmt::Display for MeasurementSelection { /// for measurements names. /// /// A `FROM` clause for a number of `SHOW` statements can accept a 3-part measurement name or -pub type FromMeasurementClause = OneOrMore>; +pub type FromMeasurementClause = OneOrMore; fn from_clause(i: &str) -> ParseResult<&str, FromMeasurementClause> { preceded( @@ -54,9 +25,9 @@ fn from_clause(i: &str) -> ParseResult<&str, FromMeasu )(i) } -impl Parser for MeasurementNameExpression { +impl Parser for QualifiedMeasurementName { fn parse(i: &str) -> ParseResult<&str, Self> { - measurement_name_expression(i) + qualified_measurement_name(i) } } @@ -68,10 +39,9 @@ impl Parser for MeasurementNameExpression { /// It is defined by the following EBNF notation: /// /// ```text -/// from_clause ::= "FROM" measurement_selection ("," measurement_selection)* -/// measurement_selection ::= measurement +/// from_clause ::= "FROM" qualified_measurement_name ("," qualified_measurement_name)* /// -/// measurement ::= measurement_name | +/// qualified_measurement_name ::= measurement_name | /// ( policy_name "." measurement_name ) | /// ( db_name "." policy_name? "." 
measurement_name ) /// @@ -92,7 +62,7 @@ impl Parser for MeasurementNameExpression { /// ```text /// FROM foo, /bar/, some_database..foo, some_retention_policy.foobar /// ``` -pub type ShowFromClause = FromMeasurementClause; +pub type ShowFromClause = FromMeasurementClause; /// Parse a `FROM` clause for various `SHOW` statements. pub fn show_from_clause(i: &str) -> ParseResult<&str, ShowFromClause> { @@ -106,7 +76,7 @@ impl Parser for Identifier { } /// Represents a `FROM` clause for a `DELETE` statement. -pub type DeleteFromClause = FromMeasurementClause; +pub type DeleteFromClause = FromMeasurementClause; /// Parse a `FROM` clause for a `DELETE` statement. pub fn delete_from_clause(i: &str) -> ParseResult<&str, DeleteFromClause> { @@ -119,49 +89,52 @@ mod test { #[test] fn test_show_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = show_from_clause("FROM c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new("c".into()))]) + ShowFromClause::new(vec![QualifiedMeasurementName::new(Name("c".into()))]) ); let (_, from) = show_from_clause("FROM a..c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db( + Name("c".into()), "a".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM a.b.c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db_rp( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db_rp( + Name("c".into()), "a".into(), "b".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM /reg/").unwrap(); - assert_eq!(from, ShowFromClause::new(vec![Regex("reg".into())])); + assert_eq!( + from, + ShowFromClause::new(vec![QualifiedMeasurementName::new(Regex("reg".into()))]) + ); let (_, from) = show_from_clause("FROM c, /reg/").unwrap(); assert_eq!( from, 
ShowFromClause::new(vec![ - Name(MeasurementNameExpression::new("c".into())), - Regex("reg".into()) + QualifiedMeasurementName::new(Name("c".into())), + QualifiedMeasurementName::new(Regex("reg".into())) ]) ); } #[test] fn test_delete_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = delete_from_clause("FROM c").unwrap(); assert_eq!(from, DeleteFromClause::new(vec![Name("c".into())])); diff --git a/influxdb_influxql_parser/src/statement.rs b/influxdb_influxql_parser/src/statement.rs index 0455051e81..3275685c54 100644 --- a/influxdb_influxql_parser/src/statement.rs +++ b/influxdb_influxql_parser/src/statement.rs @@ -1,5 +1,6 @@ use crate::delete::{delete_statement, DeleteStatement}; use crate::drop::{drop_statement, DropMeasurementStatement}; +use crate::explain::{explain_statement, ExplainStatement}; use crate::internal::ParseResult; use crate::select::{select_statement, SelectStatement}; use crate::show::{show_statement, ShowDatabasesStatement}; @@ -19,6 +20,8 @@ pub enum Statement { Delete(Box), /// Represents a `DROP MEASUREMENT` statement. DropMeasurement(Box), + /// Represents an `EXPLAIN` statement. + Explain(Box), /// Represents a `SELECT` statement. Select(Box), /// Represents a `SHOW DATABASES` statement. 
@@ -40,6 +43,7 @@ impl Display for Statement { match self { Self::Delete(s) => Display::fmt(s, f), Self::DropMeasurement(s) => Display::fmt(s, f), + Self::Explain(s) => Display::fmt(s, f), Self::Select(s) => Display::fmt(s, f), Self::ShowDatabases(s) => Display::fmt(s, f), Self::ShowMeasurements(s) => Display::fmt(s, f), @@ -56,6 +60,7 @@ pub fn statement(i: &str) -> ParseResult<&str, Statement> { alt(( map(delete_statement, |s| Statement::Delete(Box::new(s))), map(drop_statement, |s| Statement::DropMeasurement(Box::new(s))), + map(explain_statement, |s| Statement::Explain(Box::new(s))), map(select_statement, |s| Statement::Select(Box::new(s))), show_statement, ))(i) @@ -77,6 +82,10 @@ mod test { let (got, _) = statement("DROP MEASUREMENT foo").unwrap(); assert_eq!(got, ""); + // explain_statement combinator + let (got, _) = statement("EXPLAIN SELECT * FROM cpu").unwrap(); + assert_eq!(got, ""); + let (got, _) = statement("SELECT * FROM foo WHERE time > now() - 5m AND host = 'bar' GROUP BY TIME(5m) FILL(previous) ORDER BY time DESC").unwrap(); assert_eq!(got, ""); diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index 504bc00ea1..1e00c64fa0 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -25,7 +25,7 @@ ioxd_querier = { path = "../ioxd_querier"} ioxd_router = { path = "../ioxd_router"} ioxd_test = { path = "../ioxd_test"} metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" object_store_metrics = { path = "../object_store_metrics" } observability_deps = { path = "../observability_deps" } panic_logging = { path = "../panic_logging" } @@ -47,6 +47,8 @@ clap = { version = "4", features = ["derive", "env"] } console-subscriber = { version = "0.1.8", optional = true, features = ["parking_lot"] } dotenvy = "0.15.5" futures = "0.3" +futures-util = { version = "0.3" } +flate2 = "1.0" hashbrown = "0.12" http = "0.2.8" humantime = "2.1.0" @@ -55,7 +57,7 @@ libc = { version = "0.2" } num_cpus = "1.13.0" once_cell = 
{ version = "1.15.0", features = ["parking_lot"] } rustyline = { version = "10.0", default-features = false } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" thiserror = "1.0.37" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } diff --git a/influxdb_iox/src/commands/sql/repl.rs b/influxdb_iox/src/commands/sql/repl.rs index cf4cb2c8dd..129367b906 100644 --- a/influxdb_iox/src/commands/sql/repl.rs +++ b/influxdb_iox/src/commands/sql/repl.rs @@ -53,7 +53,7 @@ pub enum Error { pub type Result = std::result::Result; enum QueryEngine { - /// Run queries against the named database on the remote server + /// Run queries against the namespace on the remote server Remote(String), /// Run queries against a local `Observer` instance @@ -177,7 +177,7 @@ pub struct Repl { /// Client for running sql flight_client: influxdb_iox_client::flight::Client, - /// database name against which SQL commands are run + /// namespace name against which SQL commands are run query_engine: Option, /// Formatter to use to format query results @@ -239,8 +239,8 @@ impl Repl { .map_err(|e| println!("{}", e)) .ok(); } - ReplCommand::UseDatabase { db_name } => { - self.use_database(db_name); + ReplCommand::UseNamespace { db_name } => { + self.use_namespace(db_name); } ReplCommand::SqlCommand { sql } => { self.run_sql(sql).await.map_err(|e| println!("{}", e)).ok(); @@ -302,18 +302,18 @@ impl Repl { self.print_results(&[record_batch]) } - // Run a command against the currently selected remote database + // Run a command against the currently selected remote namespace async fn run_sql(&mut self, sql: String) -> Result<()> { let start = Instant::now(); let batches = match &mut self.query_engine { None => { - println!("Error: no database selected."); - println!("Hint: Run USE DATABASE to select database"); + println!("Error: no namespace selected."); + println!("Hint: Run USE NAMESPACE to select namespace"); return Ok(()); } Some(QueryEngine::Remote(db_name)) => { - info!(%db_name, 
%sql, "Running sql on remote database"); + info!(%db_name, %sql, "Running sql on remote namespace"); scrape_query(&mut self.flight_client, db_name, &sql).await? } @@ -349,9 +349,9 @@ impl Repl { } } - fn use_database(&mut self, db_name: String) { - info!(%db_name, "setting current database"); - println!("You are now in remote mode, querying database {}", db_name); + fn use_namespace(&mut self, db_name: String) { + info!(%db_name, "setting current namespace"); + println!("You are now in remote mode, querying namespace {}", db_name); self.set_query_engine(QueryEngine::Remote(db_name)); } diff --git a/influxdb_iox/src/commands/sql/repl_command.rs b/influxdb_iox/src/commands/sql/repl_command.rs index 37fa4fb843..56f310ed7f 100644 --- a/influxdb_iox/src/commands/sql/repl_command.rs +++ b/influxdb_iox/src/commands/sql/repl_command.rs @@ -7,7 +7,7 @@ pub enum ReplCommand { ShowNamespaces, Observer, SetFormat { format: String }, - UseDatabase { db_name: String }, + UseNamespace { db_name: String }, SqlCommand { sql: String }, Exit, } @@ -64,18 +64,18 @@ impl TryFrom<&str> for ReplCommand { ["observer"] => Ok(Self::Observer), ["exit"] => Ok(Self::Exit), ["quit"] => Ok(Self::Exit), - ["use", "database"] => { - Err("name not specified. Usage: USE DATABASE ".to_string()) - } // USE DATABASE - ["use", "database", _name] => { - // USE DATABASE - Ok(Self::UseDatabase { + ["use", "namespace"] => { + Err("name not specified. 
Usage: USE NAMESPACE ".to_string()) + } // USE NAMESPACE + ["use", "namespace", _name] => { + // USE namespace + Ok(Self::UseNamespace { db_name: raw_commands[2].to_string(), }) } ["use", _command] => { // USE - Ok(Self::UseDatabase { + Ok(Self::UseNamespace { db_name: raw_commands[1].to_string(), }) } @@ -98,9 +98,9 @@ impl ReplCommand { Available commands (not case sensitive): HELP (this one) -SHOW NAMESPACES: List databases available on the server +SHOW NAMESPACES: List namespaces available on the server -USE [DATABASE|NAMESPACE] : Set the current remote database to name +USE NAMESPACE : Set the current remote namespace to name SET FORMAT : Set the output format to Pretty, csv or json @@ -108,9 +108,9 @@ OBSERVER: Locally query unified queryable views of remote system tables [EXIT | QUIT]: Quit this session and exit the program -# Examples: use remote database foo -SHOW DATABASES; -USE DATABASE foo; +# Examples: use remote namespace foo +SHOW NAMESPACES; +USE foo; # Basic IOx SQL Primer @@ -199,35 +199,35 @@ mod tests { } #[test] - fn use_database() { - let expected = Ok(ReplCommand::UseDatabase { + fn use_namespace() { + let expected = Ok(ReplCommand::UseNamespace { db_name: "Foo".to_string(), }); assert_eq!("use Foo".try_into(), expected); - assert_eq!("use Database Foo;".try_into(), expected); - assert_eq!("use Database Foo ;".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); + assert_eq!("use Namespace Foo;".try_into(), expected); + assert_eq!("use Namespace Foo ;".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); - // ensure that database name is case sensitive - let expected = Ok(ReplCommand::UseDatabase { + // ensure that namespace name is case sensitive + let expected = Ok(ReplCommand::UseNamespace { db_name: "FOO".to_string(), }); assert_eq!("use FOO".try_into(), expected); - 
assert_eq!("use DATABASE FOO;".try_into(), expected); - assert_eq!("USE DATABASE FOO;".try_into(), expected); + assert_eq!("use NAMESPACE FOO;".try_into(), expected); + assert_eq!("USE NAMESPACE FOO;".try_into(), expected); let expected: Result = - Err("name not specified. Usage: USE DATABASE ".to_string()); - assert_eq!("use Database;".try_into(), expected); - assert_eq!("use DATABASE".try_into(), expected); - assert_eq!("use database".try_into(), expected); + Err("name not specified. Usage: USE NAMESPACE ".to_string()); + assert_eq!("use Namespace;".try_into(), expected); + assert_eq!("use NAMESPACE".try_into(), expected); + assert_eq!("use namespace".try_into(), expected); - let expected = sql_cmd("use database foo bar"); - assert_eq!("use database foo bar".try_into(), expected); + let expected = sql_cmd("use namespace foo bar"); + assert_eq!("use namespace foo bar".try_into(), expected); - let expected = sql_cmd("use database foo BAR"); - assert_eq!("use database foo BAR".try_into(), expected); + let expected = sql_cmd("use namespace foo BAR"); + assert_eq!("use namespace foo BAR".try_into(), expected); } #[test] diff --git a/influxdb_iox/src/commands/write.rs b/influxdb_iox/src/commands/write.rs index e5aff6bd88..857a81b320 100644 --- a/influxdb_iox/src/commands/write.rs +++ b/influxdb_iox/src/commands/write.rs @@ -1,6 +1,14 @@ +use futures::StreamExt; use influxdb_iox_client::{connection::Connection, write}; -use snafu::{ResultExt, Snafu}; -use std::{fs::File, io::Read, path::PathBuf}; +use observability_deps::tracing::info; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::{ + fs::File, + io::{BufReader, Read}, + num::NonZeroUsize, + path::PathBuf, + time::Instant, +}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -11,10 +19,30 @@ pub enum Error { source: std::io::Error, }, + #[snafu(display("Error reading files: {:#?}", sources))] + ReadingFiles { sources: Vec }, + #[snafu(display("Client error: {source}"))] ClientError { 
source: influxdb_iox_client::error::Error, }, + + #[snafu(display("Error converting parquet: {}", source))] + Conversion { + source: parquet_to_line_protocol::Error, + }, + + #[snafu(display("Line protocol was not valid utf8: {}", source))] + InvalidUtf8 { source: std::string::FromUtf8Error }, + + #[snafu(display("Error decoding gzip {:?}: {}", file_name, source))] + Gz { + file_name: PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Max concurrent uploads must be greater than zero"))] + MaxConcurrentUploadsVerfication, } pub type Result = std::result::Result; @@ -22,36 +50,176 @@ pub type Result = std::result::Result; /// Write data into the specified database #[derive(Debug, clap::Parser)] pub struct Config { + /// If specified, restricts the maximum amount of line protocol + /// sent per request to this many bytes. Defaults to 1MB + #[clap(action, long, short = 'b', default_value = "1048576")] + max_request_payload_size_bytes: usize, + + /// Uploads up to this many http requests at a time. Defaults to 10 + #[clap(action, long, short = 'c', default_value = "10")] + max_concurrent_uploads: usize, + /// The namespace into which to write #[clap(action)] namespace: String, - /// File with data to load. Currently supported formats are .lp
Currently supported formats are .lp (line protocol), + /// .parquet (IOx created parquet files), and .gz (gzipped line protocol) #[clap(action)] - file_name: PathBuf, + file_names: Vec, } pub async fn command(connection: Connection, config: Config) -> Result<()> { + let start = Instant::now(); + let Config { namespace, - file_name, + file_names, + max_request_payload_size_bytes, + max_concurrent_uploads, } = config; - let file_name = &file_name; - let mut file = File::open(file_name).context(ReadingFileSnafu { file_name })?; + let max_concurrent_uploads = + NonZeroUsize::new(max_concurrent_uploads).context(MaxConcurrentUploadsVerficationSnafu)?; - let mut lp_data = String::new(); - file.read_to_string(&mut lp_data) - .context(ReadingFileSnafu { file_name })?; + info!( + num_files = file_names.len(), + max_request_payload_size_bytes, max_concurrent_uploads, "Beginning upload" + ); - let mut client = write::Client::new(connection); + // first pass is to check that all the files exist and can be + // opened and if not fail fast. + let file_open_errors: Vec<_> = file_names + .iter() + .filter_map(|file_name| { + File::open(file_name) + .context(ReadingFileSnafu { file_name }) + .err() + }) + .collect(); + + ensure!( + file_open_errors.is_empty(), + ReadingFilesSnafu { + sources: file_open_errors + } + ); + + // if everything looked good, go through and read the files out + // them potentially in parallel. 
+ let lp_stream = futures_util::stream::iter(file_names) + .map(|file_name| tokio::task::spawn(slurp_file(file_name))) + // Since the contents of each file are buffered into a string, + // limit the number that are open at once to the maximum + // possible uploads + .buffered(max_concurrent_uploads.into()) + // warn and skip any errors + .filter_map(|res| async move { + match res { + Ok(Ok(lp_data)) => Some(lp_data), + Ok(Err(e)) => { + eprintln!("WARNING: ignoring error : {}", e); + None + } + Err(e) => { + eprintln!("WARNING: ignoring task fail: {}", e); + None + } + } + }); + + let mut client = write::Client::new(connection) + .with_max_concurrent_uploads(max_concurrent_uploads) + .with_max_request_payload_size_bytes(Some(max_request_payload_size_bytes)); let total_bytes = client - .write_lp(namespace, lp_data) + .write_lp_stream(namespace, lp_stream) .await .context(ClientSnafu)?; - println!("{} Bytes OK", total_bytes); + let elapsed = Instant::now() - start; + let mb = (total_bytes as f64) / (1024.0 * 1024.0); + let mb_per_sec = (mb / (elapsed.as_millis() as f64)) * (1000.0); + println!("{total_bytes} Bytes OK in {elapsed:?}. 
{mb_per_sec:.2} MB/sec"); Ok(()) } + +/// Reads the contents of `file_name into a string +/// +/// .parquet files --> iox parquet files (convert to parquet) +/// .gz --> treated as gzipped line protocol +/// .lp (or anything else) --> treated as raw line protocol +/// +async fn slurp_file(file_name: PathBuf) -> Result { + let file_name = &file_name; + + let extension = file_name + .extension() + .map(|extension| extension.to_ascii_lowercase()); + + match extension { + // Transform parquet to line protocol prior to upload + // Not the most efficient process, but it is expedient + Some(extension) if extension.to_string_lossy() == "parquet" => { + let mut lp_data = vec![]; + parquet_to_line_protocol::convert_file(file_name, &mut lp_data) + .await + .context(ConversionSnafu)?; + + let lp_data = String::from_utf8(lp_data).context(InvalidUtf8Snafu)?; + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from parquet file" + ); + Ok(lp_data) + } + // decompress as gz + Some(extension) if extension.to_string_lossy() == "gz" => { + let mut lp_data = String::new(); + let reader = + BufReader::new(File::open(&file_name).context(ReadingFileSnafu { file_name })?); + + flate2::read::GzDecoder::new(reader) + .read_to_string(&mut lp_data) + .context(GzSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from gzipped line protocol file" + ); + Ok(lp_data) + } + // anything else, treat as line protocol + Some(_) | None => { + let lp_data = + std::fs::read_to_string(file_name).context(ReadingFileSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol file" + ); + Ok(lp_data) + } + } +} + +#[cfg(test)] +mod test { + use clap::Parser; + use influxdb_iox_client::write::DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES; + + use super::*; + + #[test] + fn command_default_is_same_as_client_default() { + let config = Config::try_parse_from(vec!["my_db", 
"file1"]).unwrap(); + assert_eq!( + Some(config.max_request_payload_size_bytes), + DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES + ); + } +} diff --git a/influxdb_iox/tests/end_to_end_cases/cli.rs b/influxdb_iox/tests/end_to_end_cases/cli.rs index 89f868cae8..941a7437ee 100644 --- a/influxdb_iox/tests/end_to_end_cases/cli.rs +++ b/influxdb_iox/tests/end_to_end_cases/cli.rs @@ -6,7 +6,6 @@ use predicates::prelude::*; use serde_json::Value; use std::time::{Duration, Instant}; use tempfile::tempdir; -use test_helpers::make_temp_file; use test_helpers_end_to_end::{ maybe_skip_integration, AddAddrEnv, BindAddresses, MiniCluster, ServerType, Step, StepTest, StepTestState, @@ -526,9 +525,6 @@ async fn write_and_query() { vec![ Step::Custom(Box::new(|state: &mut StepTestState| { async { - // write line protocol to a temp file - let lp_file = make_temp_file("m,tag=1 v=2 12345"); - let lp_file_path = lp_file.path().to_string_lossy().to_string(); let router_addr = state.cluster().router().router_http_base().to_string(); let namespace = state.cluster().namespace(); @@ -537,53 +533,48 @@ async fn write_and_query() { // Validate the output of the schema CLI command Command::cargo_bin("influxdb_iox") .unwrap() + .arg("-v") .arg("-h") .arg(&router_addr) .arg("write") .arg(&namespace) - .arg(&lp_file_path) + // raw line protocol ('h2o_temperature' measurement) + .arg("../test_fixtures/lineproto/air_and_water.lp") + // gzipped line protocol ('m0') + .arg("../test_fixtures/lineproto/read_filter.lp.gz") + // iox formatted parquet ('cpu' measurement) + .arg("../test_fixtures/cpu.parquet") .assert() .success() - .stdout(predicate::str::contains("17 Bytes OK")); + // this number is the total size of + // uncompressed line protocol stored in all + // three files + .stdout(predicate::str::contains("1137058 Bytes OK")); } .boxed() })), Step::Custom(Box::new(|state: &mut StepTestState| { async { - let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); - let namespace = 
state.cluster().namespace(); + // data from 'air_and_water.lp' + wait_for_query_result( + state, + "SELECT * from h2o_temperature order by time desc limit 10", + "| 51.3 | coyote_creek | CA | 55.1 | 1970-01-01T00:00:01.568756160Z |" + ).await; - let max_wait_time = Duration::from_secs(10); - let expected = "| 1 | 1970-01-01T00:00:00.000012345Z | 2 |"; - println!("Waiting for {expected}"); + // data from 'read_filter.lp.gz' + wait_for_query_result( + state, + "SELECT * from m0 order by time desc limit 10;", + "| value1 | value9 | value9 | value49 | value0 | 2021-04-26T13:47:39.727574Z | 1 |" + ).await; - // Validate the output of running the query CLI command appears after at most max_wait_time - let end = Instant::now() + max_wait_time; - while Instant::now() < end { - let maybe_result = Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("-h") - .arg(&querier_addr) - .arg("query") - .arg(&namespace) - .arg("SELECT * from m") - .assert() - .success() - .try_stdout(predicate::str::contains(expected)); - - match maybe_result { - Err(e) => { - println!("Got err: {}, retrying", e); - } - Ok(r) => { - println!("Success: {:?}", r); - return; - } - } - // sleep and try again - tokio::time::sleep(Duration::from_millis(500)).await - } - panic!("Did not find expected output in allotted time"); + // data from 'cpu.parquet' + wait_for_query_result( + state, + "SELECT * from cpu where cpu = 'cpu2' order by time desc limit 10", + "cpu2 | MacBook-Pro-8.hsd1.ma.comcast.net | 2022-09-30T12:55:00Z" + ).await; } .boxed() })), @@ -593,6 +584,53 @@ async fn write_and_query() { .await } +/// Runs the specified query in a loop for up to 10 seconds, waiting +/// for the specified output to appear +async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, expected: &str) { + let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); + let namespace = state.cluster().namespace(); + + let max_wait_time = Duration::from_secs(10); + println!("Waiting 
for {expected}"); + + // Validate the output of running the query CLI command appears after at most max_wait_time + let end = Instant::now() + max_wait_time; + while Instant::now() < end { + let assert = Command::cargo_bin("influxdb_iox") + .unwrap() + .arg("-h") + .arg(&querier_addr) + .arg("query") + .arg(&namespace) + .arg(query_sql) + .assert(); + + let assert = match assert.try_success() { + Err(e) => { + println!("Got err running command: {}, retrying", e); + continue; + } + Ok(a) => a, + }; + + match assert.try_stdout(predicate::str::contains(expected)) { + Err(e) => { + println!("No match: {}, retrying", e); + } + Ok(r) => { + println!("Success: {:?}", r); + return; + } + } + // sleep and try again + tokio::time::sleep(Duration::from_secs(1)).await + } + panic!( + "Did not find expected output {} within {:?}", + expected, max_wait_time + ); +} + /// Test the schema cli command #[tokio::test] async fn namespaces_cli() { diff --git a/influxdb_iox/tests/end_to_end_cases/ingester.rs b/influxdb_iox/tests/end_to_end_cases/ingester.rs index 07ecd8fbbe..edf93bb305 100644 --- a/influxdb_iox/tests/end_to_end_cases/ingester.rs +++ b/influxdb_iox/tests/end_to_end_cases/ingester.rs @@ -52,7 +52,6 @@ async fn ingester_flight_api() { partition_id, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None }) }, ); diff --git a/influxdb_iox/tests/end_to_end_cases/querier.rs b/influxdb_iox/tests/end_to_end_cases/querier.rs index d5f1cfbe0e..b189098a64 100644 --- a/influxdb_iox/tests/end_to_end_cases/querier.rs +++ b/influxdb_iox/tests/end_to_end_cases/querier.rs @@ -7,7 +7,8 @@ use futures::FutureExt; use predicates::prelude::*; use test_helpers::assert_contains; use test_helpers_end_to_end::{ - maybe_skip_integration, run_query, MiniCluster, Step, StepTest, StepTestState, TestConfig, + maybe_skip_integration, run_query, try_run_query, GrpcRequestBuilder, MiniCluster, Step, + StepTest, StepTestState, TestConfig, }; #[tokio::test] 
@@ -454,6 +455,87 @@ async fn issue_4631_b() { .await } +#[tokio::test] +async fn oom_protection() { + test_helpers::maybe_start_logging(); + let database_url = maybe_skip_integration!(); + + let table_name = "the_table"; + + // Set up the cluster ==================================== + let router_config = TestConfig::new_router(&database_url); + let ingester_config = TestConfig::new_ingester(&router_config); + let querier_config = + TestConfig::new_querier(&ingester_config).with_querier_max_table_query_bytes(1); + let mut cluster = MiniCluster::new() + .with_router(router_config) + .await + .with_ingester(ingester_config) + .await + .with_querier(querier_config) + .await; + + StepTest::new( + &mut cluster, + vec![ + Step::WriteLineProtocol(format!("{},tag1=A,tag2=B val=42i 123457", table_name)), + Step::WaitForReadable, + Step::AssertNotPersisted, + // SQL query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let sql = format!("select * from {}", table_name); + let err = try_run_query( + sql, + state.cluster().namespace(), + state.cluster().querier().querier_grpc_connection(), + ) + .await + .unwrap_err(); + + if let influxdb_iox_client::flight::Error::GrpcError(status) = err { + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } else { + panic!("Not a gRPC error: {err}"); + } + } + .boxed() + })), + // InfluxRPC/storage query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let mut storage_client = state.cluster().querier_storage_client(); + + let read_filter_request = GrpcRequestBuilder::new() + .source(state.cluster()) + .build_read_filter(); + + let status = storage_client + .read_filter(read_filter_request) + .await + .unwrap_err(); + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } + .boxed() + })), + ], + ) + .run() + .await 
+} + /// This structure holds information for tests that need to force a parquet file to be persisted struct ForcePersistenceSetup { // Set up a cluster that will will persist quickly diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 9b674c4a33..42a886d98c 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -13,6 +13,7 @@ format = ["arrow", "arrow_util"] # Workspace dependencies, in alphabetical order arrow_util = { path = "../arrow_util", optional = true } client_util = { path = "../client_util" } +influxdb_line_protocol = { path = "../influxdb_line_protocol"} generated_types = { path = "../generated_types", default-features = false, features = ["data_types_conversions"] } # Crates.io dependencies, in alphabetical order @@ -23,9 +24,7 @@ futures-util = { version = "0.3", optional = true } prost = "0.11" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } +tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } +tokio-stream = "0.1.11" thiserror = "1.0.37" tonic = { version = "0.8" } - -[dev-dependencies] # In alphabetical order -tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } -mockito = "0.31" \ No newline at end of file diff --git a/influxdb_iox_client/src/client/write.rs b/influxdb_iox_client/src/client/write.rs index 1ee584d8a0..4771970f11 100644 --- a/influxdb_iox_client/src/client/write.rs +++ b/influxdb_iox_client/src/client/write.rs @@ -1,15 +1,16 @@ -/// Re-export generated_types -pub mod generated_types { - pub use generated_types::influxdata::pbdata::v1::*; -} +use std::{fmt::Debug, num::NonZeroUsize, sync::Arc}; use client_util::{connection::HttpConnection, namespace_translation::split_namespace}; +use futures_util::{future::BoxFuture, FutureExt, Stream, StreamExt, TryStreamExt}; use crate::{ connection::Connection, error::{translate_response, Error}, }; -use 
reqwest::Method; +use reqwest::{Body, Method}; + +/// The default value for the maximum size of each request, in bytes +pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option = Some(1024 * 1024); /// An IOx Write API client. /// @@ -37,18 +38,67 @@ use reqwest::Method; /// ``` #[derive(Debug, Clone)] pub struct Client { - inner: HttpConnection, + /// The inner client used to actually make requests. + /// + /// Uses a trait for test mocking. + /// + /// Does not expose the trait in the `Client` type to avoid + /// exposing an internal implementation detail (the trait) in the + /// public interface. + inner: Arc, + + /// If `Some`, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict + /// the amount sent per request. Defaults to `Some(1MB)` + /// + /// Splitting the upload size consumes a non trivial amount of CPU + /// to find line protocol boundaries. This can be disabled by + /// setting `max_request_payload_size_bytes` to `None`. + max_request_payload_size_bytes: Option, + + /// Makes this many concurrent requests at a time. Defaults to 1 + max_concurrent_uploads: NonZeroUsize, } impl Client { /// Creates a new client with the provided connection pub fn new(connection: Connection) -> Self { + Self::new_with_maker(Arc::new(connection.into_http_connection())) + } + + /// Creates a new client with the provided request maker + fn new_with_maker(inner: Arc) -> Self { Self { - inner: connection.into_http_connection(), + inner, + max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES, + max_concurrent_uploads: NonZeroUsize::new(1).unwrap(), } } - /// Write the [LineProtocol] formatted data in `lp_data` to + /// Override the default of sending 1MB of line protocol per request. + /// If `Some` is specified, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict the amount of + /// line protocol sent per request. 
+ pub fn with_max_request_payload_size_bytes( + self, + max_request_payload_size_bytes: Option, + ) -> Self { + Self { + max_request_payload_size_bytes, + ..self + } + } + + /// The client makes this many concurrent uploads at a + /// time. Defaults to 1. + pub fn with_max_concurrent_uploads(self, max_concurrent_uploads: NonZeroUsize) -> Self { + Self { + max_concurrent_uploads, + ..self + } + } + + /// Write the [LineProtocol] formatted string in `lp_data` to /// namespace `namespace`. /// /// Returns the number of bytes which were written to the database @@ -59,11 +109,24 @@ impl Client { namespace: impl AsRef + Send, lp_data: impl Into + Send, ) -> Result { - let lp_data = lp_data.into(); - let data_len = lp_data.len(); + let sources = futures_util::stream::iter([lp_data.into()]); - let write_url = format!("{}api/v2/write", self.inner.uri()); + self.write_lp_stream(namespace, sources).await + } + /// Write the stream of [LineProtocol] formatted strings in + /// `sources` to namespace `namespace`. 
It is assumed that + /// individual lines (points) do not cross these strings + /// + /// Returns the number of bytes, in total, which were written to + /// the database + /// + /// [LineProtocol]: https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#data-types-and-format + pub async fn write_lp_stream( + &mut self, + namespace: impl AsRef + Send, + sources: impl Stream + Send, + ) -> Result { let (org_id, bucket_id) = split_namespace(namespace.as_ref()).map_err(|e| { Error::invalid_argument( "namespace", @@ -71,47 +134,302 @@ impl Client { ) })?; - let response = self - .inner - .client() - .request(Method::POST, &write_url) - .query(&[("bucket", bucket_id), ("org", org_id)]) - .body(lp_data) - .send() + let max_concurrent_uploads: usize = self.max_concurrent_uploads.into(); + let max_request_payload_size_bytes = self.max_request_payload_size_bytes; + + // make a stream and process in parallel + let results = sources + // split each input source in parallel, if possible + .flat_map(|source| { + split_lp( + source, + max_request_payload_size_bytes, + max_concurrent_uploads, + ) + }) + // do the actual write + .map(|source| { + let org_id = org_id.to_string(); + let bucket_id = bucket_id.to_string(); + let inner = Arc::clone(&self.inner); + + tokio::task::spawn( + async move { inner.write_source(org_id, bucket_id, source).await }, + ) + }) + // Do the uploads in parallel + .buffered(max_concurrent_uploads) + .try_collect::>() + // handle panics in tasks .await - .map_err(Error::client)?; + .map_err(Error::client)? + // find / return any errors + .into_iter() + .collect::, Error>>()?; - translate_response(response).await?; + Ok(results.into_iter().sum()) + } +} - Ok(data_len) +/// Something that knows how to send http data. 
Exists so it can be +/// mocked out for testing +trait RequestMaker: Debug + Send + Sync { + /// Write the body data to the specified org, bucket, and + /// returning the number of bytes written + /// + /// (this is implemented manually to avoid `async_trait`) + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result>; +} + +impl RequestMaker for HttpConnection { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let write_url = format!("{}api/v2/write", self.uri()); + + async move { + let body: Body = body.into(); + + let data_len = body.as_bytes().map(|b| b.len()).unwrap_or(0); + + let response = self + .client() + .request(Method::POST, &write_url) + .query(&[("bucket", bucket_id), ("org", org_id)]) + .body(body) + .send() + .await + .map_err(Error::client)?; + + translate_response(response).await?; + + Ok(data_len) + } + .boxed() + } +} + +/// splits input line protocol into one or more sizes of at most +/// `max_chunk` on line breaks in a separte tokio task +fn split_lp( + input: String, + max_chunk_size: Option, + max_concurrent_uploads: usize, +) -> impl Stream { + let (tx, rx) = tokio::sync::mpsc::channel(max_concurrent_uploads); + + tokio::task::spawn(async move { + match max_chunk_size { + None => { + // ignore errors (means the receiver hung up but nothing to communicate + tx.send(input).await.ok(); + } + Some(max_chunk_size) => { + // use the actual line protocol parser to split on valid boundaries + let mut acc = LineAccumulator::new(max_chunk_size); + for l in influxdb_line_protocol::split_lines(&input) { + if let Some(chunk) = acc.push(l) { + // abort if receiver has hungup + if tx.send(chunk).await.is_err() { + return; + } + } + } + if let Some(chunk) = acc.flush() { + tx.send(chunk).await.ok(); + } + } + } + }); + + tokio_stream::wrappers::ReceiverStream::new(rx) +} +#[derive(Debug)] +struct LineAccumulator { + current_chunk: 
String, + max_chunk_size: usize, +} + +impl LineAccumulator { + fn new(max_chunk_size: usize) -> Self { + Self { + current_chunk: String::with_capacity(max_chunk_size), + max_chunk_size, + } + } + + // Add data `l` to the current chunk being created, returning the + // current chunk if complete. + fn push(&mut self, l: &str) -> Option { + let chunk = if self.current_chunk.len() + l.len() + 1 > self.max_chunk_size { + self.flush() + } else { + None + }; + + if !self.current_chunk.is_empty() { + self.current_chunk += "\n"; + } + + self.current_chunk += l; + chunk + } + + /// allocate a new chunk with the right size, returning the currently built chunk if it has non zero length + /// `self.current_chunk.len()` is zero + fn flush(&mut self) -> Option { + if !self.current_chunk.is_empty() { + let mut new_chunk = String::with_capacity(self.max_chunk_size); + std::mem::swap(&mut new_chunk, &mut self.current_chunk); + Some(new_chunk) + } else { + None + } } } #[cfg(test)] mod tests { + use std::sync::Mutex; + use super::*; - use crate::connection::Builder; #[tokio::test] - /// Ensure the basic plumbing is hooked up correctly - async fn basic() { - let url = mockito::server_url(); - - let connection = Builder::new().build(&url).await.unwrap(); + async fn test() { + let mock = Arc::new(MockRequestMaker::new()); let namespace = "orgname_bucketname"; let data = "m,t=foo f=4"; - let m = mockito::mock("POST", "/api/v2/write?bucket=bucketname&org=orgname") - .with_status(201) - .match_body(data) - .create(); + let expected = vec![MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: data.into(), + }]; - let res = Client::new(connection).write_lp(namespace, data).await; - - m.assert(); - - let num_bytes = res.expect("Error making write request"); + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); assert_eq!(num_bytes, 11); } + + #[tokio::test] + 
async fn test_max_request_payload_size() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = "m,t=foo f=4\n\ + m,t=bar f=3\n\ + m,t=fooddddddd f=4"; + + // expect the data to be broken up into two chunks: + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4\nm,t=bar f=3".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=fooddddddd f=4".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + // enough to get first two lines, but not last + .with_max_request_payload_size_bytes(Some(30)) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 41); + } + + #[tokio::test] + async fn test_write_lp_stream() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = futures_util::stream::iter( + vec!["m,t=foo f=4", "m,t=bar f=3"] + .into_iter() + .map(|s| s.to_string()), + ); + + // expect the data to come in two chunks + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=bar f=3".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp_stream(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 22); + } + + #[derive(Debug, Clone, PartialEq)] + struct MockRequest { + org_id: String, + bucket_id: String, + body: String, + } + + #[derive(Debug)] + struct MockRequestMaker { + requests: Mutex>, + } + + impl MockRequestMaker { + fn new() -> Self { + Self { + requests: Mutex::new(vec![]), + } + } + + /// get a copy of the requests that were made using this mock + fn requests(&self) -> Vec { + 
self.requests.lock().unwrap().clone() + } + } + + impl RequestMaker for MockRequestMaker { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let sz = body.len(); + + self.requests.lock().unwrap().push(MockRequest { + org_id, + bucket_id, + body, + }); + + async move { Ok(sz) }.boxed() + } + } } diff --git a/influxdb_line_protocol/Cargo.toml b/influxdb_line_protocol/Cargo.toml index f82103288d..aae56dd1db 100644 --- a/influxdb_line_protocol/Cargo.toml +++ b/influxdb_line_protocol/Cargo.toml @@ -14,7 +14,7 @@ ffi = ["libc"] bytes = "1.2" libc = { version = "0.2", optional = true } nom = { version = "7", default-features = false, features = ["std"] } -smallvec = { version = "1.9.0", features = ["union"] } +smallvec = { version = "1.10.0", features = ["union"] } snafu = "0.7" observability_deps = { path = "../observability_deps" } workspace-hack = { path = "../workspace-hack"} diff --git a/influxdb_line_protocol/src/lib.rs b/influxdb_line_protocol/src/lib.rs index 07d9ca14ea..91c1c2077d 100644 --- a/influxdb_line_protocol/src/lib.rs +++ b/influxdb_line_protocol/src/lib.rs @@ -529,7 +529,7 @@ pub fn parse_lines(input: &str) -> impl Iterator>> /// logic duplication for scanning fields, duplicating it also means /// we can be more sure of the compatibility of the rust parser and /// the canonical Go parser. 
-fn split_lines(input: &str) -> impl Iterator { +pub fn split_lines(input: &str) -> impl Iterator { // NB: This is ported as closely as possibly from the original Go code: let mut quoted = false; let mut fields = false; diff --git a/influxrpc_parser/Cargo.toml b/influxrpc_parser/Cargo.toml index 152c099d2d..80a8496db6 100644 --- a/influxrpc_parser/Cargo.toml +++ b/influxrpc_parser/Cargo.toml @@ -4,8 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -sqlparser = "0.24.0" -snafu = "0.7.1" +sqlparser = "0.25.0" +snafu = "0.7.2" generated_types = { path = "../generated_types" } workspace-hack = { path = "../workspace-hack"} \ No newline at end of file diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index beb94c37e9..b12ed95c1d 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -24,7 +24,7 @@ iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch"} mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } @@ -45,6 +45,7 @@ write_buffer = { path = "../write_buffer" } write_summary = { path = "../write_summary" } tokio-util = { version = "0.7.4" } trace = { path = "../trace" } +rand = "0.8.5" [dev-dependencies] assert_matches = "1.5.0" @@ -52,4 +53,4 @@ bitflags = {version = "1.3.2"} once_cell = "1" paste = "1.0.9" test_helpers = { path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = {version = "0.1.10", default_features = false } +tokio-stream = {version = "0.1.11", default_features = false } diff --git a/ingester/src/compact.rs b/ingester/src/compact.rs index 040a1c983c..8a280cc751 100644 --- a/ingester/src/compact.rs +++ b/ingester/src/compact.rs @@ -18,7 +18,7 @@ use crate::{data::partition::PersistingBatch, query::QueryableBatch}; #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, 
missing_docs)] -pub enum Error { +pub(crate) enum Error { #[snafu(display("Error while building logical plan for Ingester's compaction"))] LogicalPlan { source: iox_query::frontend::reorg::Error, @@ -86,11 +86,8 @@ pub(crate) async fn compact_persisting_batch( namespace_id: i64, partition_info: &PartitionInfo, batch: Arc, -) -> Result> { - // Nothing to compact - if batch.data.data.is_empty() { - return Ok(None); - } +) -> Result { + assert!(!batch.data.data.is_empty()); let namespace_name = &partition_info.namespace_name; let table_name = &partition_info.table_name; @@ -141,11 +138,11 @@ pub(crate) async fn compact_persisting_batch( sort_key: Some(metadata_sort_key), }; - Ok(Some(CompactedStream { + Ok(CompactedStream { stream, iox_metadata, sort_key_update, - })) + }) } /// Compact a given Queryable Batch @@ -192,8 +189,8 @@ mod tests { create_batches_with_influxtype_same_columns_different_type, create_one_record_batch_with_influxtype_duplicates, create_one_record_batch_with_influxtype_no_duplicates, - create_one_row_record_batch_with_influxtype, create_tombstone, make_meta, - make_persisting_batch, make_queryable_batch, make_queryable_batch_with_deletes, + create_one_row_record_batch_with_influxtype, make_meta, make_persisting_batch, + make_queryable_batch, }; // this test was added to guard against https://github.com/influxdata/influxdb_iox/issues/3782 @@ -226,7 +223,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -254,7 +250,6 @@ mod tests { let CompactedStream { stream, .. 
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -297,7 +292,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -328,7 +322,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -394,7 +387,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -426,7 +418,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -494,7 +485,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -527,7 +517,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -595,7 +584,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -629,7 +617,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -700,7 +687,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -739,7 +725,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -825,54 +810,6 @@ mod tests { assert_batches_eq!(&expected, &output_batches); } - #[tokio::test] - async fn test_compact_one_batch_no_dupilcates_with_deletes() { - 
test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - // verify no empty record batches - bug #3782 - assert_eq!(output_batches.len(), 2); - assert_eq!(output_batches[0].num_rows(), 1); - assert_eq!(output_batches[1].num_rows(), 1); - - // verify compacted data - // row with "tag1=UT" no longer available - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - } - #[tokio::test] async fn test_compact_one_batch_with_duplicates() { // create input data @@ -1019,23 +956,12 @@ mod tests { } #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_deletes( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates() { // create many-batches input data let batches = 
create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - )]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1058,7 +984,6 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") are also removed // CORRECT RESULT let expected = vec![ "+-----------+------+------+--------------------------------+", @@ -1067,73 +992,15 @@ mod tests { "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", - "+-----------+------+------+--------------------------------+", - ]; - - assert_batches_eq!(&expected, &output_batches); - } - - #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes( - ) { - // create many-batches input data - let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete 
predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1!=MT", // delete predicate - ), - ]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "tag2", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify compacted data - // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1!=MT") are also removed - let expected = vec![ - "+-----------+------+------+--------------------------------+", - "| field_int | tag1 | tag2 | time |", - "+-----------+------+------+--------------------------------+", - "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", - "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; @@ -1142,31 +1009,12 @@ mod tests { // BUG #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes_2( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates2() { // create many-batches input data let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, 
- 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1=MT", // delete predicate - ), - ]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1189,29 +1037,22 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1=MT") are also removed - // CORRECT RESULT - // let expected = vec![ - // "+-----------+------+------+--------------------------------+", - // "| field_int | tag1 | tag2 | time |", - // "+-----------+------+------+--------------------------------+", - // "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", - // "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", - // "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", - // "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", - // "+-----------+------+------+--------------------------------+", - // ]; - // current WRONMG result: "tag1 is null" is also eliminated let expected = vec![ "+-----------+------+------+--------------------------------+", "| field_int | tag1 | tag2 | time |", "+-----------+------+------+--------------------------------+", + "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", + "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", + "| 100 | | MA | 
1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", + "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", + "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 7c4a48386f..d1ec7d39a2 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -1,15 +1,12 @@ //! Data for the lifecycle of the Ingester -use std::{collections::BTreeMap, pin::Pin, sync::Arc}; +use std::{collections::BTreeMap, sync::Arc}; -use arrow::{error::ArrowError, record_batch::RecordBatch}; -use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use async_trait::async_trait; use backoff::{Backoff, BackoffConfig}; -use data_types::{PartitionId, SequenceNumber, ShardId, ShardIndex}; -use datafusion::physical_plan::SendableRecordBatchStream; +use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId}; + use dml::DmlOperation; -use futures::{Stream, StreamExt}; use iox_catalog::interface::{get_table_schema_by_id, Catalog}; use iox_query::exec::Executor; use iox_time::SystemProvider; @@ -25,16 +22,12 @@ use crate::{ lifecycle::LifecycleHandle, }; -pub mod namespace; +pub(crate) mod namespace; pub mod partition; -mod query_dedup; -pub mod shard; -pub mod table; +pub(crate) mod shard; +pub(crate) mod table; -use self::{ - partition::{resolver::PartitionProvider, PartitionStatus}, - shard::ShardData, -}; +use self::{partition::resolver::PartitionProvider, shard::ShardData, table::TableName}; #[cfg(test)] mod triggers; @@ -51,9 +44,6 @@ pub enum Error { #[snafu(display("Table {} not found in buffer", table_name))] TableNotFound { table_name: String }, - #[snafu(display("Table must be 
specified in delete"))] - TableNotPresent, - #[snafu(display("Error accessing catalog: {}", source))] Catalog { source: iox_catalog::interface::Error, @@ -186,7 +176,7 @@ impl IngesterData { .get(&shard_id) .context(ShardNotFoundSnafu { shard_id })?; shard_data - .buffer_operation(dml_operation, &self.catalog, lifecycle_handle, &self.exec) + .buffer_operation(dml_operation, &self.catalog, lifecycle_handle) .await } @@ -220,7 +210,13 @@ impl IngesterData { #[async_trait] pub trait Persister: Send + Sync + 'static { /// Persits the partition ID. Will retry forever until it succeeds. - async fn persist(&self, partition_id: PartitionId); + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ); /// Updates the shard's `min_unpersisted_sequence_number` in the catalog. /// This number represents the minimum that might be unpersisted, which is the @@ -235,7 +231,69 @@ pub trait Persister: Send + Sync + 'static { #[async_trait] impl Persister for IngesterData { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + // lookup the state from the ingester data. If something isn't found, + // it's unexpected. Crash so someone can take a look. 
+ let shard_data = self + .shards + .get(&shard_id) + .unwrap_or_else(|| panic!("shard state for {shard_id} not in ingester data")); + let namespace = shard_data + .namespace_by_id(namespace_id) + .unwrap_or_else(|| panic!("namespace {namespace_id} not in shard {shard_id} state")); + + let partition_key; + let batch; + { + let table_data = namespace.table_id(table_id).unwrap_or_else(|| { + panic!("table {table_id} in namespace {namespace_id} not in shard {shard_id} state") + }); + + let mut guard = table_data.write().await; + let partition = guard.get_partition(partition_id).unwrap_or_else(|| { + panic!( + "partition {partition_id} in table {table_id} in namespace {namespace_id} not in shard {shard_id} state" + ) + }); + + partition_key = partition.partition_key().clone(); + batch = partition.snapshot_to_persisting_batch(); + }; + + debug!(%shard_id, %namespace_id, %table_id, %partition_id, %partition_key, "persisting partition"); + + // Check if there is any data to persist. + let batch = match batch { + Some(v) if !v.data.data.is_empty() => v, + _ => { + warn!( + %shard_id, + %namespace_id, + %table_id, + %partition_id, + %partition_key, + "partition marked for persistence contains no data" + ); + return; + } + }; + + // lookup column IDs from catalog + // TODO: this can be removed once the ingester uses column IDs internally as well + let table_schema = Backoff::new(&self.backoff_config) + .retry_all_errors("get table schema", || async { + let mut repos = self.catalog.repositories().await; + get_table_schema_by_id(table_id, repos.as_mut()).await + }) + .await + .expect("retry forever"); + // lookup the partition_info from the catalog let partition_info = Backoff::new(&self.backoff_config) .retry_all_errors("get partition_info_by_id", || async { @@ -243,217 +301,159 @@ impl Persister for IngesterData { repos.partitions().partition_info_by_id(partition_id).await }) .await - .expect("retry forever"); + .expect("retry forever").unwrap_or_else(|| panic!("partition 
{partition_id} in table {table_id} in namespace {namespace_id} in shard {shard_id} has no partition info in catalog")); - // lookup the state from the ingester data. If something isn't found, it's unexpected. Crash - // so someone can take a look. - let partition_info = partition_info - .unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id)); - let shard_data = self - .shards - .get(&partition_info.partition.shard_id) - .unwrap_or_else(|| { - panic!( - "shard state for {} not in ingester data", - partition_info.partition.shard_id - ) - }); //{ - let namespace = shard_data - .namespace(&partition_info.namespace_name) - .unwrap_or_else(|| { - panic!( - "namespace {} not in shard {} state", - partition_info.namespace_name, partition_info.partition.shard_id - ) - }); - debug!(?partition_id, ?partition_info, "persisting partition"); + // do the CPU intensive work of compaction, de-duplication and sorting + let CompactedStream { + stream: record_stream, + iox_metadata, + sort_key_update, + } = compact_persisting_batch( + Arc::new(SystemProvider::new()), + &self.exec, + namespace.namespace_id().get(), + &partition_info, + Arc::clone(&batch), + ) + .await + .expect("unable to compact persisting batch"); - // lookup column IDs from catalog - // TODO: this can be removed once the ingester uses column IDs internally as well - let table_schema = Backoff::new(&self.backoff_config) - .retry_all_errors("get table schema", || async { - let mut repos = self.catalog.repositories().await; - let table = repos - .tables() - .get_by_namespace_and_name(namespace.namespace_id(), &partition_info.table_name) - .await? - .expect("table not found in catalog"); - get_table_schema_by_id(table.id, repos.as_mut()).await - }) + // Save the compacted data to a parquet file in object storage. + // + // This call retries until it completes. 
+ let (md, file_size) = self + .store + .upload(record_stream, &iox_metadata) .await - .expect("retry forever"); + .expect("unexpected fatal persist error"); - let persisting_batch = namespace - .snapshot_to_persisting( - &partition_info.table_name, - &partition_info.partition.partition_key, - ) - .await; - - if let Some(persisting_batch) = persisting_batch { - // do the CPU intensive work of compaction, de-duplication and sorting - let compacted_stream = match compact_persisting_batch( - Arc::new(SystemProvider::new()), - &self.exec, - namespace.namespace_id().get(), - &partition_info, - Arc::clone(&persisting_batch), - ) - .await - { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to compact persisting batch with error: {:?}", e); - } - Ok(Some(r)) => r, - Ok(None) => { - warn!("persist called with no data"); - return; - } - }; - let CompactedStream { - stream: record_stream, - iox_metadata, - sort_key_update, - } = compacted_stream; - - // Save the compacted data to a parquet file in object storage. - // - // This call retries until it completes. - let (md, file_size) = self - .store - .upload(record_stream, &iox_metadata) - .await - .expect("unexpected fatal persist error"); - - // Update the sort key in the catalog if there are - // additional columns BEFORE adding parquet file to the - // catalog. If the order is reversed, the querier or - // compactor may see a parquet file with an inconsistent - // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 - if let Some(new_sort_key) = sort_key_update { - let sort_key = new_sort_key.to_columns().collect::>(); - Backoff::new(&self.backoff_config) - .retry_all_errors("update_sort_key", || async { - let mut repos = self.catalog.repositories().await; - let _partition = repos - .partitions() - .update_sort_key(partition_id, &sort_key) - .await?; - // compiler insisted on getting told the type of the error :shrug: - Ok(()) as Result<(), iox_catalog::interface::Error> - }) - .await - .expect("retry forever"); - debug!( - ?partition_id, - table = partition_info.table_name, - ?new_sort_key, - "adjusted sort key during batch compact & persist" - ); - } - - // Add the parquet file to the catalog until succeed - let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { - table_schema.columns.get(name).expect("Unknown column").id - }); - - // Assert partitions are persisted in-order. - // - // It is an invariant that partitions are persisted in order so that - // both the per-shard, and per-partition watermarks are correctly - // advanced and accurate. - if let Some(last_persist) = partition_info.partition.persisted_sequence_number { - assert!( - parquet_file.max_sequence_number > last_persist, - "out of order partition persistence, persisting {}, previously persisted {}", - parquet_file.max_sequence_number.get(), - last_persist.get(), - ); - } - - // Add the parquet file to the catalog. - // - // This has the effect of allowing the queriers to "discover" the - // parquet file by polling / querying the catalog. + // Update the sort key in the catalog if there are + // additional columns BEFORE adding parquet file to the + // catalog. If the order is reversed, the querier or + // compactor may see a parquet file with an inconsistent + // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 + if let Some(new_sort_key) = sort_key_update { + let sort_key = new_sort_key.to_columns().collect::>(); Backoff::new(&self.backoff_config) - .retry_all_errors("add parquet file to catalog", || async { + .retry_all_errors("update_sort_key", || async { let mut repos = self.catalog.repositories().await; - let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; - debug!( - ?partition_id, - table_id=?parquet_file.table_id, - parquet_file_id=?parquet_file.id, - table_name=%iox_metadata.table_name, - "parquet file written to catalog" - ); + let _partition = repos + .partitions() + .update_sort_key(partition_id, &sort_key) + .await?; // compiler insisted on getting told the type of the error :shrug: Ok(()) as Result<(), iox_catalog::interface::Error> }) .await .expect("retry forever"); - - // Update the per-partition persistence watermark, so that new - // ingester instances skip the just-persisted ops during replay. - // - // This could be transactional with the above parquet insert to - // maintain catalog consistency, though in practice it is an - // unnecessary overhead - the system can tolerate replaying the ops - // that lead to this parquet file being generated, and tolerate - // creating a parquet file containing duplicate data (remedied by - // compaction). - // - // This means it is possible to observe a parquet file with a - // max_persisted_sequence_number > - // partition.persisted_sequence_number, either in-between these - // catalog updates, or for however long it takes a crashed ingester - // to restart and replay the ops, and re-persist a file containing - // the same (or subset of) data. - // - // The above is also true of the per-shard persist marker that - // governs the ingester's replay start point, which is - // non-transactionally updated after all partitions have persisted. 
- Backoff::new(&self.backoff_config) - .retry_all_errors("set partition persist marker", || async { - self.catalog - .repositories() - .await - .partitions() - .update_persisted_sequence_number( - parquet_file.partition_id, - parquet_file.max_sequence_number, - ) - .await - }) - .await - .expect("retry forever"); - - // Record metrics - let attributes = Attributes::from([( - "shard_id", - format!("{}", partition_info.partition.shard_id).into(), - )]); - self.persisted_file_size_bytes - .recorder(attributes) - .record(file_size as u64); - - // and remove the persisted data from memory - namespace - .mark_persisted( - &partition_info.table_name, - &partition_info.partition.partition_key, - iox_metadata.max_sequence_number, - ) - .await; debug!( ?partition_id, - table_name=%partition_info.table_name, - partition_key=%partition_info.partition.partition_key, - max_sequence_number=%iox_metadata.max_sequence_number.get(), - "marked partition as persisted" + table = partition_info.table_name, + ?new_sort_key, + "adjusted sort key during batch compact & persist" ); } + + // Add the parquet file to the catalog until succeed + let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { + table_schema.columns.get(name).expect("Unknown column").id + }); + + // Assert partitions are persisted in-order. + // + // It is an invariant that partitions are persisted in order so that + // both the per-shard, and per-partition watermarks are correctly + // advanced and accurate. + if let Some(last_persist) = partition_info.partition.persisted_sequence_number { + assert!( + parquet_file.max_sequence_number > last_persist, + "out of order partition persistence, persisting {}, previously persisted {}", + parquet_file.max_sequence_number.get(), + last_persist.get(), + ); + } + + // Add the parquet file to the catalog. + // + // This has the effect of allowing the queriers to "discover" the + // parquet file by polling / querying the catalog. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("add parquet file to catalog", || async { + let mut repos = self.catalog.repositories().await; + let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; + debug!( + ?partition_id, + table_id=?parquet_file.table_id, + parquet_file_id=?parquet_file.id, + table_name=%iox_metadata.table_name, + "parquet file written to catalog" + ); + // compiler insisted on getting told the type of the error :shrug: + Ok(()) as Result<(), iox_catalog::interface::Error> + }) + .await + .expect("retry forever"); + + // Update the per-partition persistence watermark, so that new + // ingester instances skip the just-persisted ops during replay. + // + // This could be transactional with the above parquet insert to + // maintain catalog consistency, though in practice it is an + // unnecessary overhead - the system can tolerate replaying the ops + // that lead to this parquet file being generated, and tolerate + // creating a parquet file containing duplicate data (remedied by + // compaction). + // + // This means it is possible to observe a parquet file with a + // max_persisted_sequence_number > + // partition.persisted_sequence_number, either in-between these + // catalog updates, or for however long it takes a crashed ingester + // to restart and replay the ops, and re-persist a file containing + // the same (or subset of) data. + // + // The above is also true of the per-shard persist marker that + // governs the ingester's replay start point, which is + // non-transactionally updated after all partitions have persisted. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("set partition persist marker", || async { + self.catalog + .repositories() + .await + .partitions() + .update_persisted_sequence_number( + parquet_file.partition_id, + parquet_file.max_sequence_number, + ) + .await + }) + .await + .expect("retry forever"); + + // Record metrics + let attributes = Attributes::from([( + "shard_id", + format!("{}", partition_info.partition.shard_id).into(), + )]); + self.persisted_file_size_bytes + .recorder(attributes) + .record(file_size as u64); + + // and remove the persisted data from memory + let table_name = TableName::from(&partition_info.table_name); + namespace + .mark_persisted( + &table_name, + &partition_info.partition.partition_key, + iox_metadata.max_sequence_number, + ) + .await; + debug!( + ?partition_id, + %table_name, + partition_key=%partition_info.partition.partition_key, + max_sequence_number=%iox_metadata.max_sequence_number.get(), + "marked partition as persisted" + ); } async fn update_min_unpersisted_sequence_number( @@ -475,172 +475,24 @@ impl Persister for IngesterData { } } -/// Stream of snapshots. -/// -/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. -pub(crate) type SnapshotStream = - Pin> + Send>>; - -/// Response data for a single partition. -pub(crate) struct IngesterQueryPartition { - /// Stream of snapshots. - snapshots: SnapshotStream, - - /// Partition ID. - id: PartitionId, - - /// Partition persistence status. 
- status: PartitionStatus, -} - -impl std::fmt::Debug for IngesterQueryPartition { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryPartition") - .field("snapshots", &"") - .field("id", &self.id) - .field("status", &self.status) - .finish() - } -} - -impl IngesterQueryPartition { - pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { - Self { - snapshots, - id, - status, - } - } -} - -/// Stream of partitions in this response. -pub(crate) type IngesterQueryPartitionStream = - Pin> + Send>>; - -/// Response streams for querier<>ingester requests. -/// -/// The data structure is constructed to allow lazy/streaming data generation. For easier -/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. -pub struct IngesterQueryResponse { - /// Stream of partitions. - partitions: IngesterQueryPartitionStream, -} - -impl std::fmt::Debug for IngesterQueryResponse { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryResponse") - .field("partitions", &"") - .finish() - } -} - -impl IngesterQueryResponse { - /// Make a response - pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { - Self { partitions } - } - - /// Flattens the data according to the wire protocol. 
- pub fn flatten(self) -> FlatIngesterQueryResponseStream { - self.partitions - .flat_map(|partition_res| match partition_res { - Ok(partition) => { - let head = futures::stream::once(async move { - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: partition.id, - status: partition.status, - }) - }); - let tail = partition - .snapshots - .flat_map(|snapshot_res| match snapshot_res { - Ok(snapshot) => { - let schema = Arc::new(optimize_schema(&snapshot.schema())); - - let schema_captured = Arc::clone(&schema); - let head = futures::stream::once(async { - Ok(FlatIngesterQueryResponse::StartSnapshot { - schema: schema_captured, - }) - }); - - let tail = snapshot.map(move |batch_res| match batch_res { - Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { - batch: optimize_record_batch(&batch, Arc::clone(&schema))?, - }), - Err(e) => Err(e), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }) - .boxed() - } -} - -/// Flattened version of [`IngesterQueryResponse`]. -pub(crate) type FlatIngesterQueryResponseStream = - Pin> + Send>>; - -/// Element within the flat wire protocol. -#[derive(Debug, PartialEq)] -pub enum FlatIngesterQueryResponse { - /// Start a new partition. - StartPartition { - /// Partition ID. - partition_id: PartitionId, - - /// Partition persistence status. - status: PartitionStatus, - }, - - /// Start a new snapshot. - /// - /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) - /// message. - StartSnapshot { - /// Snapshot schema. - schema: Arc, - }, - - /// Add a record batch to the snapshot that was announced by the last - /// [`StartSnapshot`](Self::StartSnapshot) message. - RecordBatch { - /// Record batch. 
- batch: RecordBatch, - }, -} - #[cfg(test)] mod tests { - use std::{ - ops::DerefMut, - sync::Arc, - task::{Context, Poll}, - time::Duration, - }; + use std::{ops::DerefMut, sync::Arc, time::Duration}; - use arrow::datatypes::SchemaRef; use assert_matches::assert_matches; use data_types::{ ColumnId, ColumnSet, CompactionLevel, DeletePredicate, NamespaceSchema, NonEmptyString, ParquetFileParams, Sequence, Timestamp, TimestampRange, }; - use datafusion::physical_plan::RecordBatchStream; + use dml::{DmlDelete, DmlMeta, DmlWrite}; use futures::TryStreamExt; use iox_catalog::{mem::MemCatalog, validate_or_insert_schema}; use iox_time::Time; use metric::{MetricObserver, Observation}; - use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch}; + use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; - use schema::selection::Selection; + use uuid::Uuid; use super::*; @@ -804,17 +656,20 @@ mod tests { // limits) assert!(!should_pause); - let partition_id = { + let (table_id, partition_id) = { let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("mem").is_some()); + let n = sd.namespace(&"foo".into()).unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"mem".into()).is_some()); let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); - p.id() + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); + (mem_table.table_id(), p.partition_id()) }; - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store @@ -945,17 +800,20 @@ mod tests { assert_progress(&data, shard_index, expected_progress).await; let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + 
let n = sd.namespace(&"foo".into()).unwrap(); let partition_id; let table_id; { - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("cpu").is_some()); - let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"cpu".into()).is_some()); + let mem_table = mem_table.write().await; table_id = mem_table.table_id(); - partition_id = p.id(); + + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); + partition_id = p.partition_id(); } { // verify the partition doesn't have a sort key before any data has been persisted @@ -969,7 +827,8 @@ mod tests { assert!(partition_info.partition.sort_key.is_empty()); } - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store @@ -1061,7 +920,7 @@ mod tests { .unwrap(); assert_eq!(partition_info.partition.sort_key, vec!["time"]); - let mem_table = n.table_data("mem").unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); let mem_table = mem_table.read().await; // verify that the parquet_max_sequence_number got updated @@ -1177,7 +1036,7 @@ mod tests { // Get the namespace let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + let n = sd.namespace(&"foo".into()).unwrap(); let expected_progress = ShardProgress::new().with_buffered(SequenceNumber::new(1)); assert_progress(&data, shard_index, expected_progress).await; @@ -1336,23 +1195,28 @@ mod tests { Arc::clone(&metrics), Arc::new(SystemProvider::new()), ); - let exec = Executor::new(1); let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))); - let data = NamespaceData::new(namespace.id, shard.id, partition_provider, &*metrics); + let data = NamespaceData::new( + namespace.id, + "foo".into(), + 
shard.id, + partition_provider, + &*metrics, + ); // w1 should be ignored because the per-partition replay offset is set // to 1 already, so it shouldn't be buffered and the buffer should // remain empty. let should_pause = data - .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle(), &exec) + .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle()) .await .unwrap(); { - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; - let p = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( p.max_persisted_sequence_number(), Some(SequenceNumber::new(1)) @@ -1362,13 +1226,13 @@ mod tests { assert!(!should_pause); // w2 should be in the buffer - data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle(), &exec) + data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle()) .await .unwrap(); - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; - let partition = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( partition.data.buffer.as_ref().unwrap().min_sequence_number, SequenceNumber::new(2) @@ -1454,19 +1318,6 @@ mod tests { .await .unwrap(); - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name) - .unwrap() - .table_data("mem") - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - None, - ); - let predicate = DeletePredicate { range: TimestampRange::new(1, 2), exprs: vec![], @@ -1485,19 +1336,6 @@ mod tests { data.buffer_operation(shard1.id, DmlOperation::Delete(d1), &manager.handle()) .await .unwrap(); - - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name) - .unwrap() - 
.table_data("mem") - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - Some(SequenceNumber::new(2)), - ); } /// Verifies that the progress in data is the same as expected_progress @@ -1513,132 +1351,4 @@ mod tests { assert_eq!(progresses, expected_progresses); } - - #[tokio::test] - async fn test_ingester_query_response_flatten() { - let batch_1_1 = lp_to_batch("table x=1 0"); - let batch_1_2 = lp_to_batch("table x=2 1"); - let batch_2 = lp_to_batch("table y=1 10"); - let batch_3 = lp_to_batch("table z=1 10"); - - let schema_1 = batch_1_1.schema(); - let schema_2 = batch_2.schema(); - let schema_3 = batch_3.schema(); - - let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([ - Ok(Box::pin(TestRecordBatchStream::new( - vec![ - Ok(batch_1_1.clone()), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(batch_1_2.clone()), - ], - Arc::clone(&schema_1), - )) as _), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(Box::pin(TestRecordBatchStream::new( - vec![Ok(batch_2.clone())], - Arc::clone(&schema_2), - )) as _), - Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), - ])), - PartitionId::new(2), - PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: Some(SequenceNumber::new(1)), - }, - )), - Err(ArrowError::IoError("some io error".into())), - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([])), - PartitionId::new(1), - PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, - }, - )), - ]))); - - let actual: Vec<_> = response.flatten().collect().await; - let expected = vec![ - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(2), - status: PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: Some(SequenceNumber::new(1)), - }, - }), - 
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), - Err(ArrowError::IoError("some io error".into())), - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(1), - status: PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, - }, - }), - ]; - - assert_eq!(actual.len(), expected.len()); - for (actual, expected) in actual.into_iter().zip(expected) { - match (actual, expected) { - (Ok(actual), Ok(expected)) => { - assert_eq!(actual, expected); - } - (Err(_), Err(_)) => { - // cannot compare `ArrowError`, but it's unlikely that someone changed the error - } - (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"), - (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), - } - } - } - - fn lp_to_batch(lp: &str) -> RecordBatch { - lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() - } - - pub struct TestRecordBatchStream { - schema: SchemaRef, - batches: Vec>, - } - - impl TestRecordBatchStream { - pub fn new(batches: Vec>, schema: SchemaRef) -> Self { - Self { schema, batches } - } - } - - impl RecordBatchStream for TestRecordBatchStream { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - } - - impl futures::Stream for TestRecordBatchStream { - type Item = Result; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - _: &mut Context<'_>, - ) -> Poll> { - if self.batches.is_empty() { - Poll::Ready(None) - } else { - Poll::Ready(Some(self.batches.remove(0))) - } - } - - fn 
size_hint(&self) -> (usize, Option) { - (self.batches.len(), Some(self.batches.len())) - } - } } diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 6a5ddb9581..9aa414a535 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -1,36 +1,91 @@ //! Namespace level data buffer structures. -use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId}; +use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; +use observability_deps::tracing::warn; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use write_summary::ShardProgress; #[cfg(test)] use super::triggers::TestTriggers; use super::{ - partition::{resolver::PartitionProvider, PersistingBatch}, - table::TableData, + partition::resolver::PartitionProvider, + table::{TableData, TableName}, }; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`TableData`] can be looked up by name, or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_name: HashMap>>, + by_id: HashMap>>, +} + +impl DoubleRef { + fn insert(&mut self, t: TableData) -> Arc> { + let name = t.table_name().clone(); + let id = t.table_id(); + + let t = Arc::new(tokio::sync::RwLock::new(t)); + self.by_name.insert(name, Arc::clone(&t)); + self.by_id.insert(id, Arc::clone(&t)); + t + } + + fn by_name(&self, name: &TableName) -> Option>> { + self.by_name.get(name).map(Arc::clone) + } + + fn by_id(&self, id: TableId) -> Option>> { + self.by_id.get(&id).map(Arc::clone) + } +} + +/// The string name / identifier of a Namespace. +/// +/// A reference-counted, cheap clone-able string. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) struct NamespaceName(Arc); + +impl From for NamespaceName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::ops::Deref for NamespaceName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for NamespaceName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Data of a Namespace that belongs to a given Shard #[derive(Debug)] pub(crate) struct NamespaceData { namespace_id: NamespaceId, + namespace_name: NamespaceName, /// The catalog ID of the shard this namespace is being populated from. shard_id: ShardId, - tables: RwLock>>>, + tables: RwLock, table_count: U64Counter, /// The resolver of `(shard_id, table_id, partition_key)` to @@ -87,8 +142,9 @@ pub(crate) struct NamespaceData { impl NamespaceData { /// Initialize new tables with default partition template of daily - pub fn new( + pub(super) fn new( namespace_id: NamespaceId, + namespace_name: NamespaceName, shard_id: ShardId, partition_provider: Arc, metrics: &metric::Registry, @@ -102,6 +158,7 @@ impl NamespaceData { Self { namespace_id, + namespace_name, shard_id, tables: Default::default(), table_count, @@ -120,7 +177,6 @@ impl NamespaceData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { let sequence_number = dml_operation .meta() @@ -146,6 +202,7 @@ impl NamespaceData { .clone(); for (t, b) in write.into_tables() { + let t = TableName::from(t); let table_data = match self.table_data(&t) { Some(t) => t, None => self.insert_table(&t, catalog).await?, @@ -171,19 +228,17 @@ impl NamespaceData { Ok(pause_writes) } DmlOperation::Delete(delete) => { - let table_name = delete.table_name().context(super::TableNotPresentSnafu)?; - let table_data = match self.table_data(table_name) { - Some(t) => t, - None => 
self.insert_table(table_name, catalog).await?, - }; + // Deprecated delete support: + // https://github.com/influxdata/influxdb_iox/issues/5825 + warn!( + shard_id=%self.shard_id, + namespace_name=%self.namespace_name, + namespace_id=%self.namespace_id, + table_name=?delete.table_name(), + sequence_number=?delete.meta().sequence(), + "discarding unsupported delete op" + ); - let mut table_data = table_data.write().await; - - table_data - .buffer_delete(delete.predicate(), sequence_number, &**catalog, executor) - .await?; - - // don't pause writes since deletes don't count towards memory limits Ok(false) } } @@ -194,16 +249,16 @@ impl NamespaceData { #[cfg(test)] // Only used in tests pub(crate) async fn snapshot( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, ) -> Option<( Vec>, - Option>, + Option>, )> { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; - return t.partition_data.get_mut(partition_key).map(|p| { + return t.get_partition_by_key_mut(partition_key).map(|p| { p.data .generate_snapshot() .expect("snapshot on mutable batch should never fail"); @@ -217,17 +272,17 @@ impl NamespaceData { /// Snapshots the mutable buffer for the partition, which clears it out and then moves all /// snapshots over to a persisting batch, which is returned. If there is no data to snapshot /// or persist, None will be returned. 
+ #[cfg(test)] // Only used in tests pub(crate) async fn snapshot_to_persisting( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, - ) -> Option> { + ) -> Option> { if let Some(table_data) = self.table_data(table_name) { let mut table_data = table_data.write().await; return table_data - .partition_data - .get_mut(partition_key) + .get_partition_by_key_mut(partition_key) .and_then(|partition_data| partition_data.snapshot_to_persisting_batch()); } @@ -237,45 +292,55 @@ impl NamespaceData { /// Gets the buffered table data pub(crate) fn table_data( &self, - table_name: &str, + table_name: &TableName, ) -> Option>> { let t = self.tables.read(); - t.get(table_name).cloned() + t.by_name(table_name) + } + + /// Return the table data by ID. + pub(crate) fn table_id( + &self, + table_id: TableId, + ) -> Option>> { + let t = self.tables.read(); + t.by_id(table_id) } /// Inserts the table or returns it if it happens to be inserted by some other thread async fn insert_table( &self, - table_name: &str, + table_name: &TableName, catalog: &Arc, ) -> Result>, super::Error> { let mut repos = catalog.repositories().await; + let info = repos .tables() .get_table_persist_info(self.shard_id, self.namespace_id, table_name) .await .context(super::CatalogSnafu)? - .context(super::TableNotFoundSnafu { table_name })?; + .ok_or_else(|| super::Error::TableNotFound { + table_name: table_name.to_string(), + })?; let mut t = self.tables.write(); - let data = match t.entry(table_name.to_string()) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new( + Ok(match t.by_name(table_name) { + Some(v) => v, + None => { + self.table_count.inc(1); + + // Insert the table and then return a ref to it. 
+ t.insert(TableData::new( info.table_id, - table_name, + table_name.clone(), self.shard_id, self.namespace_id, - info.tombstone_max_sequence_number, Arc::clone(&self.partition_provider), - )))); - self.table_count.inc(1); - Arc::clone(v) + )) } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - - Ok(data) + }) } /// Walks down the table and partition and clears the persisting batch. The sequence number is @@ -283,13 +348,13 @@ impl NamespaceData { /// data buffer. pub(super) async fn mark_persisted( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, sequence_number: SequenceNumber, ) { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; - let partition = t.partition_data.get_mut(partition_key); + let partition = t.get_partition_by_key_mut(partition_key); if let Some(p) = partition { p.mark_persisted(sequence_number); @@ -299,7 +364,7 @@ impl NamespaceData { /// Return progress from this Namespace pub(super) async fn progress(&self) -> ShardProgress { - let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect(); + let tables: Vec<_> = self.tables.read().by_id.values().map(Arc::clone).collect(); // Consolidate progtress across partitions. let mut progress = ShardProgress::new() @@ -323,6 +388,12 @@ impl NamespaceData { pub(super) fn table_count(&self) -> &U64Counter { &self.table_count } + + /// Returns the [`NamespaceName`] for this namespace. 
+ #[cfg(test)] + pub(crate) fn namespace_name(&self) -> &NamespaceName { + &self.namespace_name + } } /// RAAI struct that sets buffering sequence number on creation and clears it on free @@ -357,3 +428,92 @@ impl<'a> Drop for ScopedSequenceNumber<'a> { *buffering_sequence_number = None; } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_namespace_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let ns = NamespaceData::new( + ns_id, + NAMESPACE_NAME.into(), + shard_id, + partition_provider, + &*metrics, + ); + + // Assert the namespace name was stored + assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME); + + // Assert the namespace does not contain the test data + assert!(ns.table_data(&TABLE_NAME.into()).is_none()); + assert!(ns.table_id(table_id).is_none()); + + // Write some test data + ns.buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(ns.table_data(&TABLE_NAME.into()).is_some()); + assert!(ns.table_id(table_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_tables_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 1ec531fdbc..61dd4c36d2 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -3,18 +3,21 @@ use std::sync::Arc; use arrow::record_batch::RecordBatch; -use data_types::{ - NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, Tombstone, -}; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; -use schema::selection::Selection; +use 
observability_deps::tracing::*; +use schema::{selection::Selection, sort::SortKey}; use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; -use self::buffer::{BufferBatch, DataBuffer}; -use crate::{data::query_dedup::query, query::QueryableBatch}; +use self::{ + buffer::{BufferBatch, DataBuffer}, + resolver::DeferredSortKey, +}; +use crate::{querier_handler::PartitionStatus, query::QueryableBatch}; + +use super::table::TableName; mod buffer; pub mod resolver; @@ -28,20 +31,6 @@ pub(crate) struct UnpersistedPartitionData { pub(crate) partition_status: PartitionStatus, } -/// Status of a partition that has unpersisted data. -/// -/// Note that this structure is specific to a partition (which itself is bound to a table and -/// shard)! -#[derive(Debug, Clone, PartialEq, Eq)] -#[allow(missing_copy_implementations)] -pub struct PartitionStatus { - /// Max sequence number persisted - pub parquet_max_sequence_number: Option, - - /// Max sequence number for a tombstone - pub tombstone_max_sequence_number: Option, -} - /// PersistingBatch contains all needed info and data for creating /// a parquet file for given set of SnapshotBatches #[derive(Debug, PartialEq, Clone)] @@ -132,7 +121,28 @@ impl SnapshotBatch { } } -/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard +/// The load state of the [`SortKey`] for a given partition. +#[derive(Debug)] +pub(crate) enum SortKeyState { + /// The [`SortKey`] has not yet been fetched from the catalog, and will be + /// lazy loaded (or loaded in the background) by a call to + /// [`DeferredSortKey::get()`]. + Deferred(DeferredSortKey), + /// The sort key is known and specified. 
+ Provided(Option), +} + +impl SortKeyState { + async fn get(&self) -> Option { + match self { + Self::Deferred(v) => v.get().await, + Self::Provided(v) => v.clone(), + } + } +} + +/// Data of an IOx Partition of a given Table of a Namespace that belongs to a +/// given Shard #[derive(Debug)] pub struct PartitionData { /// The catalog ID of the partition this buffer is for. @@ -140,12 +150,23 @@ pub struct PartitionData { /// The string partition key for this partition. partition_key: PartitionKey, + /// The sort key of this partition. + /// + /// This can known, in which case this field will contain a + /// [`SortKeyState::Provided`] with the [`SortKey`], or unknown with a value + /// of [`SortKeyState::Deferred`] causing it to be loaded from the catalog + /// (potentially) in the background or at read time. + /// + /// Callers should use [`Self::sort_key()`] to be abstracted away from these + /// fetch details. + sort_key: SortKeyState, + /// The shard, namespace & table IDs for this partition. shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, /// The name of the table this partition is part of. 
- table_name: Arc, + table_name: TableName, pub(super) data: DataBuffer, @@ -156,18 +177,21 @@ pub struct PartitionData { impl PartitionData { /// Initialize a new partition data buffer + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: PartitionId, partition_key: PartitionKey, shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, + sort_key: SortKeyState, max_persisted_sequence_number: Option, ) -> Self { Self { id, partition_key, + sort_key, shard_id, namespace_id, table_id, @@ -209,100 +233,36 @@ impl PartitionData { sequence_number: SequenceNumber, mb: MutableBatch, ) -> Result<(), super::Error> { - match &mut self.data.buffer { + let (min_sequence_number, max_sequence_number) = match &mut self.data.buffer { Some(buf) => { buf.max_sequence_number = sequence_number.max(buf.max_sequence_number); buf.data.extend_from(&mb).context(super::BufferWriteSnafu)?; + (buf.min_sequence_number, buf.max_sequence_number) } None => { self.data.buffer = Some(BufferBatch { min_sequence_number: sequence_number, max_sequence_number: sequence_number, data: mb, - }) + }); + (sequence_number, sequence_number) } - } + }; + trace!( + min_sequence_number=?min_sequence_number, + max_sequence_number=?max_sequence_number, + "buffered write" + ); Ok(()) } - /// Buffers a new tombstone: - /// . All the data in the `buffer` and `snapshots` will be replaced with one - /// tombstone-applied snapshot - /// . 
The tombstone is only added in the `deletes_during_persisting` if the `persisting` - /// exists - pub(super) async fn buffer_tombstone(&mut self, executor: &Executor, tombstone: Tombstone) { - self.data.add_tombstone(tombstone.clone()); - - // ---------------------------------------------------------- - // First apply the tombstone on all in-memory & non-persisting data - // Make a QueryableBatch for all buffer + snapshots + the given tombstone - let max_sequence_number = tombstone.sequence_number; - let query_batch = match self.data.snapshot_to_queryable_batch( - &self.table_name, - self.id, - Some(tombstone.clone()), - ) { - Some(query_batch) if !query_batch.is_empty() => query_batch, - _ => { - // No need to proceed further - return; - } - }; - - let (min_sequence_number, _) = query_batch.min_max_sequence_numbers(); - assert!(min_sequence_number <= max_sequence_number); - - // Run query on the QueryableBatch to apply the tombstone. - let stream = match query(executor, Arc::new(query_batch)).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to apply tombstones on snapshots: {:?}", e); - } - Ok(stream) => stream, - }; - let record_batches = match datafusion::physical_plan::common::collect(stream).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. 
- panic!("unable to collect record batches: {:?}", e); - } - Ok(batches) => batches, - }; - - // Merge all result record batches into one record batch - // and make a snapshot for it - let snapshot = if !record_batches.is_empty() { - let record_batch = - arrow::compute::concat_batches(&record_batches[0].schema(), &record_batches) - .unwrap_or_else(|e| { - panic!("unable to concat record batches: {:?}", e); - }); - let snapshot = SnapshotBatch { - min_sequence_number, - max_sequence_number, - data: Arc::new(record_batch), - }; - - Some(Arc::new(snapshot)) - } else { - None - }; - - // ---------------------------------------------------------- - // Add the tombstone-applied data back in as one snapshot - if let Some(snapshot) = snapshot { - self.data.snapshots.push(snapshot); - } - } - /// Return the progress from this Partition pub(super) fn progress(&self) -> ShardProgress { self.data.progress() } - pub(super) fn id(&self) -> PartitionId { + pub(super) fn partition_id(&self) -> PartitionId { self.id } @@ -347,6 +307,13 @@ impl PartitionData { pub fn namespace_id(&self) -> NamespaceId { self.namespace_id } + + /// Return the [`SortKey`] for this partition. + /// + /// NOTE: this MAY involve querying the catalog with unbounded retries. 
+ pub async fn sort_key(&self) -> Option { + self.sort_key.get().await + } } #[cfg(test)] @@ -355,7 +322,6 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use super::*; - use crate::test_util::create_tombstone; #[test] fn snapshot_buffer_different_but_compatible_schemas() { @@ -366,6 +332,7 @@ mod tests { NamespaceId::new(42), TableId::new(1), "foo".into(), + SortKeyState::Provided(None), None, ); @@ -401,7 +368,7 @@ mod tests { // Test deletes mixed with writes on a single parittion #[tokio::test] - async fn writes_and_deletes() { + async fn writes() { // Make a partition with empty DataBuffer let s_id = 1; let t_id = 1; @@ -413,9 +380,9 @@ mod tests { NamespaceId::new(42), TableId::new(t_id), "restaurant".into(), + SortKeyState::Provided(None), None, ); - let exec = Executor::new(1); // ------------------------------------------ // Fill `buffer` @@ -438,42 +405,8 @@ mod tests { SequenceNumber::new(2) ); assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 0); assert_eq!(p.data.persisting, None); - // ------------------------------------------ - // Delete - // --- seq_num: 3 - let ts = create_tombstone( - 1, // tombstone id - t_id, // table id - s_id, // shard id - 3, // delete's seq_number - 0, // min time of data to get deleted - 20, // max time of data to get deleted - "day=thu", // delete predicate - ); - // one row will get deleted, the other is moved to snapshot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has one row since the other one got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+--------+-----+------+--------------------------------+", - "| city | day | temp | time |", - 
"+--------+-----+------+--------------------------------+", - "| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |", - "+--------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 3); - // ------------------------------------------ // Fill `buffer` // --- seq_num: 4 @@ -493,50 +426,15 @@ mod tests { // verify data assert_eq!( p.data.buffer.as_ref().unwrap().min_sequence_number, - SequenceNumber::new(4) + SequenceNumber::new(1) ); assert_eq!( p.data.buffer.as_ref().unwrap().max_sequence_number, SequenceNumber::new(5) ); - assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot - assert_eq!(p.data.deletes_during_persisting().len(), 0); + assert_eq!(p.data.snapshots.len(), 0); assert_eq!(p.data.persisting, None); - - // ------------------------------------------ - // Delete - // --- seq_num: 6 - let ts = create_tombstone( - 2, // tombstone id - t_id, // table id - s_id, // shard id - 6, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "city=Boston", // delete predicate - ); - // two rows will get deleted, one from existing snapshot, one from the buffer being moved - // to snpashot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has two rows since the other 2 rows with city=Boston have got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+---------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+---------+-----+------+--------------------------------+", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - 
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "+---------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 6); + assert!(p.data.buffer.is_some()); // ------------------------------------------ // Persisting @@ -545,32 +443,12 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // always empty after issuing persit assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit - assert_eq!(p.data.deletes_during_persisting().len(), 0); // deletes not happen yet assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // ------------------------------------------ - // Delete - // --- seq_num: 7 - let ts = create_tombstone( - 3, // tombstone id - t_id, // table id - s_id, // shard id - 7, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=55", // delete predicate - ); - // if a query come while persisting, the row with temp=55 will be deleted before - // data is sent back to Querier - p.buffer_tombstone(&exec, ts).await; - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - // no snpashots becasue buffer has not data yet and the - // snapshot was empty too - assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 1); // tombstone added since data is - // persisting + assert!(p.data.buffer.is_none()); + assert_eq!(p.data.snapshots.len(), 0); // no snpashots becasue buffer has not data yet and the + // snapshot was empty too assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -591,7 +469,6 @@ mod tests { SequenceNumber::new(8) ); // 1 newly added mutable batch of 3 rows of data assert_eq!(p.data.snapshots.len(), 0); // still empty - 
assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -600,7 +477,6 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // empty after snapshot assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer - assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // snapshot has three rows moved from buffer let data = (*p.data.snapshots[0].data).clone(); @@ -616,41 +492,5 @@ mod tests { assert_batches_sorted_eq!(&expected, &[data]); assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 8); - - // ------------------------------------------ - // Delete - // --- seq_num: 9 - let ts = create_tombstone( - 4, // tombstone id - t_id, // table id - s_id, // shard id - 9, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=60", // delete predicate - ); - // the row with temp=60 will be removed from the sanphot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied - assert_eq!(p.data.deletes_during_persisting().len(), 2); // one more tombstone added make it 2 - assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // snapshot has only 2 rows because the row with tem=60 was removed - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |", - "| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |", - "+------------+-----+------+--------------------------------+", 
- ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 9); - - exec.join().await; } } diff --git a/ingester/src/data/partition/buffer.rs b/ingester/src/data/partition/buffer.rs index 739da735fa..866e7a966c 100644 --- a/ingester/src/data/partition/buffer.rs +++ b/ingester/src/data/partition/buffer.rs @@ -2,13 +2,15 @@ use std::sync::Arc; -use data_types::{PartitionId, SequenceNumber, ShardId, TableId, Tombstone}; +use data_types::{PartitionId, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; use schema::selection::Selection; use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; +use crate::data::table::TableName; + use super::{PersistingBatch, QueryableBatch, SnapshotBatch}; /// Data of an IOx partition split into batches @@ -38,14 +40,6 @@ pub(crate) struct DataBuffer { /// Buffer of incoming writes pub(crate) buffer: Option, - /// Buffer of tombstones whose time range may overlap with this partition. - /// All tombstones were already applied to corresponding snapshots. This list - /// only keep the ones that come during persisting. The reason - /// we keep them becasue if a query comes, we need to apply these tombstones - /// on the persiting data before sending it to the Querier - /// When the `persiting` is done and removed, this list will get empty, too - deletes_during_persisting: Vec, - /// Data in `buffer` will be moved to a `snapshot` when one of these happens: /// . A background persist is called /// . A read request from Querier @@ -70,14 +64,6 @@ pub(crate) struct DataBuffer { } impl DataBuffer { - /// Add a new tombstones into the [`DataBuffer`]. 
- pub(super) fn add_tombstone(&mut self, tombstone: Tombstone) { - // Only keep this tombstone if some data is being persisted - if self.persisting.is_some() { - self.deletes_during_persisting.push(tombstone); - } - } - /// If a [`BufferBatch`] exists, convert it to a [`SnapshotBatch`] and add /// it to the list of snapshots. /// @@ -109,9 +95,8 @@ impl DataBuffer { /// Both buffer and snapshots will be empty after this pub(super) fn snapshot_to_queryable_batch( &mut self, - table_name: &Arc, + table_name: &TableName, partition_id: PartitionId, - tombstone: Option, ) -> Option { self.generate_snapshot() .expect("This mutable batch snapshot error should be impossible."); @@ -119,21 +104,11 @@ impl DataBuffer { let mut data = vec![]; std::mem::swap(&mut data, &mut self.snapshots); - let mut tombstones = vec![]; - if let Some(tombstone) = tombstone { - tombstones.push(tombstone); - } - // only produce batch if there is any data if data.is_empty() { None } else { - Some(QueryableBatch::new( - Arc::clone(table_name), - partition_id, - data, - tombstones, - )) + Some(QueryableBatch::new(table_name.clone(), partition_id, data)) } } @@ -164,15 +139,13 @@ impl DataBuffer { shard_id: ShardId, table_id: TableId, partition_id: PartitionId, - table_name: &Arc, + table_name: &TableName, ) -> Option> { if self.persisting.is_some() { panic!("Unable to snapshot while persisting. 
This is an unexpected state.") } - if let Some(queryable_batch) = - self.snapshot_to_queryable_batch(table_name, partition_id, None) - { + if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, partition_id) { let persisting_batch = Arc::new(PersistingBatch { shard_id, table_id, @@ -197,12 +170,7 @@ impl DataBuffer { }; // persisting data - let mut queryable_batch = (*persisting.data).clone(); - - // Add new tombstones if any - queryable_batch.add_tombstones(&self.deletes_during_persisting); - - Some(queryable_batch) + Some((*persisting.data).clone()) } /// Return the progress in this DataBuffer @@ -239,12 +207,6 @@ impl DataBuffer { pub(crate) fn mark_persisted(&mut self) { self.persisting = None; - self.deletes_during_persisting.clear() - } - - #[cfg(test)] - pub(super) fn deletes_during_persisting(&self) -> &[Tombstone] { - self.deletes_during_persisting.as_ref() } } diff --git a/ingester/src/data/partition/resolver/cache.rs b/ingester/src/data/partition/resolver/cache.rs index 0dda53f057..7f282ae38c 100644 --- a/ingester/src/data/partition/resolver/cache.rs +++ b/ingester/src/data/partition/resolver/cache.rs @@ -1,13 +1,18 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use async_trait::async_trait; +use backoff::BackoffConfig; use data_types::{ NamespaceId, Partition, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, }; +use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::{ + partition::{resolver::DeferredSortKey, PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -43,6 +48,18 @@ struct Entry { /// Each cache hit _removes_ the entry from the cache - this eliminates the /// memory overhead for items that were hit. This is the expected (only valid!) /// usage pattern. 
+/// +/// # Deferred Sort Key Loading +/// +/// This cache does NOT cache the [`SortKey`] for each [`PartitionData`], as the +/// sort key can be large and is likely unique per table, and thus not +/// share-able across instances / prohibitively expensive to cache. +/// +/// Instead cached instances are returned with a deferred sort key resolver +/// which attempts to fetch the sort key in the background some time after +/// construction. +/// +/// [`SortKey`]: schema::sort::SortKey #[derive(Debug)] pub(crate) struct PartitionCache { // The inner delegate called for a cache miss. @@ -59,13 +76,31 @@ pub(crate) struct PartitionCache { /// a faster search for cache misses. #[allow(clippy::type_complexity)] entries: Mutex>>>, + + /// Data needed to construct the [`DeferredSortKey`] for cached entries. + catalog: Arc, + backoff_config: BackoffConfig, + /// The maximum amount of time a [`DeferredSortKey`] may wait until + /// pre-fetching the sort key in the background. + max_smear: Duration, } impl PartitionCache { /// Initialise a [`PartitionCache`] containing the specified partitions. /// /// Any cache miss is passed through to `inner`. - pub(crate) fn new

(inner: T, partitions: P) -> Self + /// + /// Any cache hit returns a [`PartitionData`] configured with a + /// [`SortKeyState::Deferred`] for deferred key loading in the background. + /// The [`DeferredSortKey`] is initialised with the given `catalog`, + /// `backoff_config`, and `max_smear` maximal load wait duration. + pub(crate) fn new

( + inner: T, + partitions: P, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self where P: IntoIterator, { @@ -97,6 +132,9 @@ impl PartitionCache { Self { entries: Mutex::new(entries), inner, + catalog, + backoff_config, + max_smear, } } @@ -154,7 +192,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { // Use the cached PartitionKey instead of the caller's partition_key, // instead preferring to reuse the already-shared Arc in the cache. @@ -171,6 +209,12 @@ where namespace_id, table_id, table_name, + SortKeyState::Deferred(DeferredSortKey::new( + cached.partition_id, + self.max_smear, + Arc::clone(&self.catalog), + self.backoff_config.clone(), + )), cached.max_sequence_number, ); } @@ -186,6 +230,8 @@ where #[cfg(test)] mod tests { + use iox_catalog::mem::MemCatalog; + + use crate::data::partition::resolver::MockPartitionProvider; + use super::*; @@ -197,6 +243,22 @@ mod tests { const TABLE_ID: TableId = TableId::new(3); const TABLE_NAME: &str = "platanos"; + fn new_cache

( + inner: MockPartitionProvider, + partitions: P, + ) -> PartitionCache + where + P: IntoIterator, + { + PartitionCache::new( + inner, + partitions, + Duration::from_secs(10_000_000), + Arc::new(MemCatalog::new(Arc::new(metric::Registry::default()))), + BackoffConfig::default(), + ) + } + #[tokio::test] async fn test_miss() { let data = PartitionData::new( @@ -206,11 +268,12 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ); let inner = MockPartitionProvider::default().with_partition(data); - let cache = PartitionCache::new(inner, []); + let cache = new_cache(inner, []); let got = cache .get_partition( PARTITION_KEY.into(), @@ -221,7 +284,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -238,11 +301,11 @@ mod tests { shard_id: SHARD_ID, table_id: TABLE_ID, partition_key: stored_partition_key.clone(), - sort_key: Default::default(), + sort_key: vec!["dos".to_string(), "bananas".to_string()], persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let callers_partition_key = PartitionKey::from(PARTITION_KEY); let got = cache @@ -255,7 +318,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -274,7 +337,7 @@ mod tests { } #[tokio::test] - async fn test_miss_partition_jey() { + async fn test_miss_partition_key() { let other_key = PartitionKey::from("test"); let other_key_id = PartitionId::new(99); let inner = MockPartitionProvider::default().with_partition(PartitionData::new( @@ -284,6 +347,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + 
SortKeyState::Provided(None), None, )); @@ -296,7 +360,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( other_key.clone(), @@ -307,7 +371,7 @@ mod tests { ) .await; - assert_eq!(got.id(), other_key_id); + assert_eq!(got.partition_id(), other_key_id); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -323,6 +387,7 @@ mod tests { NAMESPACE_ID, other_table, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -335,7 +400,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), @@ -346,7 +411,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), other_table); assert_eq!(got.table_name(), TABLE_NAME); @@ -362,6 +427,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -374,7 +440,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), @@ -385,7 +451,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), other_shard); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); diff --git a/ingester/src/data/partition/resolver/catalog.rs b/ingester/src/data/partition/resolver/catalog.rs index 8035546be6..e42c4876c4 100644 --- a/ingester/src/data/partition/resolver/catalog.rs +++ b/ingester/src/data/partition/resolver/catalog.rs @@ -9,7 +9,10 @@ 
use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId}; use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; -use crate::data::partition::PartitionData; +use crate::data::{ + partition::{PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -55,7 +58,7 @@ impl PartitionProvider for CatalogPartitionResolver { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { debug!( %partition_key, @@ -78,6 +81,7 @@ impl PartitionProvider for CatalogPartitionResolver { namespace_id, table_id, table_name, + SortKeyState::Provided(p.sort_key()), p.persisted_sequence_number, ) } @@ -131,7 +135,7 @@ mod tests { }; let callers_partition_key = PartitionKey::from(PARTITION_KEY); - let table_name = TABLE_NAME.into(); + let table_name = TableName::from(TABLE_NAME); let resolver = CatalogPartitionResolver::new(Arc::clone(&catalog)); let got = resolver .get_partition( @@ -139,11 +143,12 @@ mod tests { shard_id, namespace_id, table_id, - Arc::clone(&table_name), + table_name.clone(), ) .await; assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); + assert_eq!(got.sort_key().await, None); assert_eq!(got.max_persisted_sequence_number(), None); assert!(got.partition_key.ptr_eq(&callers_partition_key)); diff --git a/ingester/src/data/partition/resolver/mock.rs b/ingester/src/data/partition/resolver/mock.rs index e65f127ef4..80f859c43e 100644 --- a/ingester/src/data/partition/resolver/mock.rs +++ b/ingester/src/data/partition/resolver/mock.rs @@ -1,12 +1,12 @@ //! A mock [`PartitionProvider`] to inject [`PartitionData`] for tests. 
-use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; use super::r#trait::PartitionProvider; @@ -58,7 +58,7 @@ impl PartitionProvider for MockPartitionProvider { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { let p = self .partitions diff --git a/ingester/src/data/partition/resolver/mod.rs b/ingester/src/data/partition/resolver/mod.rs index fcb5e5fb6a..904eb781f5 100644 --- a/ingester/src/data/partition/resolver/mod.rs +++ b/ingester/src/data/partition/resolver/mod.rs @@ -11,6 +11,9 @@ pub use r#trait::*; mod catalog; pub use catalog::*; +mod sort_key; +pub(crate) use sort_key::*; + #[cfg(test)] mod mock; #[cfg(test)] diff --git a/ingester/src/data/partition/resolver/sort_key.rs b/ingester/src/data/partition/resolver/sort_key.rs new file mode 100644 index 0000000000..36e3ee5f1a --- /dev/null +++ b/ingester/src/data/partition/resolver/sort_key.rs @@ -0,0 +1,331 @@ +//! An optimised resolver of a partition [`SortKey`]. + +use std::{sync::Arc, time::Duration}; + +use backoff::{Backoff, BackoffConfig}; +use data_types::PartitionId; +use iox_catalog::interface::Catalog; +use parking_lot::Mutex; +use rand::Rng; +use schema::sort::SortKey; +use tokio::task::JoinHandle; + +/// The states of a [`DeferredSortKey`] instance. +#[derive(Debug)] +enum State { + /// The value has not yet been fetched by the background task. + Unresolved, + /// The value was fetched by the background task and is ready to be consumed. + Resolved(Option), +} + +/// A resolver of [`SortKey`] from the catalog for a given partition.
+/// +/// This implementation combines lazy / deferred loading of the [`SortKey`] from +/// the [`Catalog`], and a background timer that pre-fetches the [`SortKey`] +/// after some random duration of time. Combined, these behaviours smear the +/// [`SortKey`] queries across the allowable time range, avoiding a large number +/// of queries from executing when multiple [`SortKey`] are needed in the system +/// at one point in time. +/// +/// If the [`DeferredSortKey`] is dropped and the background task is still +/// incomplete (sleeping / actively fetching the [`SortKey`]) it is aborted +/// immediately. The background task exits once it has successfully fetched the +/// [`SortKey`]. +/// +/// # Stale Cached Values +/// +/// This is effectively a cache that is pre-warmed in the background - this +/// necessitates that the caller can tolerate, or determine, stale values. +#[derive(Debug)] +pub(crate) struct DeferredSortKey { + value: Arc>, + partition_id: PartitionId, + + handle: JoinHandle<()>, + + backoff_config: BackoffConfig, + catalog: Arc, +} + +impl DeferredSortKey { + /// Construct a [`DeferredSortKey`] instance that fetches the [`SortKey`] + /// for the specified `partition_id`. + /// + /// The background task will wait a uniformly random duration of time + /// between `[0, max_smear)` before attempting to pre-fetch the [`SortKey`] + /// from `catalog`. + pub(crate) fn new( + partition_id: PartitionId, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self { + // Init the value container the background thread populates. + let value = Arc::new(Mutex::new(State::Unresolved)); + + // Select random duration from a uniform distribution, up to the + // configured maximum. + let wait_for = rand::thread_rng().gen_range(Duration::ZERO..max_smear); + + // Spawn the background task, sleeping for the random duration of time + // before fetching the sort key.
+ let handle = tokio::spawn({ + let value = Arc::clone(&value); + let catalog = Arc::clone(&catalog); + let backoff_config = backoff_config.clone(); + async move { + // Sleep for the random duration + tokio::time::sleep(wait_for).await; + // Fetch the sort key from the catalog + let v = fetch(partition_id, &*catalog, &backoff_config).await; + // And attempt to update the value container, if it hasn't + // already resolved + let mut state = value.lock(); + *state = match *state { + State::Unresolved => State::Resolved(v), + State::Resolved(_) => return, + }; + } + }); + + Self { + value, + partition_id, + handle, + backoff_config, + catalog, + } + } + + /// Read the [`SortKey`] for the partition. + /// + /// If the [`SortKey`] was pre-fetched in the background, it is returned + /// immediately. If the [`SortKey`] has not yet been resolved, this call + /// blocks while it is read from the [`Catalog`]. + /// + /// # Concurrency + /// + /// If this method requires resolving the [`SortKey`], N concurrent callers + /// will cause N queries against the catalog. + /// + /// # Await Safety + /// + /// Cancelling the future returned by calling [`Self::get()`] before + /// completion will leave [`Self`] without a background task. The next call + /// to [`Self::get()`] will incur a catalog query (see concurrency above). + pub(crate) async fn get(&self) -> Option { + { + let state = self.value.lock(); + + // If there is a resolved value, return it. + if let State::Resolved(v) = &*state { + return v.clone(); + } + } + + // Otherwise resolve the value immediately, aborting the background + // task. + self.handle.abort(); + let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await; + + { + let mut state = self.value.lock(); + *state = State::Resolved(sort_key.clone()); + } + + sort_key + } +} + +impl Drop for DeferredSortKey { + fn drop(&mut self) { + // Attempt to abort the background task, regardless of it having + // completed or not. 
+ self.handle.abort() + } +} + +/// Fetch the [`SortKey`] from the [`Catalog`] for `partition_id`, retrying +/// endlessly when errors occur. +async fn fetch( + partition_id: PartitionId, + catalog: &dyn Catalog, + backoff_config: &BackoffConfig, +) -> Option { + Backoff::new(backoff_config) + .retry_all_errors("fetch partition sort key", || async { + let s = catalog + .repositories() + .await + .partitions() + .get_by_id(partition_id) + .await? + .expect("resolving sort key for non-existent partition") + .sort_key(); + + Result::<_, iox_catalog::interface::Error>::Ok(s) + }) + .await + .expect("retry forever") +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::ShardIndex; + use test_helpers::timeout::FutureTimeout; + + use crate::test_util::populate_catalog; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + + // A test that (most likely) exercises the "read on demand" code path. + // + // The background task is configured to run some time between now, and + // 10,000,000 seconds in the future - it most likely doesn't get to complete + // before the get() call is issued. + // + // If this test flakes, it is POSSIBLE but UNLIKELY that the background task + // has completed and the get() call reads a pre-fetched value. 
+ #[tokio::test] + async fn test_read_demand() { + const LONG_LONG_TIME: Duration = Duration::from_secs(10_000_000); + + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetched = DeferredSortKey::new( + partition_id, + Duration::from_secs(36_000_000), + Arc::clone(&catalog), + backoff_config.clone(), + ) + .get() + .await; + assert!(fetched.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetched = DeferredSortKey::new( + partition_id, + LONG_LONG_TIME, + Arc::clone(&catalog), + backoff_config, + ) + .get() + .await; + + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } + + // A test that deterministically exercises the "background pre-fetch" code path. 
+ #[tokio::test] + async fn test_read_pre_fetched() { + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. + async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + assert!(fetcher.get().await.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. 
+ async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + let fetched = fetcher.get().await; + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } +} diff --git a/ingester/src/data/partition/resolver/trait.rs b/ingester/src/data/partition/resolver/trait.rs index c18ccdf1a2..4ca50ec949 100644 --- a/ingester/src/data/partition/resolver/trait.rs +++ b/ingester/src/data/partition/resolver/trait.rs @@ -3,7 +3,7 @@ use std::{fmt::Debug, sync::Arc}; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; /// An infallible resolver of [`PartitionData`] for the specified shard, table, /// and partition key, returning an initialised [`PartitionData`] buffer for it. @@ -20,7 +20,7 @@ pub trait PartitionProvider: Send + Sync + Debug { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData; } @@ -35,7 +35,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { (**self) .get_partition(partition_key, shard_id, namespace_id, table_id, table_name) @@ -49,7 +49,7 @@ mod tests { use data_types::PartitionId; - use crate::data::partition::resolver::MockPartitionProvider; + use crate::data::partition::{resolver::MockPartitionProvider, SortKeyState}; use super::*; @@ -59,7 +59,7 @@ mod tests { let shard_id = ShardId::new(42); let namespace_id = NamespaceId::new(1234); let table_id = TableId::new(24); - let table_name = "platanos".into(); + let table_name = TableName::from("platanos"); let partition = PartitionId::new(4242); let data = PartitionData::new( partition, @@ -67,22 +67,17 @@ mod tests { shard_id, namespace_id, table_id, - 
Arc::clone(&table_name), + table_name.clone(), + SortKeyState::Provided(None), None, ); let mock = Arc::new(MockPartitionProvider::default().with_partition(data)); let got = mock - .get_partition( - key, - shard_id, - namespace_id, - table_id, - Arc::clone(&table_name), - ) + .get_partition(key, shard_id, namespace_id, table_id, table_name.clone()) .await; - assert_eq!(got.id(), partition); + assert_eq!(got.partition_id(), partition); assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); } diff --git a/ingester/src/data/query_dedup.rs b/ingester/src/data/query_dedup.rs deleted file mode 100644 index 199e3ae14e..0000000000 --- a/ingester/src/data/query_dedup.rs +++ /dev/null @@ -1,159 +0,0 @@ -use std::sync::Arc; - -use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; -use iox_query::{ - exec::{Executor, ExecutorType}, - QueryChunk, QueryChunkMeta, ScanPlanBuilder, -}; -use observability_deps::tracing::debug; -use snafu::{ResultExt, Snafu}; - -use crate::query::QueryableBatch; - -#[derive(Debug, Snafu)] -#[allow(missing_copy_implementations, missing_docs)] -pub enum Error { - #[snafu(display("Error creating plan for querying Ingester data to send to Querier"))] - Frontend { - source: iox_query::frontend::common::Error, - }, - - #[snafu(display("Error building logical plan for querying Ingester data to send to Querier"))] - LogicalPlan { source: DataFusionError }, - - #[snafu(display( - "Error building physical plan for querying Ingester data to send to Querier: {}", - source - ))] - PhysicalPlan { source: DataFusionError }, - - #[snafu(display( - "Error executing the query for getting Ingester data to send to Querier: {}", - source - ))] - ExecutePlan { source: DataFusionError }, -} - -/// A specialized `Error` for Ingester's Query errors -pub type Result = std::result::Result; - -/// Query a given Queryable Batch, applying selection and filters as appropriate -/// Return stream of record batches 
-pub(crate) async fn query( - executor: &Executor, - data: Arc, -) -> Result { - // Build logical plan for filtering data - // Note that this query will also apply the delete predicates that go with the QueryableBatch - - // TODO: Since we have different type of servers (router, - // ingester, compactor, and querier), we may want to add more - // types into the ExecutorType to have better log and resource - // managment - let ctx = executor.new_context(ExecutorType::Query); - - // Creates an execution plan for a scan and filter data of a single chunk - let schema = data.schema(); - let table_name = data.table_name().to_string(); - - debug!(%table_name, "Creating single chunk scan plan"); - - let logical_plan = ScanPlanBuilder::new(schema, ctx.child_ctx("scan_and_filter planning")) - .with_chunks([data as _]) - .build() - .context(FrontendSnafu)? - .plan_builder - .build() - .context(LogicalPlanSnafu)?; - - debug!(%table_name, plan=%logical_plan.display_indent_schema(), - "created single chunk scan plan"); - - // Build physical plan - let physical_plan = ctx - .create_physical_plan(&logical_plan) - .await - .context(PhysicalPlanSnafu {})?; - - // Execute the plan and return the filtered stream - let output_stream = ctx - .execute_stream(physical_plan) - .await - .context(ExecutePlanSnafu {})?; - - Ok(output_stream) -} - -#[cfg(test)] -mod tests { - use arrow_util::assert_batches_eq; - - use super::*; - use crate::test_util::{ - create_one_record_batch_with_influxtype_no_duplicates, create_tombstone, - make_queryable_batch, make_queryable_batch_with_deletes, - }; - - #[tokio::test] - async fn test_query() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - - // build queryable batch from the input batches - let batch = make_queryable_batch("test_table", 0, 1, batches); - - // query without filters - let exc = Executor::new(1); - let stream = query(&exc, 
batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: all rows and columns should be returned - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 70 | UT | 1970-01-01T00:00:00.000020Z |", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } - - #[tokio::test] - async fn test_query_with_delete() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let batch = make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - let exc = Executor::new(1); - let stream = query(&exc, batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } -} diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 76fa44ab8b..b01504085f 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -1,22 +1,49 @@ //! Shard level data buffer structures. 
-use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{ShardId, ShardIndex}; +use data_types::{NamespaceId, ShardId, ShardIndex}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; use write_summary::ShardProgress; -use super::{namespace::NamespaceData, partition::resolver::PartitionProvider}; +use super::{ + namespace::{NamespaceData, NamespaceName}, + partition::resolver::PartitionProvider, +}; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`NamespaceData`] can be looked up by name, or +/// ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_name: HashMap>, + by_id: HashMap>, +} + +impl DoubleRef { + fn insert(&mut self, name: NamespaceName, ns: NamespaceData) -> Arc { + let id = ns.namespace_id(); + + let ns = Arc::new(ns); + self.by_name.insert(name, Arc::clone(&ns)); + self.by_id.insert(id, Arc::clone(&ns)); + ns + } + + fn by_name(&self, name: &NamespaceName) -> Option> { + self.by_name.get(name).map(Arc::clone) + } + + fn by_id(&self, id: NamespaceId) -> Option> { + self.by_id.get(&id).map(Arc::clone) + } +} + /// Data of a Shard #[derive(Debug)] pub(crate) struct ShardData { @@ -32,7 +59,7 @@ pub(crate) struct ShardData { partition_provider: Arc, // New namespaces can come in at any time so we need to be able to add new ones - namespaces: RwLock>>, + namespaces: RwLock, metrics: Arc, namespace_count: U64Counter, @@ -72,9 +99,8 @@ impl ShardData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { - let namespace_data = match self.namespace(dml_operation.namespace()) { + let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) { Some(d) => d, None => { 
self.insert_namespace(dml_operation.namespace(), &**catalog) @@ -83,14 +109,24 @@ impl ShardData { }; namespace_data - .buffer_operation(dml_operation, catalog, lifecycle_handle, executor) + .buffer_operation(dml_operation, catalog, lifecycle_handle) .await } /// Gets the namespace data out of the map - pub(crate) fn namespace(&self, namespace: &str) -> Option> { + pub(crate) fn namespace(&self, namespace: &NamespaceName) -> Option> { let n = self.namespaces.read(); - n.get(namespace).cloned() + n.by_name(namespace) + } + + /// Gets the namespace data out of the map + pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option> { + // TODO: this should be the default once IDs are pushed over the wire. + // + // At which point the map should be indexed by IDs, instead of namespace + // names. + let n = self.namespaces.read(); + n.by_id(namespace_id) } /// Retrieves the namespace from the catalog and initializes an empty buffer, or @@ -101,6 +137,8 @@ impl ShardData { catalog: &dyn Catalog, ) -> Result, super::Error> { let mut repos = catalog.repositories().await; + + let ns_name = NamespaceName::from(namespace); let namespace = repos .namespaces() .get_by_name(namespace) @@ -110,26 +148,35 @@ impl ShardData { let mut n = self.namespaces.write(); - let data = match n.entry(namespace.name) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(NamespaceData::new( - namespace.id, - self.shard_id, - Arc::clone(&self.partition_provider), - &*self.metrics, - ))); + Ok(match n.by_name(&ns_name) { + Some(v) => v, + None => { self.namespace_count.inc(1); - Arc::clone(v) - } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - Ok(data) + // Insert the table and then return a ref to it. 
+ n.insert( + ns_name.clone(), + NamespaceData::new( + namespace.id, + ns_name, + self.shard_id, + Arc::clone(&self.partition_provider), + &*self.metrics, + ), + ) + } + }) } /// Return the progress of this shard pub(super) async fn progress(&self) -> ShardProgress { - let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect(); + let namespaces: Vec<_> = self + .namespaces + .read() + .by_id + .values() + .map(Arc::clone) + .collect(); let mut progress = ShardProgress::new(); @@ -144,3 +191,89 @@ impl ShardData { self.shard_index } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, PartitionKey, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_shard_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let shard = ShardData::new( + SHARD_INDEX, + shard_id, + partition_provider, + Arc::clone(&metrics), + ); + + // Assert the namespace does not contain the test data + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_none()); + assert!(shard.namespace_by_id(ns_id).is_none()); + + // Write some test data + shard + .buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_some()); + assert!(shard.namespace_by_id(ns_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_namespaces_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 89127d04bf..8ebaa7a192 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -1,41 +1,94 @@ //! Table level data buffer structures. 
-use std::{collections::BTreeMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{ - DeletePredicate, NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId, Timestamp, -}; -use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; -use snafu::ResultExt; +use observability_deps::tracing::*; use write_summary::ShardProgress; -use super::partition::{ - resolver::PartitionProvider, PartitionData, PartitionStatus, UnpersistedPartitionData, -}; -use crate::lifecycle::LifecycleHandle; +use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData}; +use crate::{lifecycle::LifecycleHandle, querier_handler::PartitionStatus}; + +/// A double-referenced map where [`PartitionData`] can be looked up by +/// [`PartitionKey`], or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_key: HashMap, + by_id: HashMap, +} + +impl DoubleRef { + fn insert(&mut self, ns: PartitionData) { + let id = ns.partition_id(); + let key = ns.partition_key().clone(); + + assert!(self.by_key.insert(key.clone(), ns).is_none()); + assert!(self.by_id.insert(id, key).is_none()); + } + + #[cfg(test)] + fn by_key(&self, key: &PartitionKey) -> Option<&PartitionData> { + self.by_key.get(key) + } + + fn by_key_mut(&mut self, key: &PartitionKey) -> Option<&mut PartitionData> { + self.by_key.get_mut(key) + } + + fn by_id_mut(&mut self, id: PartitionId) -> Option<&mut PartitionData> { + let key = self.by_id.get(&id)?.clone(); + self.by_key_mut(&key) + } +} + +/// The string name / identifier of a Table. +/// +/// A reference-counted, cheap clone-able string. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TableName(Arc); + +impl From for TableName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::fmt::Display for TableName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl std::ops::Deref for TableName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} /// Data of a Table in a given Namesapce that belongs to a given Shard #[derive(Debug)] pub(crate) struct TableData { table_id: TableId, - table_name: Arc, + table_name: TableName, /// The catalog ID of the shard & namespace this table is being populated /// from. shard_id: ShardId, namespace_id: NamespaceId, - // the max sequence number for a tombstone associated with this table - tombstone_max_sequence_number: Option, - /// An abstract constructor of [`PartitionData`] instances for a given /// `(key, shard, table)` triplet. partition_provider: Arc, - // Map pf partition key to its data - pub(super) partition_data: BTreeMap, + // Map of partition key to its data + partition_data: DoubleRef, } impl TableData { @@ -51,18 +104,16 @@ impl TableData { /// for the first time. 
pub(super) fn new( table_id: TableId, - table_name: &str, + table_name: TableName, shard_id: ShardId, namespace_id: NamespaceId, - tombstone_max_sequence_number: Option, partition_provider: Arc, ) -> Self { Self { table_id, - table_name: table_name.into(), + table_name, shard_id, namespace_id, - tombstone_max_sequence_number, partition_data: Default::default(), partition_provider, } @@ -71,18 +122,13 @@ impl TableData { /// Return parquet_max_sequence_number pub(super) fn parquet_max_sequence_number(&self) -> Option { self.partition_data + .by_key .values() .map(|p| p.max_persisted_sequence_number()) .max() .flatten() } - /// Return tombstone_max_sequence_number - #[allow(dead_code)] // Used in tests - pub(super) fn tombstone_max_sequence_number(&self) -> Option { - self.tombstone_max_sequence_number - } - // buffers the table write and returns true if the lifecycle manager indicates that // ingest should be paused. pub(super) async fn buffer_table_write( @@ -92,7 +138,7 @@ impl TableData { partition_key: PartitionKey, lifecycle_handle: &dyn LifecycleHandle, ) -> Result { - let partition_data = match self.partition_data.get_mut(&partition_key) { + let partition_data = match self.partition_data.by_key.get_mut(&partition_key) { Some(p) => p, None => { let p = self @@ -102,86 +148,87 @@ impl TableData { self.shard_id, self.namespace_id, self.table_id, - Arc::clone(&self.table_name), + self.table_name.clone(), ) .await; - // Add the partition to the map. - assert!(self - .partition_data - .insert(partition_key.clone(), p) - .is_none()); - self.partition_data.get_mut(&partition_key).unwrap() + // Add the double-referenced partition to the map. 
+ self.partition_data.insert(p); + self.partition_data.by_key_mut(&partition_key).unwrap() } }; // skip the write if it has already been persisted if let Some(max) = partition_data.max_persisted_sequence_number() { if max >= sequence_number { + trace!( + shard_id=%self.shard_id, + op_sequence_number=?sequence_number, + "skipping already-persisted write" + ); return Ok(false); } } + let size = batch.size(); + let rows = batch.rows(); + partition_data.buffer_write(sequence_number, batch)?; + + // Record the write as having been buffered. + // + // This should happen AFTER the write is applied, because buffering the + // op may fail which would lead to a write being recorded, but not + // applied. let should_pause = lifecycle_handle.log_write( - partition_data.id(), + partition_data.partition_id(), self.shard_id, self.namespace_id, self.table_id, sequence_number, - batch.size(), - batch.rows(), + size, + rows, ); - partition_data.buffer_write(sequence_number, batch)?; Ok(should_pause) } - pub(super) async fn buffer_delete( + /// Return the [`PartitionData`] for the specified ID. + #[allow(unused)] + pub(crate) fn get_partition( &mut self, - predicate: &DeletePredicate, - sequence_number: SequenceNumber, - catalog: &dyn Catalog, - executor: &Executor, - ) -> Result<(), super::Error> { - let min_time = Timestamp::new(predicate.range.start()); - let max_time = Timestamp::new(predicate.range.end()); + partition_id: PartitionId, + ) -> Option<&mut PartitionData> { + self.partition_data.by_id_mut(partition_id) + } - let mut repos = catalog.repositories().await; - let tombstone = repos - .tombstones() - .create_or_get( - self.table_id, - self.shard_id, - sequence_number, - min_time, - max_time, - &predicate.expr_sql_string(), - ) - .await - .context(super::CatalogSnafu)?; + /// Return the [`PartitionData`] for the specified partition key. 
+ #[cfg(test)] + pub(crate) fn get_partition_by_key( + &self, + partition_key: &PartitionKey, + ) -> Option<&PartitionData> { + self.partition_data.by_key(partition_key) + } - // remember "persisted" state - self.tombstone_max_sequence_number = Some(sequence_number); - - // modify one partition at a time - for data in self.partition_data.values_mut() { - data.buffer_tombstone(executor, tombstone.clone()).await; - } - - Ok(()) + /// Return the [`PartitionData`] for the specified partition key. + pub(crate) fn get_partition_by_key_mut( + &mut self, + partition_key: &PartitionKey, + ) -> Option<&mut PartitionData> { + self.partition_data.by_key_mut(partition_key) } pub(crate) fn unpersisted_partition_data(&self) -> Vec { self.partition_data + .by_key .values() .map(|p| UnpersistedPartitionData { - partition_id: p.id(), + partition_id: p.partition_id(), non_persisted: p .get_non_persisting_data() .expect("get_non_persisting should always work"), persisting: p.get_persisting_data(), partition_status: PartitionStatus { parquet_max_sequence_number: p.max_persisted_sequence_number(), - tombstone_max_sequence_number: self.tombstone_max_sequence_number, }, }) .collect() @@ -196,14 +243,223 @@ impl TableData { }; self.partition_data + .by_key .values() .fold(progress, |progress, partition_data| { progress.combine(partition_data.progress()) }) } - #[cfg(test)] + /// Returns the table ID for this partition. pub(super) fn table_id(&self) -> TableId { self.table_id } + + /// Returns the name of this table. 
+ pub(crate) fn table_name(&self) -> &TableName { + &self.table_name + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use assert_matches::assert_matches; + use data_types::{PartitionId, ShardIndex}; + use iox_catalog::interface::Catalog; + use mutable_batch::writer; + use mutable_batch_lp::lines_to_batches; + use schema::{InfluxColumnType, InfluxFieldType}; + + use crate::{ + data::{ + partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + Error, + }, + lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle}, + test_util::populate_catalog, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + const PARTITION_ID: PartitionId = PartitionId::new(0); + + #[tokio::test] + async fn test_partition_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME.into(), + shard_id, + ns_id, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_some()); + } + + #[tokio::test] + async fn test_bad_write_memory_counting() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME.into(), + shard_id, + ns_id, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Initialise the mock lifecycle handle and use it to inspect the calls + // made to the lifecycle manager during buffering. + let handle = MockLifecycleHandle::default(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); + + // And the lifecycle handle was called with the expected values + assert_eq!( + handle.get_log_calls(), + &[MockLifecycleCall { + partition_id: PARTITION_ID, + shard_id, + namespace_id: ns_id, + table_id, + sequence_number: SequenceNumber::new(42), + bytes_written: 1131, + rows_written: 1, + }] + ); + + // Attempt to buffer the second op that contains a type conflict - this + // should return an error, and not make a call to the lifecycle handle + // (as no data was buffered) + // + // Note the type of value was numeric previously, and here it is a string. 
+ let batch = lines_to_batches(r#"bananas,bat=man value="platanos" 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + let err = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect_err("type conflict should error"); + + // The buffer op should return a column type error + assert_matches!( + err, + Error::BufferWrite { + source: mutable_batch::Error::WriterError { + source: writer::Error::TypeMismatch { + existing: InfluxColumnType::Field(InfluxFieldType::Float), + inserted: InfluxColumnType::Field(InfluxFieldType::String), + column: col_name, + } + }, + } => { assert_eq!(col_name, "value") } + ); + + // And the lifecycle handle should not be called. + // + // It still contains the first call, so the desired length is 1 + // indicating no second call was made. + assert_eq!(handle.get_log_calls().len(), 1); + } } diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index dde159dc52..981a43cd57 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -30,17 +30,24 @@ use crate::{ data::{ partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider}, shard::ShardData, - IngesterData, IngesterQueryResponse, + IngesterData, }, lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager}, poison::PoisonCabinet, - querier_handler::prepare_data_to_querier, + querier_handler::{prepare_data_to_querier, IngesterQueryResponse}, stream_handler::{ handler::SequencedStreamHandler, sink_adaptor::IngestSinkAdaptor, sink_instrumentation::SinkInstrumentation, PeriodicWatermarkFetcher, }, }; +/// The maximum duration of time between creating a [`PartitionData`] and its +/// [`SortKey`] being fetched from the catalog. 
+/// +/// [`PartitionData`]: crate::data::partition::PartitionData +/// [`SortKey`]: schema::sort::SortKey +const SORT_KEY_PRE_FETCH: Duration = Duration::from_secs(30); + #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] pub enum Error { @@ -160,7 +167,13 @@ impl IngestHandlerImpl { // Build the partition provider. let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog)); - let partition_provider = PartitionCache::new(partition_provider, recent_partitions); + let partition_provider = PartitionCache::new( + partition_provider, + recent_partitions, + SORT_KEY_PRE_FETCH, + Arc::clone(&catalog), + BackoffConfig::default(), + ); let partition_provider: Arc = Arc::new(partition_provider); // build the initial ingester data state @@ -432,7 +445,7 @@ mod tests { use write_buffer::mock::{MockBufferForReading, MockBufferSharedState}; use super::*; - use crate::data::partition::SnapshotBatch; + use crate::data::{partition::SnapshotBatch, table::TableName}; #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { @@ -499,13 +512,16 @@ mod tests { // give the writes some time to go through the buffer. Exit once we've verified there's // data in there from both writes. tokio::time::timeout(Duration::from_secs(2), async { + let ns_name = ingester.namespace.name.into(); + let table_name = TableName::from("a"); loop { let mut has_measurement = false; if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) { - if let Some(data) = data.namespace(&ingester.namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + { if let Some(b) = b.first() { if b.data.num_rows() > 0 { has_measurement = true; @@ -740,13 +756,16 @@ mod tests { // give the writes some time to go through the buffer. 
Exit once we've verified there's // data in there tokio::time::timeout(Duration::from_secs(1), async move { + let ns_name = namespace.name.into(); + let table_name = TableName::from("cpu"); loop { let mut has_measurement = false; if let Some(data) = ingester.data.shard(shard.id) { - if let Some(data) = data.namespace(&namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + { if let Some(b) = b.first() { custom_batch_verification(b); diff --git a/ingester/src/lifecycle.rs b/ingester/src/lifecycle.rs index b46b84dde7..d15389ed60 100644 --- a/ingester/src/lifecycle.rs +++ b/ingester/src/lifecycle.rs @@ -12,7 +12,7 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration}; use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; use iox_time::{Time, TimeProvider}; use metric::{Metric, U64Counter}; -use observability_deps::tracing::{error, info, warn}; +use observability_deps::tracing::{error, info, trace, warn}; use parking_lot::Mutex; use tokio_util::sync::CancellationToken; use tracker::TrackedFutureExt; @@ -97,6 +97,18 @@ impl LifecycleHandle for LifecycleHandleImpl { stats.last_write = now; stats.rows_written += rows_written; + trace!( + shard_id=%stats.shard_id, + partition_id=%stats.partition_id, + namespace_id=%stats.namespace_id, + table_id=%stats.table_id, + first_write=%stats.first_write, + last_write=%stats.last_write, + bytes_written=%stats.bytes_written, + first_sequence_number=?stats.first_sequence_number, + "logged write" + ); + s.total_bytes += bytes_written; // Pause if the server has exceeded the configured memory limit. 
@@ -234,7 +246,7 @@ struct LifecycleStats { } /// The stats for a partition -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] struct PartitionLifecycleStats { /// The shard this partition is under shard_id: ShardId, @@ -469,6 +481,18 @@ impl LifecycleManager { let persist_tasks: Vec<_> = to_persist .into_iter() .map(|s| { + // BUG: TOCTOU: memory usage released may be incorrect. + // + // Here the amount of memory to be reduced is acquired, but this + // code does not prevent continued writes adding more data to + // the partition in another thread. + // + // This may lead to more actual data being persisted than the + // call below returns to the server pool - this would slowly + // starve the ingester of memory it thinks it has. + // + // See https://github.com/influxdata/influxdb_iox/issues/5777 + // Mark this partition as being persisted, and remember the // memory allocation it had accumulated. let partition_memory_usage = self @@ -483,7 +507,9 @@ impl LifecycleManager { let state = Arc::clone(&self.state); tokio::task::spawn(async move { - persister.persist(s.partition_id).await; + persister + .persist(s.shard_id, s.namespace_id, s.table_id, s.partition_id) + .await; // Now the data has been uploaded and the memory it was // using has been freed, released the memory capacity back // the ingester. 
@@ -524,6 +550,12 @@ impl LifecycleManager { .map(|s| s.first_sequence_number) .min() .unwrap_or(sequence_number); + trace!( + min_unpersisted_sequence_number=?min, + shard_id=%shard_id, + sequence_number=?sequence_number, + "updated min_unpersisted_sequence_number for persisted shard" + ); persister .update_min_unpersisted_sequence_number(shard_id, min) .await; @@ -602,7 +634,13 @@ mod tests { #[async_trait] impl Persister for TestPersister { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + _shard_id: ShardId, + _namespace_id: NamespaceId, + _table_id: TableId, + partition_id: PartitionId, + ) { let mut p = self.persist_called.lock(); p.insert(partition_id); } @@ -662,8 +700,16 @@ mod tests { #[async_trait] impl Persister for PausablePersister { - async fn persist(&self, partition_id: PartitionId) { - self.inner.persist(partition_id).await; + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + self.inner + .persist(shard_id, namespace_id, table_id, partition_id) + .await; if let Some(event) = self.event(partition_id) { event.before.wait().await; event.after.wait().await; diff --git a/ingester/src/lifecycle/mock_handle.rs b/ingester/src/lifecycle/mock_handle.rs index d5b889c4af..bec4af5ce0 100644 --- a/ingester/src/lifecycle/mock_handle.rs +++ b/ingester/src/lifecycle/mock_handle.rs @@ -1,26 +1,66 @@ //! A mock [`LifecycleHandle`] impl for testing. +use std::sync::Arc; + use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; +use parking_lot::Mutex; use super::LifecycleHandle; -/// Special [`LifecycleHandle`] that never persists and always accepts more data. -/// -/// This is useful to control persists manually. -#[derive(Debug, Default, Clone, Copy)] -pub struct NoopLifecycleHandle; +/// A set of arguments captured from a call to +/// [`MockLifecycleHandle::log_write()`]. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(missing_docs)] +pub struct MockLifecycleCall { + pub partition_id: PartitionId, + pub shard_id: ShardId, + pub namespace_id: NamespaceId, + pub table_id: TableId, + pub sequence_number: SequenceNumber, + pub bytes_written: usize, + pub rows_written: usize, +} -impl LifecycleHandle for NoopLifecycleHandle { +/// A mock [`LifecycleHandle`] implementation that records calls made to +/// [`Self::log_write()`] and never blocks ingest, always accepting more data. +/// +/// # Cloning +/// +/// Cloning a [`MockLifecycleHandle`] will clone the inner state - calls to all +/// cloned instances are reported in a call to [`Self::get_log_calls()`]. +#[derive(Debug, Default, Clone)] +pub struct MockLifecycleHandle { + log_calls: Arc>>, +} + +impl MockLifecycleHandle { + /// Returns the ordered [`Self::log_write()`] calls made to this mock. + pub fn get_log_calls(&self) -> Vec { + self.log_calls.lock().clone() + } +} + +impl LifecycleHandle for MockLifecycleHandle { fn log_write( &self, - _partition_id: PartitionId, - _shard_id: ShardId, - _namespace_id: NamespaceId, - _table_id: TableId, - _sequence_number: SequenceNumber, - _bytes_written: usize, - _rows_written: usize, + partition_id: PartitionId, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + sequence_number: SequenceNumber, + bytes_written: usize, + rows_written: usize, ) -> bool { + self.log_calls.lock().push(MockLifecycleCall { + partition_id, + shard_id, + namespace_id, + table_id, + sequence_number, + bytes_written, + rows_written, + }); + // do NOT pause ingest false } diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index d3c8e37e19..59996f94cf 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -1,10 +1,13 @@ //! 
Handle all requests from Querier -use std::sync::Arc; +use std::{pin::Pin, sync::Arc}; +use arrow::{error::ArrowError, record_batch::RecordBatch}; +use arrow_util::optimize::{optimize_record_batch, optimize_schema}; +use data_types::{PartitionId, SequenceNumber}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; -use futures::StreamExt; +use futures::{Stream, StreamExt}; use generated_types::ingester::IngesterQueryRequest; use observability_deps::tracing::debug; use schema::selection::Selection; @@ -12,8 +15,8 @@ use snafu::{ensure, Snafu}; use crate::{ data::{ - partition::UnpersistedPartitionData, IngesterData, IngesterQueryPartition, - IngesterQueryResponse, + namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName, + IngesterData, }, query::QueryableBatch, }; @@ -47,6 +50,159 @@ pub enum Error { /// A specialized `Error` for Ingester's Query errors pub type Result = std::result::Result; +/// Stream of snapshots. +/// +/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. +pub(crate) type SnapshotStream = + Pin> + Send>>; + +/// Status of a partition that has unpersisted data. +/// +/// Note that this structure is specific to a partition (which itself is bound to a table and +/// shard)! +#[derive(Debug, Clone, PartialEq, Eq)] +#[allow(missing_copy_implementations)] +pub struct PartitionStatus { + /// Max sequence number persisted + pub parquet_max_sequence_number: Option, +} + +/// Response data for a single partition. +pub(crate) struct IngesterQueryPartition { + /// Stream of snapshots. + snapshots: SnapshotStream, + + /// Partition ID. + id: PartitionId, + + /// Partition persistence status. 
+ status: PartitionStatus, +} + +impl std::fmt::Debug for IngesterQueryPartition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryPartition") + .field("snapshots", &"") + .field("id", &self.id) + .field("status", &self.status) + .finish() + } +} + +impl IngesterQueryPartition { + pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { + Self { + snapshots, + id, + status, + } + } +} + +/// Stream of partitions in this response. +pub(crate) type IngesterQueryPartitionStream = + Pin> + Send>>; + +/// Response streams for querier<>ingester requests. +/// +/// The data structure is constructed to allow lazy/streaming data generation. For easier +/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. +pub struct IngesterQueryResponse { + /// Stream of partitions. + partitions: IngesterQueryPartitionStream, +} + +impl std::fmt::Debug for IngesterQueryResponse { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryResponse") + .field("partitions", &"") + .finish() + } +} + +impl IngesterQueryResponse { + /// Make a response + pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { + Self { partitions } + } + + /// Flattens the data according to the wire protocol. 
+ pub fn flatten(self) -> FlatIngesterQueryResponseStream { + self.partitions + .flat_map(|partition_res| match partition_res { + Ok(partition) => { + let head = futures::stream::once(async move { + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: partition.id, + status: partition.status, + }) + }); + let tail = partition + .snapshots + .flat_map(|snapshot_res| match snapshot_res { + Ok(snapshot) => { + let schema = Arc::new(optimize_schema(&snapshot.schema())); + + let schema_captured = Arc::clone(&schema); + let head = futures::stream::once(async { + Ok(FlatIngesterQueryResponse::StartSnapshot { + schema: schema_captured, + }) + }); + + let tail = snapshot.map(move |batch_res| match batch_res { + Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { + batch: optimize_record_batch(&batch, Arc::clone(&schema))?, + }), + Err(e) => Err(e), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }) + .boxed() + } +} + +/// Flattened version of [`IngesterQueryResponse`]. +pub(crate) type FlatIngesterQueryResponseStream = + Pin> + Send>>; + +/// Element within the flat wire protocol. +#[derive(Debug, PartialEq)] +pub enum FlatIngesterQueryResponse { + /// Start a new partition. + StartPartition { + /// Partition ID. + partition_id: PartitionId, + + /// Partition persistence status. + status: PartitionStatus, + }, + + /// Start a new snapshot. + /// + /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) + /// message. + StartSnapshot { + /// Snapshot schema. + schema: Arc, + }, + + /// Add a record batch to the snapshot that was announced by the last + /// [`StartSnapshot`](Self::StartSnapshot) message. + RecordBatch { + /// Record batch. 
+ batch: RecordBatch, + }, +} + /// Return data to send as a response back to the Querier per its request pub async fn prepare_data_to_querier( ingest_data: &Arc, @@ -57,7 +213,8 @@ pub async fn prepare_data_to_querier( let mut found_namespace = false; for (shard_id, shard_data) in ingest_data.shards() { debug!(shard_id=%shard_id.get()); - let namespace_data = match shard_data.namespace(&request.namespace) { + let namespace_name = NamespaceName::from(&request.namespace); + let namespace_data = match shard_data.namespace(&namespace_name) { Some(namespace_data) => { debug!(namespace=%request.namespace, "found namespace"); found_namespace = true; @@ -68,7 +225,8 @@ pub async fn prepare_data_to_querier( } }; - let table_data = match namespace_data.table_data(&request.table) { + let table_name = TableName::from(&request.table); + let table_data = match namespace_data.table_data(&table_name) { Some(table_data) => { debug!(table_name=%request.table, "found table"); table_data @@ -153,7 +311,6 @@ fn prepare_data_to_querier_for_partition( request.table.clone().into(), unpersisted_partition_data.partition_id, vec![], - vec![], ) }) .with_data(unpersisted_partition_data.non_persisted); @@ -188,22 +345,106 @@ fn prepare_data_to_querier_for_partition( #[cfg(test)] mod tests { - use arrow::{array::new_null_array, record_batch::RecordBatch}; + use std::task::{Context, Poll}; + + use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch}; use arrow_util::assert_batches_sorted_eq; use assert_matches::assert_matches; - use datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + physical_plan::RecordBatchStream, + }; use futures::TryStreamExt; + use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use predicate::Predicate; use schema::merge::SchemaMerger; use super::*; - use crate::{ - data::FlatIngesterQueryResponse, - test_util::{ - make_ingester_data, make_ingester_data_with_tombstones, DataLocation, TEST_NAMESPACE, - 
TEST_TABLE, - }, - }; + use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE}; + + #[tokio::test] + async fn test_ingester_query_response_flatten() { + let batch_1_1 = lp_to_batch("table x=1 0"); + let batch_1_2 = lp_to_batch("table x=2 1"); + let batch_2 = lp_to_batch("table y=1 10"); + let batch_3 = lp_to_batch("table z=1 10"); + + let schema_1 = batch_1_1.schema(); + let schema_2 = batch_2.schema(); + let schema_3 = batch_3.schema(); + + let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([ + Ok(Box::pin(TestRecordBatchStream::new( + vec![ + Ok(batch_1_1.clone()), + Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), + Ok(batch_1_2.clone()), + ], + Arc::clone(&schema_1), + )) as _), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + Ok(Box::pin(TestRecordBatchStream::new( + vec![Ok(batch_2.clone())], + Arc::clone(&schema_2), + )) as _), + Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), + ])), + PartitionId::new(2), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + Err(ArrowError::IoError("some io error".into())), + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([])), + PartitionId::new(1), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + ]))); + + let actual: Vec<_> = response.flatten().collect().await; + let expected = vec![ + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(2), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), + Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + 
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), + Err(ArrowError::IoError("some io error".into())), + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(1), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + ]; + + assert_eq!(actual.len(), expected.len()); + for (actual, expected) in actual.into_iter().zip(expected) { + match (actual, expected) { + (Ok(actual), Ok(expected)) => { + assert_eq!(actual, expected); + } + (Err(_), Err(_)) => { + // cannot compare `ArrowError`, but it's unlikely that someone changed the error + } + (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"), + (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), + } + } + } #[tokio::test] async fn test_prepare_data_to_querier() { @@ -360,180 +601,44 @@ mod tests { } } - #[tokio::test] - async fn test_prepare_data_to_querier_with_tombstones() { - test_helpers::maybe_start_logging(); + pub struct TestRecordBatchStream { + schema: SchemaRef, + batches: Vec>, + } - // make 7 scenarios for ingester data with tombstones - let mut scenarios = vec![]; - for loc in &[ - DataLocation::BUFFER, - DataLocation::BUFFER_SNAPSHOT, - DataLocation::BUFFER_PERSISTING, - DataLocation::BUFFER_SNAPSHOT_PERSISTING, - DataLocation::SNAPSHOT, - DataLocation::SNAPSHOT_PERSISTING, - DataLocation::PERSISTING, - ] { - let scenario = Arc::new(make_ingester_data_with_tombstones(*loc).await); - scenarios.push((loc, scenario)); + impl TestRecordBatchStream { + pub fn new(batches: Vec>, schema: SchemaRef) -> Self { + Self { schema, batches } } + } - // read data from all scenarios without any filters - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec![], - None, - )); - let expected_not_persisting = vec![ - 
"+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting + impl RecordBatchStream for TestRecordBatchStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } + + impl futures::Stream for TestRecordBatchStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + _: &mut Context<'_>, + ) -> Poll> { + 
if self.batches.is_empty() { + Poll::Ready(None) } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); + Poll::Ready(Some(self.batches.remove(0))) + } } - // read data from all scenarios and filter out column day - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - None, - )); - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. 
- let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); + fn size_hint(&self) -> (usize, Option) { + (self.batches.len(), Some(self.batches.len())) } + } - // read data from all scenarios, filter out column day, city Medford, time outside range [0, 42) - let expr = col("city").not_eq(lit("Medford")); - let pred = Predicate::default().with_expr(expr).with_range(0, 42); - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - Some(pred), - )); - // predicates and de-dup are NOT applied!, otherwise this would look like this: - // let expected = vec![ - // "+------------+------+--------------------------------+", - // "| city | temp | time |", - // "+------------+------+--------------------------------+", - // "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - // "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - // "| Wilmington | | 
1970-01-01T00:00:00.000000035Z |", - // "+------------+------+--------------------------------+", - // ]; - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); - } + fn lp_to_batch(lp: &str) -> RecordBatch 
{ + lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() } /// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es. diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 747ff4666c..dc38001e4f 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -6,26 +6,26 @@ use arrow::record_batch::RecordBatch; use arrow_util::util::ensure_schema; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary, - TimestampMinMax, Tombstone, + TimestampMinMax, }; -use datafusion::physical_plan::{ - common::SizedRecordBatchStream, - metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, - SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + common::SizedRecordBatchStream, + metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, + SendableRecordBatchStream, + }, }; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::trace; -use predicate::{ - delete_predicate::{tombstones_to_delete_predicates, tombstones_to_delete_predicates_iter}, - Predicate, -}; +use predicate::Predicate; use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema}; use snafu::{ResultExt, Snafu}; -use crate::data::partition::SnapshotBatch; +use crate::data::{partition::SnapshotBatch, table::TableName}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -53,11 +53,8 @@ pub(crate) struct QueryableBatch { /// data pub(crate) data: Vec>, - /// Delete predicates of the tombstones - pub(crate) delete_predicates: Vec>, - /// This is needed to return a reference for a trait function - pub(crate) table_name: Arc, + pub(crate) table_name: TableName, /// Partition ID pub(crate) partition_id: PartitionId, @@ -66,15 +63,12 @@ pub(crate) struct QueryableBatch { impl QueryableBatch { /// Initilaize a QueryableBatch pub(crate) fn new( - 
table_name: Arc, + table_name: TableName, partition_id: PartitionId, data: Vec>, - deletes: Vec, ) -> Self { - let delete_predicates = tombstones_to_delete_predicates(&deletes); Self { data, - delete_predicates, table_name, partition_id, } @@ -86,12 +80,6 @@ impl QueryableBatch { self } - /// Add more tombstones - pub(crate) fn add_tombstones(&mut self, deletes: &[Tombstone]) { - let delete_predicates = tombstones_to_delete_predicates_iter(deletes); - self.delete_predicates.extend(delete_predicates); - } - /// return min and max of all the snapshots pub(crate) fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) { let min = self @@ -110,11 +98,6 @@ impl QueryableBatch { (min, max) } - - /// return true if it has no data - pub(crate) fn is_empty(&self) -> bool { - self.data.is_empty() - } } impl QueryChunkMeta for QueryableBatch { @@ -144,16 +127,16 @@ impl QueryChunkMeta for QueryableBatch { None // Ingester data is not sorted } - fn delete_predicates(&self) -> &[Arc] { - self.delete_predicates.as_ref() - } - fn timestamp_min_max(&self) -> Option { // Note: we need to consider which option we want to go with // . Return None here and avoid taking time to compute time's min max of RecordBacthes (current choice) // . 
Compute time's min max here and avoid compacting non-overlapped QueryableBatches in the Ingester None } + + fn delete_predicates(&self) -> &[Arc] { + &[] + } } impl QueryChunk for QueryableBatch { @@ -185,7 +168,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -199,7 +182,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -210,12 +193,16 @@ impl QueryChunk for QueryableBatch { mut ctx: IOxSessionContext, _predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { ctx.set_metadata("storage", "ingester"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); - let schema = self.schema().select(selection).context(SchemaSnafu)?; + let schema = self + .schema() + .select(selection) + .context(SchemaSnafu) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Get all record batches from their snapshots let batches = self @@ -234,7 +221,8 @@ impl QueryChunk for QueryableBatch { .map(Arc::new); Some(batch) }) - .collect::, _>>()?; + .collect::, _>>() + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Return stream of data let dummy_metrics = ExecutionPlanMetricsSet::new(); @@ -257,165 +245,3 @@ impl QueryChunk for QueryableBatch { self } } - -#[cfg(test)] -mod tests { - use arrow::{ - array::{ - ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray, - TimestampNanosecondArray, UInt64Array, - }, - datatypes::{DataType, Int32Type, TimeUnit}, - }; - use data_types::{DeleteExpr, Op, Scalar, TimestampRange}; - - use super::*; - use crate::test_util::create_tombstone; - - #[tokio::test] - async fn test_merge_batch_schema() { - // Merge schema of the batches - // The fields in the schema are 
sorted by column name - let batches = create_batches(); - let merged_schema = (*merge_record_batch_schemas(&batches)).clone(); - - // Expected Arrow schema - let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![ - arrow::datatypes::Field::new( - "dict", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - arrow::datatypes::Field::new("int64", DataType::Int64, true), - arrow::datatypes::Field::new("string", DataType::Utf8, true), - arrow::datatypes::Field::new("bool", DataType::Boolean, true), - arrow::datatypes::Field::new( - "time", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - arrow::datatypes::Field::new("uint64", DataType::UInt64, false), - arrow::datatypes::Field::new("float64", DataType::Float64, true), - ])); - let expected_schema = Schema::try_from(arrow_schema) - .unwrap() - .sort_fields_by_name(); - - assert_eq!( - expected_schema, merged_schema, - "\nExpected:\n{:#?}\nActual:\n{:#?}", - expected_schema, merged_schema - ); - } - - #[tokio::test] - async fn test_tombstones_to_delete_predicates() { - // create tombstones - let tombstones = vec![ - create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"), - create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"), - ]; - - // This new queryable batch will convert tombstone to delete predicates - let query_batch = - QueryableBatch::new("test_table".into(), PartitionId::new(0), vec![], tombstones); - let predicates = query_batch.delete_predicates(); - let expected = vec![ - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 200), - exprs: vec![DeleteExpr { - column: String::from("temp"), - op: Op::Eq, - scalar: Scalar::I64(10), - }], - }), - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 350), - exprs: vec![ - DeleteExpr { - column: String::from("temp"), - op: Op::Ne, - scalar: Scalar::I64(10), - }, - DeleteExpr { - column: String::from("city"), - op: Op::Eq, - scalar: Scalar::String(String::from(r#"Boston"#)), - 
}, - ], - }), - ]; - - assert_eq!(expected, predicates); - } - - // ---------------------------------------------------------------------------------------------- - // Data for testing - - // Create pure RecordBatches without knowledge of Influx datatype - fn create_batches() -> Vec> { - // Batch 1: & 3 rows - let dict_array: ArrayRef = Arc::new( - vec![Some("a"), None, Some("b")] - .into_iter() - .collect::>(), - ); - let int64_array: ArrayRef = - Arc::new([Some(-1), None, Some(2)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("and"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new( - [Some(true), None, Some(false)] - .iter() - .collect::(), - ); - let ts_array: ArrayRef = Arc::new( - [Some(150), Some(200), Some(1526823730000000000)] - .iter() - .collect::(), - ); - let batch1 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("int64", int64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - ]) - .unwrap(); - - // Batch 2: & 2 rows - let dict_array: ArrayRef = Arc::new( - vec![None, Some("d")] - .into_iter() - .collect::>(), - ); - let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::()); // not null - let float64_array: ArrayRef = - Arc::new([Some(1.0), Some(2.0)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::()); - let ts_array: ArrayRef = Arc::new( - [Some(100), Some(1626823730000000000)] // not null - .iter() - .collect::(), - ); - let batch2 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("uint64", uint64_array, false), // not null - ("float64", float64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - 
]) - .unwrap(); - - vec![Arc::new(batch1), Arc::new(batch2)] - } -} diff --git a/ingester/src/server/grpc.rs b/ingester/src/server/grpc.rs index 4f06a93a46..3bf785843d 100644 --- a/ingester/src/server/grpc.rs +++ b/ingester/src/server/grpc.rs @@ -30,8 +30,8 @@ use trace::ctx::SpanContext; use write_summary::WriteSummary; use crate::{ - data::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, handler::IngestHandler, + querier_handler::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, }; /// This type is responsible for managing all gRPC services exposed by @@ -410,9 +410,6 @@ impl Stream for GetStream { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }; prost::Message::encode(&app_metadata, &mut bytes) @@ -467,8 +464,9 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use schema::selection::Selection; + use crate::querier_handler::PartitionStatus; + use super::*; - use crate::data::partition::PartitionStatus; #[tokio::test] async fn test_get_stream_empty() { @@ -489,7 +487,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Ok(FlatIngesterQueryResponse::StartSnapshot { schema }), @@ -502,7 +499,6 @@ mod tests { partition_id: 1, status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), @@ -527,7 +523,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Err(ArrowError::IoError("foo".into())), @@ -535,7 +530,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), ], @@ -546,7 +540,6 @@ mod tests { partition_id: 1, 
status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), diff --git a/ingester/src/stream_handler/handler.rs b/ingester/src/stream_handler/handler.rs index 3fa563b188..9a52b10505 100644 --- a/ingester/src/stream_handler/handler.rs +++ b/ingester/src/stream_handler/handler.rs @@ -396,6 +396,12 @@ something clever.", if let Some(delta) = duration_since_production { // Update the TTBR metric before potentially sleeping. self.time_to_be_readable.set(delta); + trace!( + kafka_topic=%self.topic_name, + shard_index=%self.shard_index, + delta=%delta.as_millis(), + "reporting TTBR for shard (ms)" + ); } if should_pause { @@ -939,7 +945,7 @@ mod tests { Ok(DmlOperation::Write(make_write("good_op", 2))) ]], sink_rets = [ - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound{namespace: "bananas".to_string() }), Ok(true), ], want_ttbr = 2, diff --git a/ingester/src/stream_handler/mod.rs b/ingester/src/stream_handler/mod.rs index 296f158e1a..5e9a351fe4 100644 --- a/ingester/src/stream_handler/mod.rs +++ b/ingester/src/stream_handler/mod.rs @@ -17,7 +17,7 @@ //! [`LifecycleManager`]: crate::lifecycle::LifecycleManager //! 
[`LifecycleHandle::can_resume_ingest()`]: crate::lifecycle::LifecycleHandle::can_resume_ingest() -pub mod handler; +pub(crate) mod handler; mod periodic_watermark_fetcher; mod sink; @@ -25,8 +25,8 @@ mod sink; pub mod mock_sink; #[cfg(test)] pub mod mock_watermark_fetcher; -pub mod sink_adaptor; -pub mod sink_instrumentation; +pub(crate) mod sink_adaptor; +pub(crate) mod sink_instrumentation; -pub use periodic_watermark_fetcher::*; -pub use sink::*; +pub(crate) use periodic_watermark_fetcher::*; +pub(crate) use sink::*; diff --git a/ingester/src/stream_handler/periodic_watermark_fetcher.rs b/ingester/src/stream_handler/periodic_watermark_fetcher.rs index 43c8cf52c9..37f99663cc 100644 --- a/ingester/src/stream_handler/periodic_watermark_fetcher.rs +++ b/ingester/src/stream_handler/periodic_watermark_fetcher.rs @@ -24,7 +24,7 @@ use super::sink_instrumentation::WatermarkFetcher; /// Emits an error metric named `write_buffer_watermark_fetch_errors` that /// increments once per fetch error. #[derive(Debug)] -pub struct PeriodicWatermarkFetcher { +pub(crate) struct PeriodicWatermarkFetcher { last_watermark: Arc, poll_handle: JoinHandle<()>, } diff --git a/ingester/src/stream_handler/sink.rs b/ingester/src/stream_handler/sink.rs index 5f8220a942..825b012ce9 100644 --- a/ingester/src/stream_handler/sink.rs +++ b/ingester/src/stream_handler/sink.rs @@ -5,7 +5,7 @@ use dml::DmlOperation; /// A [`DmlSink`] handles [`DmlOperation`] instances read from a shard. #[async_trait] -pub trait DmlSink: Debug + Send + Sync { +pub(crate) trait DmlSink: Debug + Send + Sync { /// Apply `op` read from a shard, returning `Ok(true)` if ingest should /// be paused. 
async fn apply(&self, op: DmlOperation) -> Result; diff --git a/ingester/src/stream_handler/sink_instrumentation.rs b/ingester/src/stream_handler/sink_instrumentation.rs index 24b05cbf21..998e14bb48 100644 --- a/ingester/src/stream_handler/sink_instrumentation.rs +++ b/ingester/src/stream_handler/sink_instrumentation.rs @@ -414,11 +414,13 @@ mod tests { let got = test( op, &metrics, - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound { + namespace: "bananas".to_string(), + }), Some(12345), ) .await; - assert_matches!(got, Err(crate::data::Error::TableNotPresent)); + assert_matches!(got, Err(crate::data::Error::NamespaceNotFound { .. })); // Validate the various write buffer metrics assert_matches!( diff --git a/ingester/src/test_util.rs b/ingester/src/test_util.rs index 09045083e8..05dc226f90 100644 --- a/ingester/src/test_util.rs +++ b/ingester/src/test_util.rs @@ -9,17 +9,16 @@ use arrow::record_batch::RecordBatch; use arrow_util::assert_batches_eq; use bitflags::bitflags; use data_types::{ - CompactionLevel, NamespaceId, NonEmptyString, PartitionId, PartitionKey, Sequence, - SequenceNumber, ShardId, ShardIndex, TableId, Timestamp, Tombstone, TombstoneId, + CompactionLevel, NamespaceId, PartitionId, PartitionKey, Sequence, SequenceNumber, ShardId, + ShardIndex, TableId, }; -use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite}; +use dml::{DmlMeta, DmlOperation, DmlWrite}; use iox_catalog::{interface::Catalog, mem::MemCatalog}; use iox_query::test::{raw_data, TestChunk}; use iox_time::{SystemProvider, Time}; use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; use parquet_file::metadata::IoxMetadata; -use predicate::delete_predicate::parse_delete_predicate; use schema::sort::SortKey; use uuid::Uuid; @@ -28,31 +27,10 @@ use crate::{ partition::{resolver::CatalogPartitionResolver, PersistingBatch, SnapshotBatch}, IngesterData, }, - lifecycle::{LifecycleConfig, LifecycleHandle, LifecycleManager}, + 
lifecycle::{LifecycleConfig, LifecycleManager}, query::QueryableBatch, }; -/// Create tombstone for testing -pub(crate) fn create_tombstone( - id: i64, - table_id: i64, - shard_id: i64, - seq_num: i64, - min_time: i64, - max_time: i64, - predicate: &str, -) -> Tombstone { - Tombstone { - id: TombstoneId::new(id), - table_id: TableId::new(table_id), - shard_id: ShardId::new(shard_id), - sequence_number: SequenceNumber::new(seq_num), - min_time: Timestamp::new(min_time), - max_time: Timestamp::new(max_time), - serialized_predicate: predicate.to_string(), - } -} - #[allow(clippy::too_many_arguments)] pub(crate) fn make_meta( object_store_id: Uuid, @@ -93,15 +71,8 @@ pub(crate) fn make_persisting_batch( partition_id: i64, object_store_id: Uuid, batches: Vec>, - tombstones: Vec, ) -> Arc { - let queryable_batch = make_queryable_batch_with_deletes( - table_name, - partition_id, - seq_num_start, - batches, - tombstones, - ); + let queryable_batch = make_queryable_batch(table_name, partition_id, seq_num_start, batches); Arc::new(PersistingBatch { shard_id: ShardId::new(shard_id), table_id: TableId::new(table_id), @@ -116,16 +87,6 @@ pub(crate) fn make_queryable_batch( partition_id: i64, seq_num_start: i64, batches: Vec>, -) -> Arc { - make_queryable_batch_with_deletes(table_name, partition_id, seq_num_start, batches, vec![]) -} - -pub(crate) fn make_queryable_batch_with_deletes( - table_name: &str, - partition_id: i64, - seq_num_start: i64, - batches: Vec>, - tombstones: Vec, ) -> Arc { // make snapshots for the batches let mut snapshots = vec![]; @@ -140,7 +101,6 @@ pub(crate) fn make_queryable_batch_with_deletes( table_name.into(), PartitionId::new(partition_id), snapshots, - tombstones, )) } @@ -655,65 +615,24 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot_to_persisting(TEST_TABLE, 
&PartitionKey::from(TEST_PARTITION_1)) + .snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } else if loc.contains(DataLocation::SNAPSHOT) { // move partition 1 data to snapshot let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } ingester } -pub(crate) async fn make_ingester_data_with_tombstones(loc: DataLocation) -> IngesterData { - // Whatever data because they won't be used in the tests - let metrics: Arc = Default::default(); - let catalog: Arc = Arc::new(MemCatalog::new(Arc::clone(&metrics))); - let object_store = Arc::new(InMemory::new()); - let exec = Arc::new(iox_query::exec::Executor::new(1)); - let lifecycle = LifecycleManager::new( - LifecycleConfig::new( - 200_000_000, - 100_000_000, - 100_000_000, - Duration::from_secs(100_000_000), - Duration::from_secs(100_000_000), - 100_000_000, - ), - Arc::clone(&metrics), - Arc::new(SystemProvider::default()), - ); - - // Make data for one shard and two tables - let shard_index = ShardIndex::new(0); - let (shard_id, _, _) = - populate_catalog(&*catalog, shard_index, TEST_NAMESPACE, TEST_TABLE).await; - - let ingester = IngesterData::new( - object_store, - Arc::clone(&catalog), - [(shard_id, shard_index)], - exec, - Arc::new(CatalogPartitionResolver::new(catalog)), - backoff::BackoffConfig::default(), - metrics, - ); - - // Make partitions per requested - make_one_partition_with_tombstones(&ingester, &lifecycle.handle(), loc, shard_index, shard_id) - .await; - - ingester -} - /// Make data for one or two partitions per requested pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> Vec { // In-memory data includes these rows but split between 4 groups go into @@ -783,133 +702,6 @@ pub(crate) fn make_partitions(two_partitions: bool, 
shard_index: ShardIndex) -> ops } -/// Make data for one partition with tombstones -async fn make_one_partition_with_tombstones( - ingester: &IngesterData, - lifecycle_handle: &dyn LifecycleHandle, - loc: DataLocation, - shard_index: ShardIndex, - shard_id: ShardId, -) { - // In-memory data includes these rows but split between 4 groups go into - // different batches of parittion 1 or partittion 2 as requeted - // let expected = vec![ - // "+------------+-----+------+--------------------------------+", - // "| city | day | temp | time |", - // "+------------+-----+------+--------------------------------+", - // "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", // in group 1 - seq_num: 2 - // "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", // in group 2 - seq_num: 3 - // "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", // in group 1 - seq_num: 1 --> will get deleted - // "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", // in group 3 - seq_num: 5 --> will get deleted - // "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 8 (after the tombstone's seq num) - // "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4 - // "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 9 - // "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6 - // "+------------+-----+------+--------------------------------+", - // ]; - - let (ops, seq_num) = - make_first_partition_data(&PartitionKey::from(TEST_PARTITION_1), shard_index); - - // Apply all ops - for op in ops { - ingester - .buffer_operation(shard_id, op, lifecycle_handle) - .await - .unwrap(); - } - - if loc.contains(DataLocation::PERSISTING) { - // Move partition 1 data to persisting - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(TEST_NAMESPACE) - .unwrap() - .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) - .await; - } else 
if loc.contains(DataLocation::SNAPSHOT) { - // move partition 1 data to snapshot - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(TEST_NAMESPACE) - .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) - .await; - } - - // Add tombstones - // Depending on where the existing data is, they (buffer & snapshot) will be either moved to a new snapshot after - // applying the tombstone or (persisting) stay where they are and the tombstones is kept to get applied later - // ------------------------------------------ - // Delete - let mut seq_num = seq_num.get(); - seq_num += 1; - - let delete = parse_delete_predicate( - "1970-01-01T00:00:00.000000010Z", - "1970-01-01T00:00:00.000000050Z", - "city=Boston", - ) - .unwrap(); - - ingester - .buffer_operation( - shard_id, - DmlOperation::Delete(DmlDelete::new( - TEST_NAMESPACE.to_string(), - delete, - NonEmptyString::new(TEST_TABLE), - DmlMeta::sequenced( - Sequence { - shard_index, - sequence_number: SequenceNumber::new(seq_num), - }, - Time::MIN, - None, - 42, - ), - )), - lifecycle_handle, - ) - .await - .unwrap(); - - // Group 4: in buffer of p1 after the tombstone - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Medford day="sun",temp=55 22"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); - seq_num += 1; - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Reading day="mon",temp=58 40"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); -} - pub(crate) fn make_write_op( partition_key: &PartitionKey, shard_index: ShardIndex, diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 431c22cdb7..3aae75747d 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ 
-463,7 +463,10 @@ pub trait PartitionRepo: Send + Sync { partition_id: PartitionId, ) -> Result>; - /// Update the sort key for the partition + /// Update the sort key for the partition. + /// + /// NOTE: it is expected that ONLY the ingesters update sort keys for + /// existing partitions. async fn update_sort_key( &mut self, partition_id: PartitionId, diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 7544e65370..d28a5f310d 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1878,7 +1878,7 @@ LIMIT $4; sqlx::query_as::<_, PartitionParam>( r#" SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, - parquet_file.table_id, + parquet_file.table_id, count(case when to_delete is null then 1 end) total_count, max(case when compaction_level= $4 then parquet_file.created_at end) FROM parquet_file diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index 3ace171104..24d6baac34 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -11,7 +11,7 @@ chrono = { version = "0.4", default-features = false } chrono-english = "0.1.4" clap = { version = "4", features = ["derive", "env", "cargo"] } futures = "0.3" -handlebars = "4.3.4" +handlebars = "4.3.5" humantime = "2.1.0" influxdb2_client = { path = "../influxdb2_client" } itertools = "0.10.5" @@ -22,7 +22,7 @@ rand = { version = "0.8.3", features = ["small_rng"] } regex = "1.6" schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } toml = "0.5.9" diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs index 6c85358e4f..ca6be3acde 100644 --- a/iox_query/src/exec/seriesset/converter.rs +++ b/iox_query/src/exec/seriesset/converter.rs @@ -762,7 +762,7 @@ mod tests { 
.unwrap(); // Input has one row that has no value (NULL value) for tag_b, which is its own series - let input = stream_from_batch(batch); + let input = stream_from_batch(batch.schema(), batch); let table_name = "foo"; let tag_columns = ["tag_a", "tag_b"]; @@ -873,7 +873,8 @@ mod tests { .collect(); // stream from those batches - stream_from_batches(batches) + assert!(!batches.is_empty()); + stream_from_batches(batches[0].schema(), batches) }) .collect() } diff --git a/iox_query/src/frontend/influxrpc.rs b/iox_query/src/frontend/influxrpc.rs index 0940aff71b..1a8750c779 100644 --- a/iox_query/src/frontend/influxrpc.rs +++ b/iox_query/src/frontend/influxrpc.rs @@ -17,12 +17,14 @@ use arrow::datatypes::DataType; use data_types::ChunkId; use datafusion::{ error::DataFusionError, + logical_expr::utils::exprlist_to_columns, logical_plan::{col, when, DFSchemaRef, Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder}, + prelude::Column, }; use datafusion_util::AsExpr; use futures::{Stream, StreamExt, TryStreamExt}; use hashbrown::HashSet; -use observability_deps::tracing::{debug, trace}; +use observability_deps::tracing::{debug, trace, warn}; use predicate::{rpc_predicate::InfluxRpcPredicate, Predicate, PredicateMatch}; use query_functions::{ group_by::{Aggregate, WindowDuration}, @@ -31,39 +33,18 @@ use query_functions::{ }; use schema::{selection::Selection, InfluxColumnType, Schema, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::collections::HashSet as StdHashSet; use std::{cmp::Reverse, collections::BTreeSet, sync::Arc}; const CONCURRENT_TABLE_JOBS: usize = 10; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("gRPC planner got error making table_name plan for chunk: {}", source))] - TableNamePlan { - source: Box, - }, - - #[snafu(display("gRPC planner got error listing partition keys: {}", source))] - ListingPartitions { - source: Box, - }, - #[snafu(display("gRPC planner got error finding column names: {}", source))] - 
FindingColumnNames { - source: Box, - }, + FindingColumnNames { source: DataFusionError }, #[snafu(display("gRPC planner got error finding column values: {}", source))] - FindingColumnValues { - source: Box, - }, - - #[snafu(display( - "gRPC planner got internal error making table_name with default predicate: {}", - source - ))] - InternalTableNamePlanForDefault { - source: Box, - }, + FindingColumnValues { source: DataFusionError }, #[snafu(display( "gRPC planner got error fetching chunks for table '{}': {}", @@ -72,7 +53,7 @@ pub enum Error { ))] GettingChunks { table_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -82,19 +63,20 @@ pub enum Error { ))] CheckingChunkPredicate { chunk_id: ChunkId, - source: Box, + source: DataFusionError, }, #[snafu(display("gRPC planner got error creating string set plan: {}", source))] CreatingStringSet { source: StringSetError }, #[snafu(display("gRPC planner got error creating predicates: {}", source))] - CreatingPredicates { - source: datafusion::error::DataFusionError, - }, + CreatingPredicates { source: DataFusionError }, #[snafu(display("gRPC planner got error building plan: {}", source))] - BuildingPlan { + BuildingPlan { source: DataFusionError }, + + #[snafu(display("gRPC planner got error reading columns from expression: {}", source))] + ReadColumns { source: datafusion::error::DataFusionError, }, @@ -148,7 +130,7 @@ pub enum Error { CastingAggregates { agg: Aggregate, field_name: String, - source: datafusion::error::DataFusionError, + source: DataFusionError, }, #[snafu(display("Internal error: unexpected aggregate request for None aggregate",))] @@ -163,6 +145,35 @@ pub enum Error { pub type Result = std::result::Result; +impl Error { + pub fn to_df_error(self, method: &'static str) -> DataFusionError { + let msg = self.to_string(); + + match self { + Self::GettingChunks { source, .. } + | Self::CreatingPredicates { source, .. } + | Self::BuildingPlan { source, .. 
} + | Self::ReadColumns { source, .. } + | Self::CheckingChunkPredicate { source, .. } + | Self::FindingColumnNames { source, .. } + | Self::FindingColumnValues { source, .. } + | Self::CastingAggregates { source, .. } => { + DataFusionError::Context(format!("{method}: {msg}"), Box::new(source)) + } + e @ (Self::CreatingStringSet { .. } + | Self::TableRemoved { .. } + | Self::InvalidTagColumn { .. } + | Self::InternalInvalidTagType { .. } + | Self::DuplicateGroupColumn { .. } + | Self::GroupColumnNotFound { .. } + | Self::CreatingAggregates { .. } + | Self::CreatingScan { .. } + | Self::InternalUnexpectedNoneAggregate {} + | Self::InternalAggregateNotSelector { .. }) => DataFusionError::External(Box::new(e)), + } + } +} + impl From for Error { fn from(source: super::common::Error) -> Self { Self::CreatingScan { source } @@ -227,49 +238,50 @@ impl InfluxRpcPlanner { let table_predicates = rpc_predicate .table_predicates(database.as_meta()) .context(CreatingPredicatesSnafu)?; - let tables: Vec<_> = table_chunk_stream(Arc::clone(&database), &table_predicates, &ctx) - .try_filter_map(|(table_name, predicate, chunks)| async move { - // Identify which chunks can answer from its metadata and then record its table, - // and which chunks needs full plan and group them into their table - let mut chunks_full = vec![]; - for chunk in cheap_chunk_first(chunks) { - trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); + let tables: Vec<_> = + table_chunk_stream(Arc::clone(&database), false, &table_predicates, &ctx) + .try_filter_map(|(table_name, predicate, chunks)| async move { + // Identify which chunks can answer from its metadata and then record its table, + // and which chunks needs full plan and group them into their table + let mut chunks_full = vec![]; + for chunk in cheap_chunk_first(chunks) { + trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); - // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate - // 
deleted data before we can determine if its table participates in the requested predicate. - if chunk.has_delete_predicates() { - chunks_full.push(chunk); - } else { - // Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate + // deleted data before we can determine if its table participates in the requested predicate. + if chunk.has_delete_predicates() { + chunks_full.push(chunk); + } else { + // Try and apply the predicate using only metadata + let pred_result = chunk + .apply_predicate_to_metadata(predicate) + .context(CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + })?; - match pred_result { - PredicateMatch::AtLeastOneNonNullField => { - trace!("Metadata predicate: table matches"); - // Meta data of the table covers predicates of the request - return Ok(Some((table_name, None))); + match pred_result { + PredicateMatch::AtLeastOneNonNullField => { + trace!("Metadata predicate: table matches"); + // Meta data of the table covers predicates of the request + return Ok(Some((table_name, None))); + } + PredicateMatch::Unknown => { + trace!("Metadata predicate: unknown match"); + // We cannot match the predicate to get answer from meta data, let do full plan + chunks_full.push(chunk); + } + PredicateMatch::Zero => { + trace!("Metadata predicate: zero rows match"); + } // this chunk's table does not participate in the request } - PredicateMatch::Unknown => { - trace!("Metadata predicate: unknown match"); - // We cannot match the predicate to get answer from meta data, let do full plan - chunks_full.push(chunk); - } - PredicateMatch::Zero => { - trace!("Metadata predicate: zero rows match"); - } // this chunk's table does not participate in the request } } - } - Ok((!chunks_full.is_empty()) - .then_some((table_name, Some((predicate, 
chunks_full))))) - }) - .try_collect() - .await?; + Ok((!chunks_full.is_empty()) + .then_some((table_name, Some((predicate, chunks_full))))) + }) + .try_collect() + .await?; // Feed builder let mut builder = StringSetPlanBuilder::new(); @@ -341,84 +353,88 @@ impl InfluxRpcPlanner { } } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_need_chunks, &ctx) - .and_then(|(table_name, predicate, chunks)| { - let mut ctx = ctx.child_ctx("table"); - ctx.set_metadata("table", table_name.to_owned()); + let tables: Vec<_> = table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_need_chunks, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| { + let mut ctx = ctx.child_ctx("table"); + ctx.set_metadata("table", table_name.to_owned()); - async move { - let mut chunks_full = vec![]; - let mut known_columns = BTreeSet::new(); + async move { + let mut chunks_full = vec![]; + let mut known_columns = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // Try and apply the predicate using only metadata - let pred_result = chunk - .apply_predicate_to_metadata(predicate) - .context(CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - })?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // get only tag columns from metadata + let schema = chunk.schema(); + + let column_names: Vec<&str> = schema + .tags_iter() + .map(|f| f.name().as_str()) + .collect::>(); + + let selection = Selection::Some(&column_names); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag keys + if chunk.has_delete_predicates() { + debug!( + 
%table_name, + chunk_id=%chunk.id().get(), + "column names need full plan" + ); + chunks_full.push(chunk); + } else { + // filter the columns further from the predicate + let maybe_names = chunk + .column_names( + ctx.child_ctx("column_names execution"), + predicate, + selection, + ) + .context(FindingColumnNamesSnafu)?; + + match maybe_names { + Some(mut names) => { + debug!( + %table_name, + names=?names, + chunk_id=%chunk.id().get(), + "column names found from metadata", + ); + known_columns.append(&mut names); } - - // get only tag columns from metadata - let schema = chunk.schema(); - - let column_names: Vec<&str> = schema - .tags_iter() - .map(|f| f.name().as_str()) - .collect::>(); - - let selection = Selection::Some(&column_names); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag keys - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "column names need full plan" ); chunks_full.push(chunk); - } else { - // filter the columns further from the predicate - let maybe_names = chunk - .column_names( - ctx.child_ctx("column_names execution"), - predicate, - selection, - ) - .context(FindingColumnNamesSnafu)?; - - match maybe_names { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "column names found from metadata", - ); - known_columns.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "column names need full plan" - ); - chunks_full.push(chunk); - } - } } } - - Ok((table_name, predicate, chunks_full, known_columns)) } - }) - .try_collect() - .await?; + } + + Ok((table_name, predicate, chunks_full, known_columns)) + } + }) + .try_collect() + .await?; // At this point, we have a set of column names we know pass // in `known_columns`, and potentially some tables in chunks @@ -492,100 +508,104 @@ impl InfluxRpcPlanner { 
table_predicates_filtered.push((table_name, predicate)); } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_filtered, &ctx) - .and_then(|(table_name, predicate, chunks)| async move { - let mut chunks_full = vec![]; - let mut known_values = BTreeSet::new(); + let tables: Vec<_> = table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_filtered, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| async move { + let mut chunks_full = vec![]; + let mut known_values = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // use schema to validate column type + let schema = chunk.schema(); + + // Skip this table if the tag_name is not a column in this chunk + // Note: This may happen even when the table contains the tag_name, because some chunks may not + // contain all columns. 
+ let idx = if let Some(idx) = schema.find_index_of(tag_name) { + idx + } else { + continue; + }; + + // Validate that this really is a Tag column + let (influx_column_type, field) = schema.field(idx); + ensure!( + matches!(influx_column_type, Some(InfluxColumnType::Tag)), + InvalidTagColumnSnafu { + tag_name, + influx_column_type, + } + ); + ensure!( + influx_column_type + .unwrap() + .valid_arrow_type(field.data_type()), + InternalInvalidTagTypeSnafu { + tag_name, + data_type: field.data_type().clone(), + } + ); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag values + if chunk.has_delete_predicates() { + debug!( + %table_name, + chunk_id=%chunk.id().get(), + "need full plan to find tag values" + ); + + chunks_full.push(chunk); + } else { + // try and get the list of values directly from metadata + let mut ctx = self.ctx.child_ctx("tag_values execution"); + ctx.set_metadata("table", table_name.to_owned()); + + let maybe_values = chunk + .column_values(ctx, tag_name, predicate) + .context(FindingColumnValuesSnafu)?; + + match maybe_values { + Some(mut names) => { + debug!( + %table_name, + names=?names, + chunk_id=%chunk.id().get(), + "tag values found from metadata", + ); + known_values.append(&mut names); } - - // use schema to validate column type - let schema = chunk.schema(); - - // Skip this table if the tag_name is not a column in this chunk - // Note: This may happen even when the table contains the tag_name, because some chunks may not - // contain all columns. 
- let idx = if let Some(idx) = schema.find_index_of(tag_name) { - idx - } else { - continue; - }; - - // Validate that this really is a Tag column - let (influx_column_type, field) = schema.field(idx); - ensure!( - matches!(influx_column_type, Some(InfluxColumnType::Tag)), - InvalidTagColumnSnafu { - tag_name, - influx_column_type, - } - ); - ensure!( - influx_column_type - .unwrap() - .valid_arrow_type(field.data_type()), - InternalInvalidTagTypeSnafu { - tag_name, - data_type: field.data_type().clone(), - } - ); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag values - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "need full plan to find tag values" ); - chunks_full.push(chunk); - } else { - // try and get the list of values directly from metadata - let mut ctx = self.ctx.child_ctx("tag_values execution"); - ctx.set_metadata("table", table_name.to_owned()); - - let maybe_values = chunk - .column_values(ctx, tag_name, predicate) - .context(FindingColumnValuesSnafu)?; - - match maybe_values { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "tag values found from metadata", - ); - known_values.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "need full plan to find tag values" - ); - chunks_full.push(chunk); - } - } } } + } + } - Ok((table_name, predicate, chunks_full, known_values)) - }) - .try_collect() - .await?; + Ok((table_name, predicate, chunks_full, known_values)) + }) + .try_collect() + .await?; let mut builder = StringSetPlanBuilder::new(); @@ -1312,8 +1332,18 @@ impl InfluxRpcPlanner { } /// Stream of chunks for table predicates. +/// This function is used by influx grpc meta queries that want to know which table/tags/fields +/// that match the given predicates. 
+/// `need_fields` means the grpc queries will need to return field columns. If `need_fields` +/// is false, the grpc query does not need to return field columns but it still filters data on the +/// field columns in the predicate +/// +/// This function is directly invoked by `table_name, `tag_keys` and `tag_values` where need_fields should be false. +/// This function is indirectly invoked by `field_columns`, `read_filter`, `read_group` and `read_window_aggregate` +/// through the function `create_plans` where need_fields should be true. fn table_chunk_stream<'a>( database: Arc, + need_fields: bool, table_predicates: &'a [(String, Predicate)], ctx: &'a IOxSessionContext, ) -> impl Stream>)>> + 'a { @@ -1324,9 +1354,22 @@ fn table_chunk_stream<'a>( let database = Arc::clone(&database); + let table_schema = database.table_schema(table_name); + let projection = match table_schema { + Some(table_schema) => { + columns_in_predicates(need_fields, table_schema, table_name, predicate) + } + None => None, + }; + async move { let chunks = database - .chunks(table_name, predicate, ctx.child_ctx("table chunks")) + .chunks( + table_name, + predicate, + &projection, + ctx.child_ctx("table chunks"), + ) .await .context(GettingChunksSnafu { table_name })?; @@ -1336,6 +1379,89 @@ fn table_chunk_stream<'a>( .buffered(CONCURRENT_TABLE_JOBS) } +// Return all columns in predicate's field_columns, exprs and val_exprs. +// Return None means nothing is filtered in this function and all field columns should be used. +// None is returned when: +// - we cannot determine at least one column in the predicate +// - need_fields is true and the predicate does not have any field_columns. +// This signal that all fields are needed. +// Note that the returned columns can also include tag and time columns if they happen to be +// in the predicate. 
+fn columns_in_predicates( + need_fields: bool, + table_schema: Arc, + table_name: &String, + predicate: &Predicate, +) -> Option> { + let mut columns = StdHashSet::new(); + + // columns in field_columns + match &predicate.field_columns { + Some(field_columns) => { + for field in field_columns { + columns.insert(Column { + relation: None, + name: (*field).clone(), + }); + } + } + None => { + if need_fields { + // fields wanted and `field_columns` is empty mean al fields will be needed + return None; + } + } + } + + // columns in exprs + let expr_cols_result = + exprlist_to_columns(&predicate.exprs, &mut columns).context(ReadColumnsSnafu); + + // columns in val_exprs + let exprs: Vec = predicate + .value_expr + .iter() + .map(|e| Expr::from((*e).clone())) + .collect(); + let val_exprs_cols_result = exprlist_to_columns(&exprs, &mut columns).context(ReadColumnsSnafu); + + let projection = if expr_cols_result.is_err() || val_exprs_cols_result.is_err() { + if expr_cols_result.is_err() { + let error_message = expr_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.exprs, ?error_message, "cannot determine columns in predicate.exprs"); + } + if val_exprs_cols_result.is_err() { + let error_message = val_exprs_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.value_expr, ?error_message, "cannot determine columns in predicate.value_expr"); + } + + None + } else { + // convert the column names into their corresponding indexes in the schema + let cols = columns + .iter() + .map(|c| table_schema.find_index_of(&c.name)) + .collect::>(); + + if cols.contains(&None) || cols.is_empty() { + // At least one column has no matching index, we do not know which + // columns to filter. 
Read all columns + warn!( + ?table_name, + ?predicate, + ?table_schema, + "cannot find index for at least one column in the table schema" + ); + None + } else { + // We know which columns to filter, read only those columns + Some(cols.into_iter().flatten().collect::>()) + } + }; + + projection +} + /// Create plans that fetch the data specified in table_predicates. /// /// table_predicates contains `(table_name, predicate_specialized_for_that_table)` @@ -1364,7 +1490,7 @@ where + Sync, P: Send, { - table_chunk_stream(Arc::clone(&database), table_predicates, &ctx) + table_chunk_stream(Arc::clone(&database), true, table_predicates, &ctx) .and_then(|(table_name, predicate, chunks)| async move { let chunks = prune_chunks_metadata(chunks, predicate)?; Ok((table_name, predicate, chunks)) @@ -1762,15 +1888,462 @@ fn cheap_chunk_first(mut chunks: Vec>) -> Vec all columns will be selected + let need_fields = true; + + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because we asked it return all fileds (and implicit PK) even though the predicate is on `foo` only + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + //////////////////////////// + // Test 2: no need_fields --> only PK + columns in predicate are return + let need_fields = false; + + let 
test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes only 3 columns of the table PK + cols in predicate + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 3); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_empty_pred() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // empty predicate + let predicate = Predicate::new(); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + ///////////// + // Test 1: empty predicate with need_fields + let need_fields = true; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of 
the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: empty predicate without need_fields + let need_fields = false; + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_on_tag_no_data() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column(), // no row added for this chunk on purpose + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on a tag column `foo` + let expr = col("foo").eq(lit("some_thing")); + let predicate = 
Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // Since no data, we do not do pushdown in the test chunk. + // the no-data returned chunk will include all columns of the table + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_and_field_columns() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + let need_fields = false; + + ///////////// + // Test 1: predicate on field `i64_field_2` and `field_columns` is empty + // predicate on field column + let expr = col("i64_field_2").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + 
.try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_2 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(3).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: predicate on tag `foo` and `field_columns` is not empty + let expr = col("bar").eq(lit(10)); + let predicate = Predicate::new() + .with_expr(expr) + .with_field_columns(vec!["i64_field".to_string()]); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_1 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn 
test_table_chunk_stream_pred_on_unknown_field() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on unknown column + let expr = col("unknown_name").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns since we hit the unknown columnd + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + #[tokio::test] async fn test_predicate_rewrite_table_names() { run_test(|test_db, rpc_predicate| { diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index a0bd37a68b..7863e9750f 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -14,7 +14,7 @@ use async_trait::async_trait; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary, 
TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use exec::{stringset::StringSet, IOxSessionContext}; use hashbrown::HashMap; use observability_deps::tracing::{debug, trace}; @@ -141,9 +141,6 @@ impl Drop for QueryCompletedToken { /// This avoids storing potentially large strings pub type QueryText = Box; -/// Error type for [`QueryDatabase`] operations. -pub type QueryDatabaseError = Box; - /// A `Database` is the main trait implemented by the IOx subsystems /// that store actual data. /// @@ -154,12 +151,15 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { /// Returns a set of chunks within the partition with data that may match /// the provided predicate. If possible, chunks which have no rows that can /// possibly match the predicate may be omitted. + /// If projection is None, returned chunks will include all columns of its original data. Otherwise, + /// returned chunks will includs PK columns (tags and time) and columns specified in the projection. async fn chunks( &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError>; + ) -> Result>, DataFusionError>; /// Record that particular type of query was run / planned fn record_query( @@ -175,9 +175,6 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { fn as_meta(&self) -> &dyn QueryDatabaseMeta; } -/// Error type for [`QueryChunk`] operations. -pub type QueryChunkError = Box; - /// Collection of data that shares the same partition key pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { /// returns the Id of this chunk. 
Ids are unique within a @@ -200,7 +197,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { Ok(self .summary() .map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow())) @@ -216,7 +213,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Return a set of Strings containing the distinct values in the /// specified columns. If the predicate can be evaluated entirely @@ -228,7 +225,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Provides access to raw `QueryChunk` data as an /// asynchronous stream of `RecordBatch`es filtered by a *required* @@ -248,7 +245,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result; + ) -> Result; /// Returns chunk type. Useful in tests and debug logs. 
fn chunk_type(&self) -> &str; diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs index 23cb2e2f6a..cf143dcb57 100644 --- a/iox_query/src/provider/adapter.rs +++ b/iox_query/src/provider/adapter.rs @@ -262,7 +262,7 @@ mod tests { let batch = make_batch(); let output_schema = batch.schema(); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -291,7 +291,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -321,7 +321,7 @@ mod tests { Field::new("d", DataType::Float32, true), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -349,7 +349,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!( @@ -368,7 +368,7 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which 
had type 'Float32'"); diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index dee2d1120b..e7a0503f1c 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -8,8 +8,8 @@ use crate::{ stringset::{StringSet, StringSetRef}, ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext, }, - Predicate, PredicateMatch, QueryChunk, QueryChunkError, QueryChunkMeta, QueryCompletedToken, - QueryDatabase, QueryDatabaseError, QueryText, + Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase, + QueryText, }; use arrow::{ array::{ @@ -24,7 +24,7 @@ use data_types::{ ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues, Statistics, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use datafusion_util::stream_from_batches; use futures::StreamExt; use hashbrown::HashSet; @@ -108,18 +108,54 @@ impl QueryDatabase for TestDatabase { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, _ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { // save last predicate *self.chunks_predicate.lock() = predicate.clone(); - let partitions = self.partitions.lock(); - Ok(partitions + let partitions = self.partitions.lock().clone(); + let chunks = partitions .values() .flat_map(|x| x.values()) .filter(|x| x.table_name == table_name) - .map(|x| Arc::clone(x) as _) - .collect()) + .map(|x| Arc::clone(x) as Arc) + .collect::>(); + + // Return chunks with fewer columns if a projection is specified + let mut new_chunks = Vec::with_capacity(chunks.len()); + for c in chunks { + let schema = c.schema(); + let cols = schema.select_given_and_pk_columns(projection); + let cols = cols.iter().map(|c| c.as_str()).collect::>(); + let selection = Selection::Some(&cols); + + let read_result = + 
c.read_filter(IOxSessionContext::with_testing(), predicate, selection); + if read_result.is_err() { + return Err(read_result.err().unwrap()); + } + let mut stream = read_result.unwrap(); + + let mut new_chunk = TestChunk::new(c.table_name()); + while let Some(b) = stream.next().await { + let b = b.expect("Error in stream"); + new_chunk.table_data.push(Arc::new(b)); + } + + let new_chunk = if !new_chunk.table_data.is_empty() { + let new_schema = Schema::try_from(new_chunk.table_data[0].schema()).unwrap(); + let new_chunk = new_chunk.add_schema_to_table(new_schema, true, None); + Arc::new(new_chunk) as _ + } else { + // No data, return the original empty chunk with the original schema + c + }; + + new_chunks.push(new_chunk); + } + + Ok(new_chunks) } fn record_query( @@ -327,9 +363,9 @@ impl TestChunk { } /// Checks the saved error, and returns it if any, otherwise returns OK - fn check_error(&self) -> Result<(), QueryChunkError> { + fn check_error(&self) -> Result<(), DataFusionError> { if let Some(message) = self.saved_error.as_ref() { - Err(message.clone().into()) + Err(DataFusionError::External(message.clone().into())) } else { Ok(()) } @@ -509,12 +545,8 @@ impl TestChunk { mut self, new_column_schema: Schema, add_column_summary: bool, - stats: Option, + input_stats: Option, ) -> Self { - // assume the new schema has exactly a single table - assert_eq!(new_column_schema.len(), 1); - let (col_type, new_field) = new_column_schema.field(0); - let mut merger = SchemaMerger::new(); merger = merger.merge(&new_column_schema).unwrap(); merger = merger @@ -522,34 +554,38 @@ impl TestChunk { .expect("merging was successful"); self.schema = merger.build(); - if add_column_summary { - let influxdb_type = col_type.map(|t| match t { - InfluxColumnType::Tag => InfluxDbType::Tag, - InfluxColumnType::Field(_) => InfluxDbType::Field, - InfluxColumnType::Timestamp => InfluxDbType::Timestamp, - }); + for i in 0..new_column_schema.len() { + let (col_type, new_field) = 
new_column_schema.field(i); + if add_column_summary { + let influxdb_type = col_type.map(|t| match t { + InfluxColumnType::Tag => InfluxDbType::Tag, + InfluxColumnType::Field(_) => InfluxDbType::Field, + InfluxColumnType::Timestamp => InfluxDbType::Timestamp, + }); - let stats = stats.unwrap_or_else(|| match new_field.data_type() { - DataType::Boolean => Statistics::Bool(StatValues::default()), - DataType::Int64 => Statistics::I64(StatValues::default()), - DataType::UInt64 => Statistics::U64(StatValues::default()), - DataType::Utf8 => Statistics::String(StatValues::default()), - DataType::Dictionary(_, value_type) => { - assert!(matches!(**value_type, DataType::Utf8)); - Statistics::String(StatValues::default()) - } - DataType::Float64 => Statistics::F64(StatValues::default()), - DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), - _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), - }); + let stats = input_stats.clone(); + let stats = stats.unwrap_or_else(|| match new_field.data_type() { + DataType::Boolean => Statistics::Bool(StatValues::default()), + DataType::Int64 => Statistics::I64(StatValues::default()), + DataType::UInt64 => Statistics::U64(StatValues::default()), + DataType::Utf8 => Statistics::String(StatValues::default()), + DataType::Dictionary(_, value_type) => { + assert!(matches!(**value_type, DataType::Utf8)); + Statistics::String(StatValues::default()) + } + DataType::Float64 => Statistics::F64(StatValues::default()), + DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), + _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), + }); - let column_summary = ColumnSummary { - name: new_field.name().clone(), - influxdb_type, - stats, - }; + let column_summary = ColumnSummary { + name: new_field.name().clone(), + influxdb_type, + stats, + }; - self.table_summary.columns.push(column_summary); + self.table_summary.columns.push(column_summary); + } } self @@ -921,13 +957,17 @@ 
impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate self.predicates.lock().push(predicate.clone()); - let batches = match self.schema.df_projection(selection)? { + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? + { None => self.table_data.clone(), Some(projection) => self .table_data @@ -938,7 +978,8 @@ impl QueryChunk for TestChunk { }) .collect::, ArrowError>>()?, }; - Ok(stream_from_batches(batches)) + + Ok(stream_from_batches(self.schema().as_arrow(), batches)) } fn chunk_type(&self) -> &str { @@ -948,7 +989,7 @@ impl QueryChunk for TestChunk { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate @@ -967,7 +1008,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // Model not being able to get column values from metadata Ok(None) } @@ -977,7 +1018,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { self.check_error()?; // save the predicate diff --git a/iox_tests/Cargo.toml b/iox_tests/Cargo.toml index 8760728d4e..514bfb5754 100644 --- a/iox_tests/Cargo.toml +++ b/iox_tests/Cargo.toml @@ -14,7 +14,7 @@ iox_catalog = { path = "../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } once_cell = { version = "1.15.0", features = ["parking_lot"] } parquet_file = { path = "../parquet_file" } diff --git a/ioxd_common/Cargo.toml b/ioxd_common/Cargo.toml index 
eb41af0c4f..26d9d3fdeb 100644 --- a/ioxd_common/Cargo.toml +++ b/ioxd_common/Cargo.toml @@ -40,7 +40,7 @@ log = "0.4" parking_lot = "0.12" reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" serde_urlencoded = "0.7.0" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } diff --git a/ioxd_compactor/Cargo.toml b/ioxd_compactor/Cargo.toml index 3fae827159..6cbe04119c 100644 --- a/ioxd_compactor/Cargo.toml +++ b/ioxd_compactor/Cargo.toml @@ -15,7 +15,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } iox_query = { path = "../iox_query" } -object_store = "0.5.0" +object_store = "0.5.1" iox_time = { path = "../iox_time" } trace = { path = "../trace" } diff --git a/ioxd_ingester/Cargo.toml b/ioxd_ingester/Cargo.toml index db8f65e202..11e3118c2d 100644 --- a/ioxd_ingester/Cargo.toml +++ b/ioxd_ingester/Cargo.toml @@ -11,7 +11,7 @@ ingester = { path = "../ingester" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" iox_query = { path = "../iox_query" } trace = { path = "../trace" } write_buffer = { path = "../write_buffer" } diff --git a/ioxd_querier/Cargo.toml b/ioxd_querier/Cargo.toml index e90a4a68df..60574ed73d 100644 --- a/ioxd_querier/Cargo.toml +++ b/ioxd_querier/Cargo.toml @@ -11,7 +11,7 @@ generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" querier = { path = "../querier" } iox_query = { path = "../iox_query" } router = { path = "../router" } diff --git a/ioxd_router/Cargo.toml b/ioxd_router/Cargo.toml index 
5797a9cf01..1ae3d3ab2a 100644 --- a/ioxd_router/Cargo.toml +++ b/ioxd_router/Cargo.toml @@ -11,7 +11,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } router = { path = "../router" } sharder = { path = "../sharder" } diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 60838a8e28..f04cb909ef 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -10,7 +10,7 @@ bytes = "1.2" futures = "0.3" iox_time = { version = "0.1.0", path = "../iox_time" } metric = { version = "0.1.0", path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" pin-project = "1.0.12" tokio = { version = "1.21", features = ["io-util"] } workspace-hack = { path = "../workspace-hack" } diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 6fd9bafa4f..783b1ddca4 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -14,7 +14,7 @@ datafusion_util = { path = "../datafusion_util" } futures = "0.3" generated_types = { path = "../generated_types" } iox_time = { path = "../iox_time" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet = {version = "23.0.0", features = ["experimental"]} diff --git a/parquet_to_line_protocol/Cargo.toml b/parquet_to_line_protocol/Cargo.toml index 9b4cc08004..5273a01dd3 100644 --- a/parquet_to_line_protocol/Cargo.toml +++ b/parquet_to_line_protocol/Cargo.toml @@ -10,7 +10,7 @@ datafusion = { path = "../datafusion" } influxdb_line_protocol = { path = "../influxdb_line_protocol" } futures = {version = "0.3"} num_cpus = "1.13.1" -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } parquet_file = { path = "../parquet_file" } schema = { path = "../schema" } 
tokio = "1.0" diff --git a/predicate/Cargo.toml b/predicate/Cargo.toml index 9bf303b6c1..e1d423255f 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -13,9 +13,9 @@ itertools = "0.10" observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" -sqlparser = "0.24.0" +sqlparser = "0.25.0" workspace-hack = { path = "../workspace-hack"} [dev-dependencies] diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index 03b52e521d..633a345e50 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -12,7 +12,6 @@ pub mod delete_expr; pub mod delete_predicate; -pub mod rewrite; pub mod rpc_predicate; use arrow::{ diff --git a/predicate/src/rpc_predicate.rs b/predicate/src/rpc_predicate.rs index 2836a6e57e..833dfdc063 100644 --- a/predicate/src/rpc_predicate.rs +++ b/predicate/src/rpc_predicate.rs @@ -1,19 +1,23 @@ +mod column_rewrite; mod field_rewrite; mod measurement_rewrite; +mod rewrite; mod value_rewrite; -use crate::{rewrite, Predicate}; +use crate::Predicate; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::lit; use datafusion::logical_plan::{ - Column, Expr, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, + Column, Expr, ExprRewritable, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, }; +use observability_deps::tracing::{debug, trace}; use schema::Schema; use std::collections::BTreeSet; use std::sync::Arc; +use self::column_rewrite::MissingColumnRewriter; use self::field_rewrite::FieldProjectionRewriter; use self::measurement_rewrite::rewrite_measurement_references; use self::value_rewrite::rewrite_field_value_references; @@ -187,6 +191,7 @@ fn normalize_predicate( let mut predicate = predicate.clone(); let mut field_projections = FieldProjectionRewriter::new(Arc::clone(&schema)); + let 
mut missing_columums = MissingColumnRewriter::new(Arc::clone(&schema)); let mut field_value_exprs = vec![]; @@ -194,24 +199,38 @@ fn normalize_predicate( .exprs .into_iter() .map(|e| { - rewrite_measurement_references(table_name, e) + debug!(?e, "rewriting expr"); + + let e = rewrite_measurement_references(table_name, e) + .map(|e| log_rewrite(e, "rewrite_measurement_references")) // Rewrite any references to `_value = some_value` to literal true values. // Keeps track of these expressions, which can then be used to // augment field projections with conditions using `CASE` statements. .and_then(|e| rewrite_field_value_references(&mut field_value_exprs, e)) + .map(|e| log_rewrite(e, "rewrite_field_value_references")) // Rewrite any references to `_field` with a literal // and keep track of referenced field names to add to // the field column projection set. .and_then(|e| field_projections.rewrite_field_exprs(e)) + .map(|e| log_rewrite(e, "field_projections")) + // remove references to columns that don't exist in this schema + .and_then(|e| e.rewrite(&mut missing_columums)) + .map(|e| log_rewrite(e, "missing_columums")) // apply IOx specific rewrites (that unlock other simplifications) .and_then(rewrite::rewrite) - // Call the core DataFusion simplification logic + .map(|e| log_rewrite(e, "rewrite")) + // Call DataFusion simplification logic .and_then(|e| { let adapter = SimplifyAdapter::new(schema.as_ref()); // simplify twice to ensure "full" cleanup e.simplify(&adapter)?.simplify(&adapter) }) + .map(|e| log_rewrite(e, "simplify_expr")) .and_then(rewrite::simplify_predicate) + .map(|e| log_rewrite(e, "simplify_expr")); + + debug!(?e, "rewritten expr"); + e }) // Filter out literal true so is_empty works correctly .filter(|f| match f { @@ -227,6 +246,11 @@ fn normalize_predicate( field_projections.add_to_predicate(predicate) } +fn log_rewrite(expr: Expr, description: &str) -> Expr { + trace!(?expr, %description, "After rewrite"); + expr +} + struct 
SimplifyAdapter<'a> { schema: &'a Schema, execution_props: ExecutionProps, @@ -290,9 +314,27 @@ mod tests { use super::*; use arrow::datatypes::DataType; - use datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + scalar::ScalarValue, + }; use test_helpers::assert_contains; + #[test] + fn test_normalize_predicate_coerced() { + let schema = schema(); + let predicate = normalize_predicate( + "table", + Arc::clone(&schema), + &Predicate::new().with_expr(col("t1").eq(lit("f1"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(col("t1").eq(lit("f1"))); + + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite() { let predicate = normalize_predicate( @@ -336,6 +378,20 @@ mod tests { assert_eq!(predicate, expected); } + #[test] + fn test_normalize_predicate_field_non_tag() { + // should treat + let predicate = normalize_predicate( + "table", + schema(), + &Predicate::new().with_expr(col("not_a_tag").eq(lit("blarg"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(lit(ScalarValue::Boolean(None))); + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite_multi_field_unsupported() { let err = normalize_predicate( diff --git a/predicate/src/rpc_predicate/column_rewrite.rs b/predicate/src/rpc_predicate/column_rewrite.rs new file mode 100644 index 0000000000..7a29331fca --- /dev/null +++ b/predicate/src/rpc_predicate/column_rewrite.rs @@ -0,0 +1,99 @@ +use std::sync::Arc; + +use datafusion::{ + error::Result as DataFusionResult, logical_plan::ExprRewriter, prelude::*, scalar::ScalarValue, +}; +use schema::Schema; + +/// Logic for rewriting expressions from influxrpc that reference non +/// existent columns to NULL +#[derive(Debug)] +pub(crate) struct MissingColumnRewriter { + /// The input schema + schema: Arc, +} + +impl MissingColumnRewriter { + /// Create a new [`MissingColumnRewriter`] targeting the given schema + pub(crate) fn new(schema: 
Arc) -> Self { + Self { schema } + } + + fn column_exists(&self, col: &Column) -> DataFusionResult { + // todo a real error here (rpc_predicates shouldn't have table/relation qualifiers) + assert!(col.relation.is_none()); + + if self.schema.find_index_of(&col.name).is_some() { + Ok(true) + } else { + Ok(false) + } + } +} + +fn lit_null() -> Expr { + lit(ScalarValue::Utf8(None)) +} + +impl ExprRewriter for MissingColumnRewriter { + fn mutate(&mut self, expr: Expr) -> DataFusionResult { + Ok(match expr { + Expr::Column(col) if !self.column_exists(&col)? => lit_null(), + expr => expr, + }) + } +} + +#[cfg(test)] +mod tests { + use datafusion::{arrow::datatypes::DataType, logical_plan::ExprRewritable}; + use schema::SchemaBuilder; + + use super::*; + + #[test] + fn all_columns_defined_no_rewrite() { + // t1 = "foo" + let expr = col("t1").eq(lit("foo")); + assert_eq!(rewrite(expr.clone()), expr); + + // f1 > 1.0 + let expr = col("f1").gt(lit(1.0)); + assert_eq!(rewrite(expr.clone()), expr); + } + + #[test] + fn all_columns_not_defined() { + // non_defined = "foo" --> NULL = "foo" + let expr = col("non_defined").eq(lit("foo")); + let expected = lit_null().eq(lit("foo")); + assert_eq!(rewrite(expr), expected); + + // non_defined = 1.4 --> NULL = 1.4 + let expr = col("non_defined").eq(lit(1.4)); + // No type is inferred so this is a literal null string (even though it maybe should be a literal float) + let expected = lit_null().eq(lit(1.4)); + assert_eq!(rewrite(expr), expected); + } + + #[test] + fn some_columns_not_defined() { + // t1 = "foo" AND non_defined = "bar" --> t1 = "foo" and NULL = "bar" + let expr = col("t1") + .eq(lit("foo")) + .and(col("non_defined").eq(lit("bar"))); + let expected = col("t1").eq(lit("foo")).and(lit_null().eq(lit("bar"))); + assert_eq!(rewrite(expr), expected); + } + + fn rewrite(expr: Expr) -> Expr { + let schema = SchemaBuilder::new() + .tag("t1") + .field("f1", DataType::Int64) + .build() + .unwrap(); + + let mut rewriter = 
MissingColumnRewriter::new(Arc::new(schema)); + expr.rewrite(&mut rewriter).unwrap() + } +} diff --git a/predicate/src/rpc_predicate/field_rewrite.rs b/predicate/src/rpc_predicate/field_rewrite.rs index 3cccfa219a..3f983a28e7 100644 --- a/predicate/src/rpc_predicate/field_rewrite.rs +++ b/predicate/src/rpc_predicate/field_rewrite.rs @@ -55,8 +55,8 @@ impl FieldProjectionRewriter { } } - // Rewrites the predicate. See the description on - // [`FieldProjectionRewriter`] for more details. + /// Rewrites the predicate. See the description on + /// [`FieldProjectionRewriter`] for more details. pub(crate) fn rewrite_field_exprs(&mut self, expr: Expr) -> DataFusionResult { // for predicates like `A AND B AND C` // rewrite `A`, `B` and `C` separately and put them back together diff --git a/predicate/src/rewrite.rs b/predicate/src/rpc_predicate/rewrite.rs similarity index 100% rename from predicate/src/rewrite.rs rename to predicate/src/rpc_predicate/rewrite.rs diff --git a/querier/Cargo.toml b/querier/Cargo.toml index 9d55643c4b..02fe680c07 100644 --- a/querier/Cargo.toml +++ b/querier/Cargo.toml @@ -18,7 +18,7 @@ generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } diff --git a/querier/src/cache/read_buffer.rs b/querier/src/cache/read_buffer.rs index 4c68bcac9d..63138e242a 100644 --- a/querier/src/cache/read_buffer.rs +++ b/querier/src/cache/read_buffer.rs @@ -470,9 +470,9 @@ mod tests { .into_iter() .map(lp_to_record_batch) .map(Arc::new) - .collect(); + .collect::>(); - let stream = stream_from_batches(batches); + let stream = stream_from_batches(batches[0].schema(), batches); let metric_registry = metric::Registry::new(); diff --git a/querier/src/chunk/query_access.rs 
b/querier/src/chunk/query_access.rs index 0edf477ec7..dc94a55b69 100644 --- a/querier/src/chunk/query_access.rs +++ b/querier/src/chunk/query_access.rs @@ -7,13 +7,16 @@ use arrow::{ use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::{ - stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, + }, }; use futures::{Stream, TryStreamExt}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::debug; use predicate::Predicate; @@ -114,7 +117,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("projection", format!("{}", columns)); ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -161,10 +164,10 @@ impl QueryChunk for QuerierChunk { None } Err(other) => { - return Err(Box::new(Error::RBChunk { + return Err(DataFusionError::External(Box::new(Error::RBChunk { source: other, chunk_id: self.id(), - })) + }))) } }; @@ -178,7 +181,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("column_name", column_name.to_string()); ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -205,11 +208,13 @@ impl QueryChunk for QuerierChunk { }; ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate)); - let mut values = rb_chunk.column_values( - rb_predicate, - Selection::Some(&[column_name]), - BTreeMap::new(), - )?; + let mut values = rb_chunk + .column_values( + rb_predicate, + 
Selection::Some(&[column_name]), + BTreeMap::new(), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // The InfluxRPC frontend only supports getting column values // for one column at a time (this is a restriction on the Influx @@ -221,7 +226,8 @@ impl QueryChunk for QuerierChunk { .context(ColumnNameNotFoundSnafu { chunk_id: self.id(), column_name, - })?; + }) + .map_err(|e| DataFusionError::External(Box::new(e)))?; ctx.set_metadata("output_values", values.len() as i64); Ok(Some(values)) @@ -234,7 +240,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { let span_recorder = SpanRecorder::new( ctx.span() .map(|span| span.child("QuerierChunk::read_filter")), diff --git a/querier/src/ingester/mod.rs b/querier/src/ingester/mod.rs index aac2635c29..9c9f7a8910 100644 --- a/querier/src/ingester/mod.rs +++ b/querier/src/ingester/mod.rs @@ -11,6 +11,7 @@ use data_types::{ ChunkId, ChunkOrder, IngesterMapping, PartitionId, SequenceNumber, ShardId, ShardIndex, TableSummary, TimestampMinMax, }; +use datafusion::error::DataFusionError; use datafusion_util::MemoryStream; use futures::{stream::FuturesUnordered, TryStreamExt}; use generated_types::{ @@ -24,7 +25,7 @@ use influxdb_iox_client::flight::{ use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, util::compute_timenanosecond_min_max, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use iox_time::{Time, TimeProvider}; use metric::{DurationHistogram, Metric}; @@ -612,9 +613,7 @@ impl IngesterStreamDecoder { partition_id, shard_id, status.parquet_max_sequence_number.map(SequenceNumber::new), - status - .tombstone_max_sequence_number - .map(SequenceNumber::new), + None, partition_sort_key, ); self.current_partition = Some(partition); @@ -1097,7 +1096,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, 
QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1107,7 +1106,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1117,11 +1116,15 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data"); // Apply selection to in-memory batch - let batches = match self.schema.df_projection(selection)? { + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? + { None => self.batches.clone(), Some(projection) => self .batches @@ -1333,7 +1336,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, ))], @@ -1389,7 +1391,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1399,7 +1400,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1409,7 +1409,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1489,7 +1488,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1519,7 +1517,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: Some(21), - tombstone_max_sequence_number: Some(22), }), }, )), @@ -1544,7 +1541,6 @@ mod tests { partition_id: 3, status: Some(PartitionStatus { 
parquet_max_sequence_number: Some(31), - tombstone_max_sequence_number: Some(32), }), }, )), @@ -1574,10 +1570,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 2); assert_eq!(p1.chunks[0].schema().as_arrow(), schema_1_1); assert_eq!(p1.chunks[0].batches.len(), 2); @@ -1594,10 +1587,7 @@ mod tests { p2.parquet_max_sequence_number, Some(SequenceNumber::new(21)) ); - assert_eq!( - p2.tombstone_max_sequence_number, - Some(SequenceNumber::new(22)) - ); + assert_eq!(p2.tombstone_max_sequence_number, None); assert_eq!(p2.chunks.len(), 1); assert_eq!(p2.chunks[0].schema().as_arrow(), schema_2_1); assert_eq!(p2.chunks[0].batches.len(), 1); @@ -1610,10 +1600,7 @@ mod tests { p3.parquet_max_sequence_number, Some(SequenceNumber::new(31)) ); - assert_eq!( - p3.tombstone_max_sequence_number, - Some(SequenceNumber::new(32)) - ); + assert_eq!(p3.tombstone_max_sequence_number, None); assert_eq!(p3.chunks.len(), 1); assert_eq!(p3.chunks[0].schema().as_arrow(), schema_3_1); assert_eq!(p3.chunks[0].batches.len(), 1); @@ -1733,7 +1720,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1773,10 +1759,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 1); } diff --git a/querier/src/namespace/query_access.rs b/querier/src/namespace/query_access.rs index b7451000b3..30b9975a06 100644 --- a/querier/src/namespace/query_access.rs +++ b/querier/src/namespace/query_access.rs @@ -11,10 +11,11 @@ use data_types::NamespaceId; use datafusion::{ catalog::{catalog::CatalogProvider, 
schema::SchemaProvider}, datasource::TableProvider, + error::DataFusionError, }; use iox_query::{ exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext}, - QueryChunk, QueryCompletedToken, QueryDatabase, QueryDatabaseError, QueryText, DEFAULT_SCHEMA, + QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA, }; use observability_deps::tracing::{debug, trace}; use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; @@ -40,8 +41,9 @@ impl QueryDatabase for QuerierNamespace { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { debug!(%table_name, %predicate, "Finding chunks for table"); // get table metadata let table = match self.tables.get(table_name).map(Arc::clone) { @@ -57,7 +59,7 @@ impl QueryDatabase for QuerierNamespace { .chunks( predicate, ctx.span().map(|span| span.child("querier table chunks")), - &None, // todo: pushdown projection to chunks + projection, ) .await?; @@ -627,7 +629,7 @@ mod tests { .unwrap_err(); assert_eq!( err.to_string(), - format!("Cannot build plan: External error: Chunk pruning failed: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."), + format!("Cannot build plan: Resources exhausted: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. 
Try adjusting your compactor settings or increasing the per query memory limit."), ); } diff --git a/querier/src/table/mod.rs b/querier/src/table/mod.rs index 19835fde6f..767fa6c83a 100644 --- a/querier/src/table/mod.rs +++ b/querier/src/table/mod.rs @@ -8,6 +8,7 @@ use crate::{ IngesterConnection, }; use data_types::{ColumnId, PartitionId, ShardIndex, TableId, TimestampMinMax}; +use datafusion::error::DataFusionError; use futures::{join, StreamExt}; use iox_query::pruning::prune_summaries; use iox_query::{exec::Executor, provider, provider::ChunkPruner, QueryChunk}; @@ -65,6 +66,17 @@ pub enum Error { pub type Result = std::result::Result; +impl From for DataFusionError { + fn from(err: Error) -> Self { + match err { + Error::ChunkPruning { + source: err @ provider::Error::TooMuchData { .. }, + } => Self::ResourcesExhausted(err.to_string()), + _ => Self::External(Box::new(err) as _), + } + } +} + /// Args to create a [`QuerierTable`]. pub struct QuerierTableArgs { pub sharder: Arc>>, diff --git a/querier/src/table/query_access/mod.rs b/querier/src/table/query_access/mod.rs index 5665f79171..e16830577b 100644 --- a/querier/src/table/query_access/mod.rs +++ b/querier/src/table/query_access/mod.rs @@ -66,8 +66,7 @@ impl TableProvider for QuerierTable { ctx.child_span("querier table chunks"), projection, ) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; + .await?; for chunk in chunks { builder = builder.add_chunk(chunk); diff --git a/querier/src/table/state_reconciler.rs b/querier/src/table/state_reconciler.rs index baa2935911..d5fe4cced6 100644 --- a/querier/src/table/state_reconciler.rs +++ b/querier/src/table/state_reconciler.rs @@ -23,6 +23,7 @@ use crate::{ use self::interface::{IngesterPartitionInfo, ParquetFileInfo, TombstoneInfo}; #[derive(Snafu, Debug)] +#[allow(missing_copy_implementations)] pub enum ReconcileError { #[snafu(display("Compactor processed file that the querier would need to split apart which is not yet implemented"))] 
CompactorConflict, diff --git a/query_tests/cases/in/delete_all.expected b/query_tests/cases/in/delete_all.expected deleted file mode 100644 index ba828eab9a..0000000000 --- a/query_tests/cases/in/delete_all.expected +++ /dev/null @@ -1,25 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunkDeleteAll --- SQL: SELECT * from cpu; -++ -++ --- SQL: SELECT time from cpu; -++ -++ --- SQL: SELECT count(*), count(bar), count(time) from cpu; -+-----------------+----------------+-----------------+ -| COUNT(UInt8(1)) | COUNT(cpu.bar) | COUNT(cpu.time) | -+-----------------+----------------+-----------------+ -| 0 | 0 | 0 | -+-----------------+----------------+-----------------+ --- SQL: SELECT min(bar), max(bar), min(time), max(time) from cpu; -+--------------+--------------+---------------+---------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+--------------+--------------+---------------+---------------+ -| | | | | -+--------------+--------------+---------------+---------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| | -+--------------+ diff --git a/query_tests/cases/in/delete_all.sql b/query_tests/cases/in/delete_all.sql deleted file mode 100644 index b79612846e..0000000000 --- a/query_tests/cases/in/delete_all.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunkDeleteAll - --- select * -SELECT * from cpu; - --- select one specific column -SELECT time from cpu; - --- select aggregate of every column inlcuding star -SELECT count(*), count(bar), count(time) from cpu; - --- select aggregate of every column -SELECT min(bar), max(bar), min(time), max(time) from cpu; - --- select aggregate of one column -SELECT max(bar) from cpu; \ No newline at end of file diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_multi_expr_one_chunk.expected deleted file mode 100644 
index f0765f7c16..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.expected +++ /dev/null @@ -1,207 +0,0 @@ --- Test Setup: OneDeleteMultiExprsOneChunk --- SQL: SELECT * from cpu order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by time, bar; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000020Z | 2 | -| 1970-01-01T00:00:00.000000040Z | 1 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 2 | -+-----+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 2 | 2 | 2 | 1 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000040Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 2 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 2 | -+----------------+ --- SQL: 
SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu; --- Results After Sorting -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT min(foo) as min_foo from cpu order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu order by max_foo; -+---------+ -| max_foo | -+---------+ -| you | -+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by time order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo; -+--------------------------------+---------+ -| time | max_foo | -+--------------------------------+---------+ -| 1970-01-01T00:00:00.000000020Z | you | -| 1970-01-01T00:00:00.000000040Z | me | -+--------------------------------+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -+-----+---------+ -| bar | max_foo | -+-----+---------+ -| 1 | me | -| 2 | you | -+-----+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT min(time) as min_time from cpu order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu order by max_time; 
-+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT min(time) as min_time from cpu group by bar order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -+-----+--------------------------------+ -| bar | min_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000040Z | -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by foo order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; -+-----+--------------------------------+ -| foo | max_time | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time from cpu; --- Results After Sorting -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu order by 1; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 2 | -+--------------+ --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | 
-+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_multi_expr_one_chunk.sql deleted file mode 100644 index 5295c53055..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.sql +++ /dev/null @@ -1,61 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteMultiExprsOneChunk - --- select * -SELECT * from cpu order by bar, foo, time; - -SELECT time, bar from cpu order by time, bar; - -SELECT bar from cpu order by bar; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT count(*) from cpu; - -SELECT min(bar) from cpu; - --- IOX_COMPARE: sorted -SELECT foo from cpu; - -SELECT min(foo) as min_foo from cpu order by min_foo; -SELECT max(foo) as max_foo from cpu order by max_foo; - -SELECT min(foo) as min_foo from cpu group by time order by min_foo; 
-SELECT max(foo) as max_foo from cpu group by time order by max_foo; -SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo; - -SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -SELECT max(foo) as max_foo from cpu group by time order by max_foo; - -SELECT min(time) as min_time from cpu order by min_time; -SELECT max(time) as max_time from cpu order by max_time; - -SELECT min(time) as min_time from cpu group by bar order by min_time; -SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -SELECT max(time) as max_time from cpu group by foo order by max_time; -SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; - --- IOX_COMPARE: sorted -SELECT time from cpu; - -SELECT max(bar) from cpu order by 1; - --------------------------------------------------------- --- With selection predicate - -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.expected b/query_tests/cases/in/delete_simple_pred_one_chunk.expected deleted file mode 100644 index f367cdefef..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.expected +++ /dev/null @@ -1,91 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunk --- SQL: SELECT * from cpu; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 
1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT min(bar), max(bar) from cpu; -+--------------+--------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | -+--------------+--------------+ -| 2 | 2 | -+--------------+--------------+ --- SQL: SELECT time from cpu; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; -+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT min(time) from cpu group by bar; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) from cpu group by bar; -+-----+--------------------------------+ -| bar | MIN(cpu.time) | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT count(time), max(time) from cpu; -+-----------------+--------------------------------+ -| COUNT(cpu.time) | MAX(cpu.time) | -+-----------------+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000020Z | -+-----------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 1 | -+-----------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | 
-+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT * from cpu where bar = 2.0; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar != 2.0; -++ -++ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| 0 | 0 | 0 | | | | | 
-+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ --- SQL: SELECT time from cpu where bar=2; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar from cpu where bar!= 2; -++ -++ diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.sql b/query_tests/cases/in/delete_simple_pred_one_chunk.sql deleted file mode 100644 index 7b22641c63..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.sql +++ /dev/null @@ -1,37 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunk - --- select * -SELECT * from cpu; - -SELECT time, bar from cpu; - -SELECT min(bar), max(bar) from cpu; - -SELECT time from cpu; - -SELECT max(time) from cpu; -SELECT min(time) from cpu group by bar; -SELECT bar, min(time) from cpu group by bar; - -SELECT count(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; - ----------------------------------------------------------------- --- Now add selection predicate -SELECT * from cpu where bar = 2.0; - -SELECT * from cpu where bar != 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; - -SELECT time from cpu where bar=2; - -SELECT bar from cpu where bar!= 2; - - diff --git a/query_tests/cases/in/delete_three_chunks_1.expected b/query_tests/cases/in/delete_three_chunks_1.expected deleted file mode 100644 index 47ec3d3de4..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.expected +++ /dev/null @@ -1,85 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu order by foo, bar, time; 
-+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 1 | -| 1 | -| 3 | -| 4 | -| 5 | -| 7 | -+-----+ --- SQL: SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; -+---+---+---+----+--------------------------------+--------------------------------+ -| t | c | b | mi | mt | mat | -+---+---+---+----+--------------------------------+--------------------------------+ -| 7 | 7 | 7 | 1 | 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+---+---+---+----+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 7 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 7 | -+----------------+ --- 
SQL: SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ diff --git a/query_tests/cases/in/delete_three_chunks_1.sql b/query_tests/cases/in/delete_three_chunks_1.sql deleted file mode 100644 index c0105412e9..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.sql +++ /dev/null @@ -1,23 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --- select * -SELECT * from cpu order by foo, bar, time; - -SELECT time, bar from cpu order by bar, time; - -SELECT bar from cpu order by bar; - -SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT count(*) from cpu; - -SELECT min(bar) from cpu; - -SELECT foo from cpu order by foo; diff --git a/query_tests/cases/in/delete_three_chunks_2.expected b/query_tests/cases/in/delete_three_chunks_2.expected deleted file mode 100644 index 99fda88e70..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.expected +++ /dev/null @@ -1,77 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT min(foo) from cpu; -+--------------+ -| MIN(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT max(foo) from cpu; -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| you | -+--------------+ --- SQL: SELECT min(time) from cpu; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; 
-+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT foo, min(time) from cpu group by foo; --- Results After Sorting -+-----+--------------------------------+ -| foo | MIN(cpu.time) | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000070Z | -+-----+--------------------------------+ --- SQL: SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -+-----+--------------------------------+ -| bar | max_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000062Z | -| 3 | 1970-01-01T00:00:00.000000070Z | -| 4 | 1970-01-01T00:00:00.000000050Z | -| 5 | 1970-01-01T00:00:00.000000060Z | -| 7 | 1970-01-01T00:00:00.000000080Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by bar order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z | -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT time from cpu order by time; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -| 1970-01-01T00:00:00.000000042Z | -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z | -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 7 | -+--------------+ --- SQL: SELECT min(time), max(time) from cpu; -+--------------------------------+--------------------------------+ -| MIN(cpu.time) | MAX(cpu.time) | 
-+--------------------------------+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+--------------------------------+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_2.sql b/query_tests/cases/in/delete_three_chunks_2.sql deleted file mode 100644 index bb35711393..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - -SELECT min(foo) from cpu; -SELECT max(foo) from cpu; - -SELECT min(time) from cpu; -SELECT max(time) from cpu; - --- IOX_COMPARE: sorted -SELECT foo, min(time) from cpu group by foo; -SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -SELECT max(time) as max_time from cpu group by bar order by max_time; - -SELECT time from cpu order by time; - -SELECT max(bar) from cpu; - -SELECT min(time), max(time) from cpu; diff --git a/query_tests/cases/in/delete_three_chunks_3.expected b/query_tests/cases/in/delete_three_chunks_3.expected deleted file mode 100644 index 3e0c5fb2f6..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.expected +++ /dev/null @@ -1,76 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar != 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 
1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar = 1 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; -++ -++ --- SQL: SELECT min(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 6 | -+----------------+ --- SQL: SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(cpu.time) | 
-+-----------------+ -| 6 | -+-----------------+ --- SQL: SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 6 | -+-----------------+ diff --git a/query_tests/cases/in/delete_three_chunks_3.sql b/query_tests/cases/in/delete_three_chunks_3.sql deleted file mode 100644 index 146fcaf95e..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.sql +++ /dev/null @@ -1,27 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --------------------------------------------------------- --- With selection predicate - -SELECT * from cpu where bar != 1.0 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; - -SELECT * from cpu where bar = 1 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; - -SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; - -SELECT min(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); diff --git a/query_tests/cases/in/delete_three_chunks_4.expected b/query_tests/cases/in/delete_three_chunks_4.expected deleted file mode 100644 index 2283d15375..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.expected +++ /dev/null @@ -1,49 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 
1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 3 | 1970-01-01T00:00:00.000000070Z | -+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_4.sql b/query_tests/cases/in/delete_three_chunks_4.sql deleted file mode 100644 index 95442f6b07..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - ----------- -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu 
where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected deleted file mode 100644 index 6871fa7358..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected +++ /dev/null @@ -1,34 +0,0 @@ --- Test Setup: TwoDeletesMultiExprsOneChunk --- SQL: SELECT * from cpu; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu; -+-----+ -| foo | -+-----+ -| me | -+-----+ --- SQL: SELECT * from cpu where cast(time as bigint) > 30; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT count(bar) from cpu where cast(time as bigint) > 30; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 1 | -+----------------+ --- SQL: SELECT * from cpu where cast(time as bigint) > 40; -++ -++ --- SQL: SELECT max(time) from cpu where cast(time as bigint) > 40; -+---------------+ -| MAX(cpu.time) | -+---------------+ -| | -+---------------+ diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql deleted file mode 100644 index 132d6f42cf..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: TwoDeletesMultiExprsOneChunk - --- select * -SELECT * from cpu; - -SELECT foo 
from cpu; - -SELECT * from cpu where cast(time as bigint) > 30; - -SELECT count(bar) from cpu where cast(time as bigint) > 30; - -SELECT * from cpu where cast(time as bigint) > 40; - -SELECT max(time) from cpu where cast(time as bigint) > 40; diff --git a/query_tests/src/cases.rs b/query_tests/src/cases.rs index 9946819fac..69caf0dfe5 100644 --- a/query_tests/src/cases.rs +++ b/query_tests/src/cases.rs @@ -1,8 +1,7 @@ - //! This file is auto generated by query_tests/generate. //! Do not edit manually --> will result in sadness -use std::path::Path; use crate::runner::Runner; +use std::path::Path; #[tokio::test] // Tests from "basic.sql", @@ -11,141 +10,8 @@ async fn test_cases_basic_sql() { let input_path = Path::new("cases").join("in").join("basic.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_all.sql", -async fn test_cases_delete_all_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_all.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_multi_expr_one_chunk.sql", -async fn test_cases_delete_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_simple_pred_one_chunk.sql", -async fn test_cases_delete_simple_pred_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_simple_pred_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - 
.expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_1.sql", -async fn test_cases_delete_three_chunks_1_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_1.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_2.sql", -async fn test_cases_delete_three_chunks_2_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_2.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_3.sql", -async fn test_cases_delete_three_chunks_3_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_3.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_4.sql", -async fn test_cases_delete_three_chunks_4_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_4.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_two_del_multi_expr_one_chunk.sql", -async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_two_del_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner 
- .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -153,15 +19,12 @@ async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { async fn test_cases_duplicates_ingester_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("duplicates_ingester.sql"); + let input_path = Path::new("cases") + .join("in") + .join("duplicates_ingester.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -171,13 +34,8 @@ async fn test_cases_duplicates_parquet_sql() { let input_path = Path::new("cases").join("in").join("duplicates_parquet.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -185,15 +43,12 @@ async fn test_cases_duplicates_parquet_sql() { async fn test_cases_new_sql_system_tables_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("new_sql_system_tables.sql"); + let input_path = Path::new("cases") + .join("in") + .join("new_sql_system_tables.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -203,13 +58,8 @@ async fn test_cases_pushdown_sql() { let input_path = Path::new("cases").join("in").join("pushdown.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + 
runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -219,13 +69,8 @@ async fn test_cases_selectors_sql() { let input_path = Path::new("cases").join("in").join("selectors.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -235,13 +80,8 @@ async fn test_cases_several_chunks_sql() { let input_path = Path::new("cases").join("in").join("several_chunks.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -249,15 +89,12 @@ async fn test_cases_several_chunks_sql() { async fn test_cases_sql_information_schema_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("sql_information_schema.sql"); + let input_path = Path::new("cases") + .join("in") + .join("sql_information_schema.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -267,13 +104,8 @@ async fn test_cases_timestamps_sql() { let input_path = Path::new("cases").join("in").join("timestamps.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -283,13 +115,8 @@ async fn test_cases_two_chunks_sql() { let input_path = Path::new("cases").join("in").join("two_chunks.sql"); let mut runner = 
Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -297,13 +124,10 @@ async fn test_cases_two_chunks_sql() { async fn test_cases_two_chunks_missing_columns_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("two_chunks_missing_columns.sql"); + let input_path = Path::new("cases") + .join("in") + .join("two_chunks_missing_columns.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} \ No newline at end of file + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); +} diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index eecb583e6b..8d7339dafc 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -56,8 +56,6 @@ async fn test_field_columns_no_predicate() { run_field_columns_test_case(TwoMeasurementsManyFields {}, predicate, expected_fields).await; } -// NGA todo: add delete tests when the TwoMeasurementsManyFieldsWithDelete available - #[tokio::test] async fn test_field_columns_with_pred() { // get only fields from h20 (but both chunks) @@ -201,86 +199,6 @@ async fn test_field_name_plan() { run_field_columns_test_case(OneMeasurementManyFields {}, predicate, expected_fields).await; } -#[tokio::test] -async fn test_field_name_plan_with_delete() { - test_helpers::maybe_start_logging(); - - let predicate = Predicate::default().with_range(0, 2000); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - Field { - name: "field2".into(), - data_type: 
DataType::Utf8, - last_timestamp: 100, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - -#[tokio::test] -async fn test_field_name_plan_with_delete_all_time() { - test_helpers::maybe_start_logging(); - - let predicate = Predicate::default(); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 0, // all time queries are optimized but do not return timestamps - }, - Field { - name: "field2".into(), - data_type: DataType::Utf8, - last_timestamp: 0, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 0, - }, - Field { - name: "field4".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - Field { - name: "field5".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - #[tokio::test] async fn list_field_columns_all_time() { let predicate = Predicate::default().with_range(MIN_NANO_TIME, MAX_NANO_TIME); diff --git a/query_tests/src/influxrpc/read_filter.rs b/query_tests/src/influxrpc/read_filter.rs index c0485f42aa..7f32084f46 100644 --- a/query_tests/src/influxrpc/read_filter.rs +++ b/query_tests/src/influxrpc/read_filter.rs @@ -4,15 +4,13 @@ use std::sync::Arc; #[cfg(test)] use crate::scenarios::{ DbScenario, DbSetup, EndToEndTest, TwoMeasurements, TwoMeasurementsManyFields, - TwoMeasurementsWithDelete, TwoMeasurementsWithDeleteAll, }; use crate::{ db::AbstractDb, influxrpc::util::run_series_set_plan_maybe_error, scenarios::{ MeasurementStatusCode, MeasurementsForDefect2845, MeasurementsSortableTags, - MeasurementsSortableTagsWithDelete, 
TwoMeasurementsMultiSeries, - TwoMeasurementsMultiSeriesWithDelete, TwoMeasurementsMultiSeriesWithDeleteAll, + TwoMeasurementsMultiSeries, }, }; use datafusion::{ @@ -205,12 +203,12 @@ async fn test_read_filter_invalid_predicate_case() { #[tokio::test] async fn test_read_filter_unknown_column_in_predicate() { let predicate = Predicate::new() - // mystery_region is not a real column, so this predicate is + // mystery_region and bar are not real columns, so this predicate is // invalid but IOx should be able to handle it (and produce no results) .with_expr( - col("baz") - .eq(lit(4i32)) - .or(col("bar").and(col("mystery_region").gt(lit(5i32)))), + col("baz").eq(lit(4i32)).or(col("bar") + .eq(lit("baz")) + .and(col("mystery_region").gt(lit(5i32)))), ); let predicate = InfluxRpcPredicate::new(None, predicate); @@ -220,39 +218,6 @@ async fn test_read_filter_unknown_column_in_predicate() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete() { - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete_all() { - // nothing from h2o table because all rows were deleted - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: 
[100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter() { // filter out one row in h20 @@ -281,58 +246,6 @@ async fn test_read_filter_data_filter() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_filter_with_delete() { - // filter out one row in h20 but the leftover row was deleted to nothing will be returned - let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").eq(lit("CA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results.clone(), - ) - .await; - - // Same results via a != predicate. 
- let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").not_eq(lit("MA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Use different predicate to have data returned - let predicate = Predicate::default() - .with_range(100, 300) - .with_expr(col("state").eq(lit("MA"))) // state=MA - .with_expr(col("_measurement").eq(lit("h2o"))); - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter_fields() { // filter out one row in h20 @@ -350,8 +263,6 @@ async fn test_read_filter_data_filter_fields() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -// NGA todo: add delete tests here after we have delete scenarios for 2 chunks for 1 table - #[tokio::test] async fn test_read_filter_data_filter_measurement_pred() { // use an expr on table name to pick just the last row from o2 @@ -378,16 +289,6 @@ async fn test_read_filter_data_pred_refers_to_non_existent_column() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_non_existent_column_with_delete() { - let predicate = Predicate::default().with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - #[tokio::test] async fn test_read_filter_data_pred_no_columns() { // 
predicate with no columns, @@ -402,59 +303,6 @@ async fn test_read_filter_data_pred_no_columns() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=user, _measurement=cpu, region=west}\n FloatPoints timestamps: [100], values: [23.2]", - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete_all() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Only table disk has no deleted data - let expected_results = vec![ - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_good_and_non_existent_columns() { - // predicate with both a column that does and does not appear - let predicate = Predicate::default() - .with_expr(col("state").eq(lit("MA"))) - .with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case( - TwoMeasurements {}, - predicate.clone(), - expected_results.clone(), - ) - .await; - run_read_filter_test_case( - TwoMeasurementsWithDelete {}, - predicate.clone(), - expected_results.clone(), - ) - .await; - 
run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_match() { let predicate = Predicate::default() @@ -487,50 +335,6 @@ async fn test_read_filter_data_pred_using_regex_match_on_field() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_using_regex_match_with_delete() { - let predicate = Predicate::default() - .with_range(200, 300) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // the selected row was soft deleted - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Different predicate to have data returned - let predicate = Predicate::default() - .with_range(200, 400) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // Try same predicate but on delete_all data - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_not_match() { let predicate = Predicate::default() @@ -600,45 +404,6 @@ async fn test_read_filter_data_pred_unsupported_in_scan() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_unsupported_in_scan_with_delete() { - 
test_helpers::maybe_start_logging(); - - // These predicates can't be pushed down into chunks, but they can - // be evaluated by the general purpose DataFusion plan - - // (STATE = 'CA') OR (READING > 0) - let predicate = - Predicate::default().with_expr(col("state").eq(lit("CA")).or(col("reading").gt(lit(0)))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Note these results include data from both o2 and h2o - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // With delete all from h2o, no rows from h2p should be returned - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_plan_order() { test_helpers::maybe_start_logging(); @@ -659,25 +424,6 @@ async fn test_read_filter_data_plan_order() { .await; } -#[tokio::test] -async fn test_read_filter_data_plan_order_with_delete() { - test_helpers::maybe_start_logging(); - let expected_results = vec![ - "Series tags={_field=other, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [250], values: [5.0]", - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints 
timestamps: [250], values: [70.5]", - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA, zz_tag=A}\n FloatPoints timestamps: [1000], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=A}\n FloatPoints timestamps: [800], values: [70.1]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=B}\n FloatPoints timestamps: [100], values: [70.2]", - ]; - - run_read_filter_test_case( - MeasurementsSortableTagsWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_filter_on_value() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/read_group.rs b/query_tests/src/influxrpc/read_group.rs index 25a0be0732..8867710b65 100644 --- a/query_tests/src/influxrpc/read_group.rs +++ b/query_tests/src/influxrpc/read_group.rs @@ -5,7 +5,6 @@ use crate::{ AnotherMeasurementForAggs, DbScenario, DbSetup, MeasurementForDefect2691, MeasurementForGroupByField, MeasurementForGroupKeys, MeasurementForMax, MeasurementForMin, MeasurementForSelectors, OneMeasurementForAggs, OneMeasurementNoTags2, - OneMeasurementNoTagsWithDelete, OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk, TwoMeasurementForAggs, TwoMeasurementsManyFields, TwoMeasurementsManyFieldsOneChunk, }, }; @@ -93,75 +92,6 @@ async fn test_read_group_data_no_tag_columns() { .await; } -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n IntegerPoints timestamps: [2], values: [1]", - ]; - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn 
test_read_group_data_no_tag_columns_min_with_delete() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n FloatPoints timestamps: [2], values: [2.0]", - ]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete_all() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_min_with_delete_all() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_group_data_pred() { let predicate = Predicate::default() diff --git a/query_tests/src/influxrpc/read_window_aggregate.rs b/query_tests/src/influxrpc/read_window_aggregate.rs index d1547dd6d2..1c3d1b44a0 100644 --- a/query_tests/src/influxrpc/read_window_aggregate.rs +++ b/query_tests/src/influxrpc/read_window_aggregate.rs @@ -170,47 +170,6 @@ async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let predicate = 
InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Min; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000011], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000001, 1609459201000000024], values: [1.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000009, 1609459201000000015], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000002], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - // See https://github.com/influxdata/influxdb_iox/issues/2697 #[tokio::test] async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697() { @@ -276,50 +235,6 @@ async fn test_grouped_series_set_plan_group_aggregate_filter_on_field() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Sum; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - - // The windowed aggregate is using a non-selector aggregate (SUM, COUNT, 
MEAD). - // For each distinct series the window defines the `time` column - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000020], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000030], values: [4.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000020], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_window_aggregate_overflow() { let predicate = Predicate::default().with_range(1609459201000000001, 1609459201000000024); diff --git a/query_tests/src/influxrpc/table_names.rs b/query_tests/src/influxrpc/table_names.rs index e18710d099..c7f23c3cd1 100644 --- a/query_tests/src/influxrpc/table_names.rs +++ b/query_tests/src/influxrpc/table_names.rs @@ -100,106 +100,31 @@ async fn list_table_names_no_non_null_general_data_passes() { run_table_names_test_case(TwoMeasurementsManyFields {}, predicate, vec![]).await; } -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - InfluxRpcPredicate::default(), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete_all() { - run_table_names_test_case( - TwoMeasurementsWithDeleteAll {}, - InfluxRpcPredicate::default(), - vec!["disk"], - ) - .await; -} - 
#[tokio::test] async fn list_table_names_data_pred_0_201() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 201), vec!["cpu", "disk"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - tsp(0, 201), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 201), vec!["disk"]).await; -} - #[tokio::test] async fn list_table_names_data_pred_0_200() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 200), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(0, 200), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 200), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_50_101() { run_table_names_test_case(TwoMeasurements {}, tsp(50, 101), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(50, 101), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(50, 101), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_101_160() { run_table_names_test_case(TwoMeasurements {}, tsp(101, 160), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(101, 160), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(101, 160), vec![]).await; -} - 
#[tokio::test] async fn list_table_names_data_pred_250_300() { run_table_names_test_case(TwoMeasurements {}, tsp(250, 300), vec![]).await; } -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(250, 300), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(250, 300), vec![]).await; -} - #[tokio::test] async fn list_table_names_max_time_included() { run_table_names_test_case( diff --git a/query_tests/src/influxrpc/tag_keys.rs b/query_tests/src/influxrpc/tag_keys.rs index da21ca52d7..a15672fde0 100644 --- a/query_tests/src/influxrpc/tag_keys.rs +++ b/query_tests/src/influxrpc/tag_keys.rs @@ -169,24 +169,6 @@ async fn list_tag_name_end_to_end() { run_tag_keys_test_case(EndToEndTest {}, predicate, expected_tag_keys).await; } -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete_and_pred() { - let predicate = Predicate::default() - .with_range(0, 10000) - .with_expr(col("host").eq(lit("server01"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "region"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete() { - let predicate = Predicate::default().with_expr(col("_measurement").eq(lit("swap"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "name"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - #[tokio::test] async fn list_tag_name_max_time() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/tag_values.rs b/query_tests/src/influxrpc/tag_values.rs index 7a99ab59e7..0e9e2c532a 100644 --- a/query_tests/src/influxrpc/tag_values.rs +++ b/query_tests/src/influxrpc/tag_values.rs @@ -80,32 
+80,6 @@ async fn list_tag_values_no_predicate_state_col() { .await; } -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete() { - let tag_name = "state"; - let expected_tag_keys = vec!["CA", "MA"]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDelete {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete_all() { - let tag_name = "state"; - let expected_tag_keys = vec![]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDeleteAll {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - #[tokio::test] async fn list_tag_values_no_predicate_city_col() { let tag_name = "city"; diff --git a/query_tests/src/scenarios/library.rs b/query_tests/src/scenarios/library.rs index 5e52a1ba9c..f3fb22c756 100644 --- a/query_tests/src/scenarios/library.rs +++ b/query_tests/src/scenarios/library.rs @@ -6,7 +6,6 @@ use super::{ }; use crate::scenarios::util::{make_n_chunks_scenario, ChunkData}; use async_trait::async_trait; -use data_types::{DeleteExpr, DeletePredicate, Op, Scalar, TimestampRange}; use iox_query::frontend::sql::SqlQueryPlanner; #[derive(Debug)] @@ -83,82 +82,6 @@ impl DbSetup for OneMeasurementManyNullTags { } } -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 400 <= time <= 602 and state=NY - // 3 rows of h2o & NY state will be deleted - let 
delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(400, 602), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("NY").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDeleteAll {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 100 <= time <= 602 - // all rows of h2o will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 602), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// Two measurements data in different chunk scenarios #[derive(Debug)] pub struct TwoMeasurements {} @@ -177,85 +100,6 @@ impl DbSetup for TwoMeasurements { } } -/// Two measurements data in different chunk scenarios -/// with one delete applied at different stages of the chunk -#[derive(Debug)] -pub struct TwoMeasurementsWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // delete 1 row from cpu with timestamp 150 - let table_name = 
"cpu"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk(vec![&pred], vec![], lp_lines, table_name, partition_key).await - } -} - -/// Two measurements data in different chunk scenarios -/// with 2 deletes that remove all data from one table -#[derive(Debug)] -pub struct TwoMeasurementsWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // which will delete second row of the cpu - let table_name = "cpu"; - let pred1 = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // delete the first row of the cpu - let pred2 = DeletePredicate { - range: TimestampRange::new(0, 110), - exprs: vec![], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk( - vec![&pred1], - vec![&pred2], - lp_lines, - table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsUnsignedType {} #[async_trait] @@ -710,44 +554,6 @@ impl DbSetup for OneMeasurementManyFields { all_scenarios_for_one_chunk(vec![], vec![], lp_lines, "h2o", partition_key).await } } - -#[derive(Debug)] -pub struct OneMeasurementManyFieldsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyFieldsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - 
// Order this so field3 comes before field2 - // (and thus the columns need to get reordered) - let lp_lines = vec![ - "h2o,tag1=foo,tag2=bar field1=70.6,field3=2 100", - "h2o,tag1=foo,tag2=bar field1=70.4,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.5,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.6,field4=true 1000", - "h2o,tag1=foo,tag2=bar field1=70.3,field5=false 3000", - ]; - - // pred: delete from h2o where 1000 <= time <= 1100 - // 1 rows of h2o with timestamp 1000 will be deleted which means - // field4 no longer available - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(1000, 1100), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// This data (from end to end test) #[derive(Debug)] pub struct EndToEndTest {} @@ -772,48 +578,6 @@ impl DbSetup for EndToEndTest { } } -#[derive(Debug)] -pub struct EndToEndTestWithDelete {} -#[async_trait] -impl DbSetup for EndToEndTestWithDelete { - async fn make(&self) -> Vec { - let lp_lines = vec![ - "cpu_load_short,host=server01,region=us-west value=0.64 0000", - "cpu_load_short,host=server01 value=27.99 1000", - "cpu_load_short,host=server02,region=us-west value=3.89 2000", - "cpu_load_short,host=server01,region=us-east value=1234567.891011 3000", - "cpu_load_short,host=server01,region=us-west value=0.000003 4000", - "system,host=server03 uptime=1303385 5000", - "swap,host=server01,name=disk0 in=3,out=4 6000", - "status active=t 7000", - "attributes color=\"blue\" 8000", - ]; - - let partition_key = "1970-01-01T00"; - - // pred: delete from swap where 6000 <= time <= 6000 and name=disk0 - // 1 rows of swap with name=disk0 will be deleted - let delete_table_name = "swap"; - let pred = DeletePredicate { - range: TimestampRange::new(6000, 6000), - exprs: vec![DeleteExpr::new( - "name".to_string(), - Op::Eq, - Scalar::String(("disk0").to_string()), - )], - }; - 
- all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsMultiSeries {} #[async_trait] @@ -838,84 +602,6 @@ impl DbSetup for TwoMeasurementsMultiSeries { } } -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - // Swap around data is not inserted in series order - lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // pred: delete from h2o where 120 <= time <= 250 - // 2 rows of h2o with timestamp 200 and 350 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 250), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - // Swap around data is not inserted in series order 
- lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // Delete all data form h2o - // pred: delete from h20 where 100 <= time <= 360 - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 360), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct MeasurementStatusCode {} #[async_trait] impl DbSetup for MeasurementStatusCode { @@ -950,44 +636,6 @@ impl DbSetup for MeasurementsSortableTags { } } -#[derive(Debug)] -pub struct MeasurementsSortableTagsWithDelete {} -#[async_trait] -impl DbSetup for MeasurementsSortableTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,zz_tag=A,state=MA,city=Kingston temp=70.1 800", - "h2o,state=MA,city=Kingston,zz_tag=B temp=70.2 100", - "h2o,state=CA,city=Boston temp=70.3 250", // soft deleted - "h2o,state=MA,city=Boston,zz_tag=A temp=70.4 1000", - "h2o,state=MA,city=Boston temp=70.5,other=5.0 250", - ]; - - // pred: delete from h2o where 120 <= time <= 350 and state=CA - // 1 rows of h2o with timestamp 250 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 350), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("CA").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - // See issue: https://github.com/influxdata/influxdb_iox/issues/2845 #[derive(Debug)] pub struct MeasurementsForDefect2845 {} @@ -1019,65 +667,6 @@ impl DbSetup for OneMeasurementNoTags2 { } } -pub struct OneMeasurementNoTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", "m0 foo=2.0 2"]; - - // pred: delete from m0 where 
1 <= time <= 1 and foo=1.0 - // 1 row of m0 with timestamp 1 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1), - exprs: vec![DeleteExpr::new( - "foo".to_string(), - Op::Eq, - Scalar::F64((1.0).into()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -/// This will create many scenarios: some have a chunk with soft deleted data, some have no chunks -/// because there is no point to create compacted chunks with all deleted data. -pub struct OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", "m0 foo=2.0 2"]; - - // pred: delete from m0 where 1 <= time <= 2 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 2), - exprs: vec![], - }; - - // Apply predicate before the chunk is moved if any. 
There will be scenarios without chunks - // as a consequence of not-compacting-deleted-data - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct OneMeasurementForAggs {} #[async_trait] impl DbSetup for OneMeasurementForAggs { @@ -1310,65 +899,6 @@ impl DbSetup for MeasurementForDefect2697 { } } -pub struct MeasurementForDefect2697WithDelete {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDelete { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - "mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1609459201000000022 <= time <= 1609459201000000022 - // 1 row of m0 with timestamp 1609459201000000022 (section=2b bar=1.2) - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1609459201000000022, 1609459201000000022), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - -pub struct MeasurementForDefect2697WithDeleteAll {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - 
"mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1 <= time <= 1609459201000000031 - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1609459201000000031), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - // Test data to validate fix for: // https://github.com/influxdata/influxdb_iox/issues/2890 pub struct MeasurementForDefect2890 {} diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index f9a687f03a..477503504b 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -14,12 +14,9 @@ use generated_types::{ }; use influxdb_iox_client::flight::{low_level::LowLevelMessage, Error as FlightError}; use ingester::{ - data::{ - partition::resolver::CatalogPartitionResolver, FlatIngesterQueryResponse, IngesterData, - IngesterQueryResponse, Persister, - }, - lifecycle::mock_handle::NoopLifecycleHandle, - querier_handler::prepare_data_to_querier, + data::{partition::resolver::CatalogPartitionResolver, IngesterData, Persister}, + lifecycle::mock_handle::MockLifecycleHandle, + querier_handler::{prepare_data_to_querier, FlatIngesterQueryResponse, IngesterQueryResponse}, }; use iox_catalog::interface::get_schema_by_name; use iox_query::exec::{Executor, ExecutorConfig}; @@ -722,7 +719,7 @@ impl MockIngester { /// Takes `&self mut` because our partioning implementation does not work with concurrent /// access. 
async fn buffer_operation(&mut self, dml_operation: DmlOperation) { - let lifecycle_handle = NoopLifecycleHandle {}; + let lifecycle_handle = MockLifecycleHandle::default(); let should_pause = self .ingester_data @@ -752,7 +749,32 @@ impl MockIngester { .map(|f| f.id) .collect(); - self.ingester_data.persist(*partition_id).await; + let p = self + .catalog + .catalog + .repositories() + .await + .partitions() + .get_by_id(*partition_id) + .await + .unwrap() + .expect("partition not found"); + + let namespace_id = self + .catalog + .catalog + .repositories() + .await + .tables() + .get_by_id(p.table_id) + .await + .unwrap() + .expect("table does not exist") + .namespace_id; + + self.ingester_data + .persist(p.shard_id, namespace_id, p.table_id, *partition_id) + .await; result.extend( self.catalog @@ -1023,9 +1045,6 @@ impl QueryDataAdapter { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }, ), diff --git a/query_tests/src/table_schema.rs b/query_tests/src/table_schema.rs index f01a1b8b7d..359ba1ce49 100644 --- a/query_tests/src/table_schema.rs +++ b/query_tests/src/table_schema.rs @@ -38,7 +38,7 @@ async fn run_table_schema_test_case( let ctx = db.new_query_context(None); let chunks = db - .chunks(table_name, &Default::default(), ctx) + .chunks(table_name, &Default::default(), &None, ctx) .await .expect("error getting chunks"); for chunk in chunks { diff --git a/router/Cargo.toml b/router/Cargo.toml index 7b655f9f91..d19ecf8b4d 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -20,7 +20,7 @@ metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" predicate = { 
path = "../predicate" } @@ -47,7 +47,7 @@ pretty_assertions = "1.3.0" rand = "0.8.3" schema = { path = "../schema" } test_helpers = { version = "0.1.0", path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = { version = "0.1.10", default_features = false, features = [] } +tokio-stream = { version = "0.1.11", default_features = false, features = [] } [lib] # Allow --save-baseline to work diff --git a/service_common/src/planner.rs b/service_common/src/planner.rs index 6431963aad..e1bc5adf71 100644 --- a/service_common/src/planner.rs +++ b/service_common/src/planner.rs @@ -60,7 +60,7 @@ impl Planner { planner .table_names(database, predicate) .await - .map_err(|e| Error::Plan(format!("table_names error: {}", e))) + .map_err(|e| e.to_df_error("table_names")) }) .await } @@ -82,7 +82,7 @@ impl Planner { planner .tag_keys(database, predicate) .await - .map_err(|e| Error::Plan(format!("tag_keys error: {}", e))) + .map_err(|e| e.to_df_error("tag_keys")) }) .await } @@ -106,7 +106,7 @@ impl Planner { planner .tag_values(database, &tag_name, predicate) .await - .map_err(|e| Error::Plan(format!("tag_values error: {}", e))) + .map_err(|e| e.to_df_error("tag_values")) }) .await } @@ -128,7 +128,7 @@ impl Planner { planner .field_columns(database, predicate) .await - .map_err(|e| Error::Plan(format!("field_columns error: {}", e))) + .map_err(|e| e.to_df_error("field_columns")) }) .await } @@ -150,7 +150,7 @@ impl Planner { planner .read_filter(database, predicate) .await - .map_err(|e| Error::Plan(format!("read_filter error: {}", e))) + .map_err(|e| e.to_df_error("read_filter")) }) .await } @@ -174,7 +174,7 @@ impl Planner { planner .read_group(database, predicate, agg, &group_columns) .await - .map_err(|e| Error::Plan(format!("read_group error: {}", e))) + .map_err(|e| e.to_df_error("read_group")) }) .await } @@ -199,7 +199,7 @@ impl Planner { planner .read_window_aggregate(database, predicate, agg, every, offset) .await - .map_err(|e| 
Error::Plan(format!("read_window_aggregate error: {}", e))) + .map_err(|e| e.to_df_error("read_window_aggregate")) }) .await } diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index 172e89b560..b9999514e1 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -26,7 +26,7 @@ futures = "0.3" pin-project = "1.0" prost = "0.11" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tonic = "0.8" diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index f88ce0d184..f4d84266e6 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -9,7 +9,7 @@ use arrow_flight::{ use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use bytes::{Bytes, BytesMut}; use data_types::{DatabaseName, DatabaseNameError}; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan}; use futures::{SinkExt, Stream, StreamExt}; use generated_types::influxdata::iox::querier::v1 as proto; use iox_query::{ @@ -54,7 +54,7 @@ pub enum Error { ))] Query { database_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Invalid database name: {}", source))] @@ -91,29 +91,40 @@ impl From for tonic::Status { Error::Optimize { .. } | Error::Planning { .. } | Error::Serialization { .. } => warn!(?err, msg), } - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - use tonic::Status; - match &self { - Self::InvalidTicket { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidTicketLegacy { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidQuery { .. 
} => Status::invalid_argument(self.to_string()), - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::Query { .. } => Status::internal(self.to_string()), - Self::InvalidDatabaseName { .. } => Status::invalid_argument(self.to_string()), - Self::Planning { - source: service_common::planner::Error::External(_), - } => Status::internal(self.to_string()), - Self::Planning { .. } => Status::invalid_argument(self.to_string()), - Self::Optimize { .. } => Status::internal(self.to_string()), - Self::Serialization { .. } => Status::internal(self.to_string()), - } + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::InvalidTicket { .. } + | Self::InvalidTicketLegacy { .. } + | Self::InvalidQuery { .. } + | Self::InvalidDatabaseName { .. } => tonic::Code::InvalidArgument, + Self::Planning { source, .. } | Self::Query { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } + } + Self::Optimize { .. } | Self::Serialization { .. 
} => tonic::Code::Internal, + }; + + tonic::Status::new(code, msg) } } @@ -334,7 +345,6 @@ impl GetStream { let mut stream_record_batches = ctx .execute_stream(Arc::clone(&physical_plan)) .await - .map_err(|e| Box::new(e) as _) .context(QuerySnafu { database_name: &database_name, })?; @@ -382,7 +392,7 @@ impl GetStream { // failure sending here is OK because we're cutting the stream anyways tx.send(Err(Error::Query { database_name: database_name.clone(), - source: Box::new(e), + source: DataFusionError::ArrowError(e), } .into())) .await diff --git a/service_grpc_influxrpc/Cargo.toml b/service_grpc_influxrpc/Cargo.toml index ea4169e05c..00c5df645f 100644 --- a/service_grpc_influxrpc/Cargo.toml +++ b/service_grpc_influxrpc/Cargo.toml @@ -26,7 +26,7 @@ pin-project = "1.0" prost = "0.11" regex = "1.6.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tokio-stream = { version = "0.1", features = ["net"] } diff --git a/service_grpc_influxrpc/src/expr.rs b/service_grpc_influxrpc/src/expr.rs index 58a5806b4e..8da9cebc67 100644 --- a/service_grpc_influxrpc/src/expr.rs +++ b/service_grpc_influxrpc/src/expr.rs @@ -906,6 +906,7 @@ mod tests { let schema = SchemaBuilder::new() .tag("t1") .tag("t2") + .tag("host") .field("foo", DataType::Int64) .field("bar", DataType::Int64) .build() diff --git a/service_grpc_influxrpc/src/service.rs b/service_grpc_influxrpc/src/service.rs index f8d54e2d05..734f856b88 100644 --- a/service_grpc_influxrpc/src/service.rs +++ b/service_grpc_influxrpc/src/service.rs @@ -12,6 +12,7 @@ use crate::{ StorageService, }; use data_types::{org_and_bucket_to_database, DatabaseName}; +use datafusion::error::DataFusionError; use futures::Stream; use generated_types::{ google::protobuf::Empty, literal_or_regex::Value as RegexOrLiteralValue, @@ -54,43 +55,43 @@ pub enum Error { 
#[snafu(display("Error listing tables in database '{}': {}", db_name, source))] ListingTables { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing columns in database '{}': {}", db_name, source))] ListingColumns { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing fields in database '{}': {}", db_name, source))] ListingFields { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating series plans for database '{}': {}", db_name, source))] PlanningFilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating group plans for database '{}': {}", db_name, source))] PlanningGroupSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running series plans for database '{}': {}", db_name, source))] FilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running grouping plans for database '{}': {}", db_name, source))] GroupingSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -102,7 +103,7 @@ pub enum Error { ListingTagValues { db_name: String, tag_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error converting Predicate '{}: {}", rpc_predicate_string, source))] @@ -177,44 +178,56 @@ impl From for tonic::Status { /// status fn from(err: Error) -> Self { error!("Error handling gRPC request: {}", err); - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - match &self { - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::ListingTables { .. } => Status::internal(self.to_string()), - Self::ListingColumns { .. 
} => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::ListingTables { source, .. } + | Self::ListingColumns { source, .. } + | Self::ListingFields { source, .. } + | Self::PlanningFilteringSeries { source, .. } + | Self::PlanningGroupSeries { source, .. } + | Self::FilteringSeries { source, .. } + | Self::GroupingSeries { source, .. } + | Self::ListingTagValues { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } } - Self::ListingFields { .. } => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + Self::ConvertingPredicate { .. } + | Self::ConvertingReadGroupAggregate { .. } + | Self::ConvertingReadGroupType { .. } + | Self::ConvertingWindowAggregate { .. } + | Self::ConvertingTagKeyInTagValues { .. } + | Self::ComputingGroupedSeriesSet { .. } + | Self::ConvertingFieldList { .. } + | Self::MeasurementLiteralOrRegex { .. } + | Self::MissingTagKeyPredicate {} + | Self::InvalidTagKeyRegex { .. } => tonic::Code::InvalidArgument, + Self::SendingResults { .. } | Self::InternalHintsFieldNotSupported { .. } => { + tonic::Code::Internal } - Self::PlanningFilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::PlanningGroupSeries { .. } => Status::invalid_argument(self.to_string()), - Self::FilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::GroupingSeries { .. 
} => Status::invalid_argument(self.to_string()), - Self::ListingTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingPredicate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupAggregate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupType { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingWindowAggregate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingTagKeyInTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ComputingGroupedSeriesSet { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingFieldList { .. } => Status::invalid_argument(self.to_string()), - Self::SendingResults { .. } => Status::internal(self.to_string()), - Self::InternalHintsFieldNotSupported { .. } => Status::internal(self.to_string()), - Self::NotYetImplemented { .. } => Status::internal(self.to_string()), - Self::MeasurementLiteralOrRegex { .. } => Status::invalid_argument(self.to_string()), - Self::MissingTagKeyPredicate {} => Status::invalid_argument(self.to_string()), - Self::InvalidTagKeyRegex { .. } => Status::invalid_argument(self.to_string()), - } + Self::NotYetImplemented { .. } => tonic::Code::Unimplemented, + }; + + tonic::Status::new(code, msg) } } @@ -341,7 +354,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -423,7 +436,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? 
.into_iter() .map(Ok) .collect::>(); @@ -489,7 +502,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -560,7 +573,7 @@ where operation: "tag_value for a measurement, with general predicate" .to_string(), } - .to_status()); + .into_status()); } measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx).await @@ -593,7 +606,7 @@ where } }; - let response = response.map_err(|e| e.to_status()); + let response = response.map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -652,7 +665,7 @@ where let results = tag_values_grouped_by_measurement_and_tag_key_impl(Arc::clone(&db), db_name, req, &ctx) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -762,7 +775,7 @@ where let response = measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -833,7 +846,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -907,7 +920,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -981,9 +994,9 @@ where .map(|fieldlist| { fieldlist_to_measurement_fields_response(fieldlist) .context(ConvertingFieldListSnafu) - .map_err(|e| e.to_status()) + .map_err(|e| e.into_status()) }) - .map_err(|e| e.to_status())?; + .map_err(|e| e.into_status())?; if response.is_ok() { query_completed_token.set_success(); @@ -1048,13 +1061,11 @@ where let plan = Planner::new(ctx) .table_names(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTablesSnafu { db_name })?; let table_names = ctx .to_string_set(plan) .await - .map_err(|e| Box::new(e) 
as _) .context(ListingTablesSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1095,13 +1106,11 @@ where let tag_key_plan = Planner::new(ctx) .tag_keys(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; let tag_keys = ctx .to_string_set(tag_key_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1142,13 +1151,11 @@ where let tag_value_plan = Planner::new(ctx) .tag_values(db, tag_name, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, tag_name })?; let tag_values = ctx .to_string_set(tag_value_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, tag_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1266,14 +1273,12 @@ where let series_plan = Planner::new(ctx) .read_filter(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(PlanningFilteringSeriesSnafu { db_name })?; // Execute the plans. let series_or_groups = ctx .to_series_and_groups(series_plan) .await - .map_err(|e| Box::new(e) as _) .context(FilteringSeriesSnafu { db_name }) .log_if_error("Running series set plan")?; @@ -1319,9 +1324,8 @@ where .await } }; - let grouped_series_set_plan = grouped_series_set_plan - .map_err(|e| Box::new(e) as _) - .context(PlanningGroupSeriesSnafu { db_name })?; + let grouped_series_set_plan = + grouped_series_set_plan.context(PlanningGroupSeriesSnafu { db_name })?; // PERF - This used to send responses to the client before execution had // completed, but now it doesn't. 
We may need to revisit this in the future @@ -1331,7 +1335,6 @@ where let series_or_groups = ctx .to_series_and_groups(grouped_series_set_plan) .await - .map_err(|e| Box::new(e) as _) .context(GroupingSeriesSnafu { db_name }) .log_if_error("Running Grouped SeriesSet Plan")?; @@ -1370,13 +1373,11 @@ where let field_list_plan = Planner::new(ctx) .field_columns(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; let field_list = ctx .to_field_list(field_list_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; trace!(field_names=?field_list, "Field names response"); @@ -1801,11 +1802,13 @@ mod tests { // Note multiple tables / measureemnts: let chunk0 = TestChunk::new("m1") .with_id(0) + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2"); let chunk1 = TestChunk::new("m2") .with_id(1) + .with_tag_column("state") .with_tag_column("k3") .with_tag_column("k4"); @@ -1825,7 +1828,7 @@ mod tests { }; let actual_tag_keys = fixture.storage_client.tag_keys(request).await.unwrap(); - let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!(actual_tag_keys, expected_tag_keys,); @@ -1878,7 +1881,7 @@ mod tests { let response = fixture.storage_client.tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "TagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for measurement_tag_keys-- @@ -1897,6 +1900,7 @@ mod tests { .with_tag_column("k0"); let chunk1 = TestChunk::new("m4") + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2") .with_tag_column("k3") @@ -1926,7 +1930,7 @@ mod tests { .measurement_tag_keys(request) .await .unwrap(); - let expected_tag_keys = 
vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!( actual_tag_keys, expected_tag_keys, @@ -1984,7 +1988,7 @@ mod tests { let response = fixture.storage_client.measurement_tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "This is an error"); - grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for tag_values -- specifically that @@ -2173,7 +2177,8 @@ mod tests { "Error converting tag_key to UTF-8 in tag_values request" ); - grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 2); + grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1); } #[tokio::test] @@ -2524,7 +2529,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "server_error", 1); } #[tokio::test] @@ -2730,7 +2735,7 @@ mod tests { let response = fixture.storage_client.read_filter(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadFilter", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadFilter", "server_error", 1); } #[tokio::test] @@ -2822,7 +2827,7 @@ mod tests { .to_string(); assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadGroup", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadGroup", "server_error", 1); } #[tokio::test] @@ -2988,7 +2993,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - 
grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "server_error", 1); } #[tokio::test] diff --git a/service_grpc_object_store/Cargo.toml b/service_grpc_object_store/Cargo.toml index d25393c791..6a2bcac921 100644 --- a/service_grpc_object_store/Cargo.toml +++ b/service_grpc_object_store/Cargo.toml @@ -8,7 +8,7 @@ data_types = { path = "../data_types" } futures = "0.3" generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/test_fixtures/cpu.parquet b/test_fixtures/cpu.parquet new file mode 100644 index 0000000000..86cae861b6 Binary files /dev/null and b/test_fixtures/cpu.parquet differ diff --git a/test_helpers_end_to_end/src/client.rs b/test_helpers_end_to_end/src/client.rs index 0f4567a973..5017b0bbba 100644 --- a/test_helpers_end_to_end/src/client.rs +++ b/test_helpers_end_to_end/src/client.rs @@ -1,12 +1,12 @@ //! 
Client helpers for writing end to end ng tests use arrow::record_batch::RecordBatch; use futures::{stream::FuturesUnordered, StreamExt}; +use generated_types::influxdata::pbdata::v1::WriteResponse; use http::Response; use hyper::{Body, Client, Request}; use influxdb_iox_client::{ connection::Connection, flight::generated_types::ReadInfo, - write::generated_types::WriteResponse, write_info::generated_types::{merge_responses, GetWriteInfoResponse, ShardStatus}, }; use observability_deps::tracing::info; diff --git a/test_helpers_end_to_end/src/config.rs b/test_helpers_end_to_end/src/config.rs index b3dc091a93..d4597e8584 100644 --- a/test_helpers_end_to_end/src/config.rs +++ b/test_helpers_end_to_end/src/config.rs @@ -290,6 +290,11 @@ impl TestConfig { self.with_env("INFLUXDB_IOX_FLIGHT_DO_GET_PANIC", times.to_string()) } + /// Configure maximum per-table query bytes for the querier. + pub fn with_querier_max_table_query_bytes(self, bytes: usize) -> Self { + self.with_env("INFLUXDB_IOX_MAX_TABLE_QUERY_BYTES", bytes.to_string()) + } + /// Changes the log to JSON for easier parsing. pub fn with_json_logs(self) -> Self { self.with_env("LOG_FORMAT", "json") diff --git a/write_summary/Cargo.toml b/write_summary/Cargo.toml index d3313a19ee..d303ad5b50 100644 --- a/write_summary/Cargo.toml +++ b/write_summary/Cargo.toml @@ -9,7 +9,7 @@ data_types = { path = "../data_types" } dml = { path = "../dml" } generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" workspace-hack = { path = "../workspace-hack"}