Merge branch 'main' into cn/one-at-a-time-please

pull/24376/head
kodiakhq[bot] 2022-11-03 13:53:35 +00:00 committed by GitHub
commit 3fcca070f0
121 changed files with 2345 additions and 1228 deletions


@ -172,8 +172,7 @@ jobs:
- cache_restore
- run:
name: Cargo doc
# excluding datafusion because it's effectively a dependency masqueraded as workspace crate.
command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion
command: cargo doc --document-private-items --no-deps --workspace
- cache_save
- run:
name: Compress Docs

Cargo.lock generated

@ -475,9 +475,9 @@ checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
[[package]]
name = "bytemuck"
version = "1.12.1"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da"
checksum = "5aec14f5d4e6e3f927cd0c81f72e5710d95ee9019fbeb4b3021193867491bfd8"
[[package]]
name = "byteorder"
@ -541,9 +541,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.0.73"
version = "1.0.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
checksum = "581f5dba903aac52ea3feb5ec4810848460ee833876f1f9b0fdeab1f19091574"
dependencies = [
"jobserver",
]
@ -718,9 +718,9 @@ dependencies = [
[[package]]
name = "cmake"
version = "0.1.48"
version = "0.1.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a"
checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c"
dependencies = [
"cc",
]
@ -843,9 +843,9 @@ dependencies = [
[[package]]
name = "const-random"
version = "0.1.14"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acf7ab93790ae0eac37744aff15866e9e3dcc31515d7bf34a6d0fc6c9726b564"
checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e"
dependencies = [
"const-random-macro",
"proc-macro-hack",
@ -853,9 +853,9 @@ dependencies = [
[[package]]
name = "const-random-macro"
version = "0.1.14"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c6495bfab021aa116773c3e215be28cee0604417ea358f49966fba050c40d9c"
checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb"
dependencies = [
"getrandom",
"once_cell",
@ -1129,10 +1129,12 @@ dependencies = [
[[package]]
name = "datafusion"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"ahash 0.8.0",
"arrow",
"arrow-buffer",
"arrow-schema",
"async-compression",
"async-trait",
"bytes",
@ -1147,6 +1149,7 @@ dependencies = [
"flate2",
"futures",
"glob",
"half 2.1.0",
"hashbrown",
"itertools",
"lazy_static",
@ -1161,7 +1164,7 @@ dependencies = [
"pin-project-lite",
"rand",
"smallvec",
"sqlparser 0.25.0",
"sqlparser",
"tempfile",
"tokio",
"tokio-stream",
@ -1173,31 +1176,32 @@ dependencies = [
[[package]]
name = "datafusion-common"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"arrow",
"chrono",
"object_store",
"ordered-float 3.3.0",
"parquet",
"sqlparser 0.25.0",
"sqlparser",
]
[[package]]
name = "datafusion-expr"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"ahash 0.8.0",
"arrow",
"datafusion-common",
"log",
"sqlparser 0.25.0",
"sqlparser",
]
[[package]]
name = "datafusion-optimizer"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"arrow",
"async-trait",
@ -1212,7 +1216,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"ahash 0.8.0",
"arrow",
@ -1223,6 +1227,7 @@ dependencies = [
"datafusion-expr",
"datafusion-row",
"hashbrown",
"itertools",
"lazy_static",
"md-5",
"ordered-float 3.3.0",
@ -1236,13 +1241,13 @@ dependencies = [
[[package]]
name = "datafusion-proto"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"arrow",
"async-trait",
"datafusion",
"datafusion-common",
"datafusion-expr",
"pbjson-build",
"prost 0.11.0",
"prost-build 0.11.1",
]
@ -1250,7 +1255,7 @@ dependencies = [
[[package]]
name = "datafusion-row"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"arrow",
"datafusion-common",
@ -1261,12 +1266,12 @@ dependencies = [
[[package]]
name = "datafusion-sql"
version = "13.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=48f73c6af3b0cc747c38b4a9c7a610f4630e8736#48f73c6af3b0cc747c38b4a9c7a610f4630e8736"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=2b08a43b82127ef144204e5999dd2730fa1c4756#2b08a43b82127ef144204e5999dd2730fa1c4756"
dependencies = [
"arrow",
"datafusion-common",
"datafusion-expr",
"sqlparser 0.25.0",
"sqlparser",
]
[[package]]
@ -1953,9 +1958,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "0.14.20"
version = "0.14.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac"
checksum = "abfba89e19b959ca163c7752ba59d737c1ceea53a5d31a149c805446fc958064"
dependencies = [
"bytes",
"futures-channel",
@ -2002,9 +2007,9 @@ dependencies = [
[[package]]
name = "iana-time-zone"
version = "0.1.51"
version = "0.1.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed"
checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
dependencies = [
"android_system_properties",
"core-foundation-sys",
@ -2266,7 +2271,7 @@ version = "0.1.0"
dependencies = [
"generated_types",
"snafu",
"sqlparser 0.26.0",
"sqlparser",
"workspace-hack",
]
@ -2289,6 +2294,7 @@ dependencies = [
"flatbuffers",
"futures",
"generated_types",
"hashbrown",
"hyper",
"iox_catalog",
"iox_query",
@ -3056,6 +3062,7 @@ version = "0.1.0"
dependencies = [
"bytes",
"criterion",
"data_types",
"dml",
"flate2",
"generated_types",
@ -3277,9 +3284,9 @@ dependencies = [
[[package]]
name = "once_cell"
version = "1.15.0"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860"
dependencies = [
"parking_lot_core 0.9.4",
]
@ -3310,9 +3317,9 @@ dependencies = [
[[package]]
name = "os_str_bytes"
version = "6.3.0"
version = "6.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
checksum = "3baf96e39c5359d2eb0dd6ccb42c62b91d9678aa68160d261b9e0ccbf9e9dea9"
[[package]]
name = "output_vt100"
@ -3606,9 +3613,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.25"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae"
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
[[package]]
name = "pprof"
@ -3655,7 +3662,7 @@ dependencies = [
"query_functions",
"schema",
"snafu",
"sqlparser 0.26.0",
"sqlparser",
"test_helpers",
"workspace-hack",
]
@ -3931,6 +3938,7 @@ dependencies = [
"schema",
"service_common",
"service_grpc_catalog",
"service_grpc_object_store",
"service_grpc_schema",
"sharder",
"snafu",
@ -4269,7 +4277,7 @@ dependencies = [
[[package]]
name = "rskafka"
version = "0.3.0"
source = "git+https://github.com/influxdata/rskafka.git?rev=8c98c56b5d4b06206ce40e21404a75e6bb7bf7af#8c98c56b5d4b06206ce40e21404a75e6bb7bf7af"
source = "git+https://github.com/influxdata/rskafka.git?rev=8678dfe049de05415929ffec7c1be8921bb057f7#8678dfe049de05415929ffec7c1be8921bb057f7"
dependencies = [
"async-socks5",
"async-trait",
@ -4281,10 +4289,10 @@ dependencies = [
"parking_lot 0.12.1",
"pin-project-lite",
"rand",
"snap",
"thiserror",
"tokio",
"tracing",
"zstd",
]
[[package]]
@ -4764,15 +4772,6 @@ dependencies = [
"unicode_categories",
]
[[package]]
name = "sqlparser"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd"
dependencies = [
"log",
]
[[package]]
name = "sqlparser"
version = "0.26.0"


@ -110,8 +110,8 @@ license = "MIT OR Apache-2.0"
[workspace.dependencies]
arrow = { version = "25.0.0" }
arrow-flight = { version = "25.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="48f73c6af3b0cc747c38b4a9c7a610f4630e8736", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="48f73c6af3b0cc747c38b4a9c7a610f4630e8736" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="2b08a43b82127ef144204e5999dd2730fa1c4756", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="2b08a43b82127ef144204e5999dd2730fa1c4756" }
parquet = { version = "25.0.0" }
# This profile optimizes for runtime performance and small binary size at the expense of longer


@ -4,40 +4,18 @@ use data_types::{
ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber,
TableSummary, Timestamp, Tombstone,
};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion::error::DataFusionError;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::create_basic_summary,
QueryChunk, QueryChunkMeta,
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use parquet_file::chunk::ParquetChunk;
use predicate::{delete_predicate::tombstones_to_delete_predicates, Predicate};
use schema::{merge::SchemaMerger, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use schema::{merge::SchemaMerger, sort::SortKey, Projection, Schema};
use std::{any::Any, sync::Arc};
use uuid::Uuid;
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
#[snafu(display("Failed to read parquet: {}", source))]
ReadParquet {
source: parquet_file::storage::ReadError,
},
#[snafu(display(
"Error reading IOx Metadata from Parquet IoxParquetMetadata: {}",
source
))]
ReadParquetMeta {
source: parquet_file::storage::ReadError,
},
}
/// A specialized `Error` for Compactor's query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// QueryableParquetChunk that implements QueryChunk and QueryChunkMeta for building query plans
#[derive(Debug, Clone)]
pub struct QueryableParquetChunk {
@ -134,8 +112,8 @@ impl QueryableParquetChunk {
}
impl QueryChunkMeta for QueryableParquetChunk {
fn summary(&self) -> Option<Arc<TableSummary>> {
Some(Arc::clone(&self.summary))
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(&self.summary)
}
fn schema(&self) -> Arc<Schema> {
@ -194,7 +172,7 @@ impl QueryChunk for QueryableParquetChunk {
&self,
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
_columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -213,33 +191,8 @@ impl QueryChunk for QueryableParquetChunk {
Ok(None)
}
/// Provides access to raw `QueryChunk` data as an
/// asynchronous stream of `RecordBatch`es filtered by a *required*
/// predicate. Note that not all chunks can evaluate all types of
/// predicates and this function will return an error
/// if requested to evaluate with a predicate that is not supported
///
/// This is the analog of the `TableProvider` in DataFusion
///
/// The reason we can't simply use the `TableProvider` trait
/// directly is that the data for a particular Table lives in
/// several chunks within a partition, so there needs to be an
/// implementation of `TableProvider` that stitches together the
/// streams from several different `QueryChunk`s.
fn read_filter(
&self,
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "compactor");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
self.data
.read_filter(predicate, selection, ctx.inner())
.context(ReadParquetSnafu)
.map_err(|e| DataFusionError::External(Box::new(e)))
fn data(&self) -> QueryChunkData {
QueryChunkData::Parquet(self.data.parquet_exec_input())
}
/// Returns chunk type


@ -0,0 +1,32 @@
use datafusion::{
config::{
OPT_COALESCE_TARGET_BATCH_SIZE, OPT_PARQUET_PUSHDOWN_FILTERS, OPT_PARQUET_REORDER_FILTERS,
},
prelude::SessionConfig,
};
// The default catalog name - this impacts what SQL queries use if not specified
pub const DEFAULT_CATALOG: &str = "public";
// The default schema name - this impacts what SQL queries use if not specified
pub const DEFAULT_SCHEMA: &str = "iox";
/// The maximum number of rows that DataFusion should create in each RecordBatch
pub const BATCH_SIZE: usize = 8 * 1024;
const COALESCE_BATCH_SIZE: usize = BATCH_SIZE / 2;
/// Return a SessionConfig object configured for IOx
pub fn iox_session_config() -> SessionConfig {
SessionConfig::new()
.with_batch_size(BATCH_SIZE)
.set_u64(
OPT_COALESCE_TARGET_BATCH_SIZE,
COALESCE_BATCH_SIZE.try_into().unwrap(),
)
// Enable parquet predicate pushdown optimization
.set_bool(OPT_PARQUET_PUSHDOWN_FILTERS, true)
.set_bool(OPT_PARQUET_REORDER_FILTERS, true)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
}
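As a rough usage sketch (not part of this diff), the returned `SessionConfig` would typically be handed to a DataFusion session; `SessionContext::with_config` is assumed from the DataFusion 13 API:

```rust
use datafusion::prelude::SessionContext;

/// Build a DataFusion session that uses the IOx defaults defined above.
fn iox_session_context() -> SessionContext {
    // Queries issued through this context resolve unqualified tables against
    // the `public` catalog and `iox` schema, produce RecordBatches of at most
    // BATCH_SIZE (8192) rows, and coalesce small batches towards
    // COALESCE_BATCH_SIZE.
    SessionContext::with_config(iox_session_config())
}
```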


@ -10,6 +10,7 @@
//! [datafusion_optimizer::utils](https://docs.rs/datafusion-optimizer/13.0.0/datafusion_optimizer/utils/index.html)
//! for expression manipulation functions.
pub mod config;
pub mod sender;
pub mod watch;


@ -15,7 +15,10 @@
use std::time::Duration;
use data_types::{DeletePredicate, NonEmptyString, PartitionKey, Sequence, StatValues, Statistics};
use data_types::{
DeletePredicate, NamespaceId, NonEmptyString, PartitionKey, Sequence, StatValues, Statistics,
TableId,
};
use hashbrown::HashMap;
use iox_time::{Time, TimeProvider};
use mutable_batch::MutableBatch;
@ -182,6 +185,32 @@ pub struct DmlWrite {
max_timestamp: i64,
/// The partition key derived for this write.
partition_key: PartitionKey,
// !!!!!!! TRANSITION TIME !!!!!!!
//
// While implementing "sending IDs over Kafka" (#4880) there has to be a
// transition period where the producers (routers) populate the fields, but
// the consumers (ingesters) do not utilise them.
//
// This period of overlap is necessary to support a rolling deployment where
// the consumers MAY be deployed before the producers, or the producer code
// MAY be rolled back due to a defect. During this potential rollback
// window, all fields need to be populated to ensure both new and old
// versions of the code can process the enqueued messages.
//
// Because the consumers (ingesters) and the producers (routers) use the
// same common application-level type to represent writes (the DmlWrite), it
// has to support the producer pushing the IDs into the DmlWrite, but the
// consumer must not make use of them.
//
// In a follow-up PR, this consumer will be switched to make use of the
// TableIds, at which point the table map will change from the current
// `Table name -> Data` to `TableId -> Data`, and the second map can be
// removed from the DmlWrite.
#[allow(dead_code)]
namespace_id: NamespaceId,
// Used to resolve the table ID for a given table name during serialisation.
table_ids: HashMap<String, TableId>,
}
impl DmlWrite {
@ -196,7 +225,9 @@ impl DmlWrite {
/// - a MutableBatch lacks an i64 "time" column
pub fn new(
namespace: impl Into<String>,
namespace_id: NamespaceId,
tables: HashMap<String, MutableBatch>,
table_ids: HashMap<String, TableId>,
partition_key: PartitionKey,
meta: DmlMeta,
) -> Self {
@ -221,10 +252,12 @@ impl DmlWrite {
Self {
namespace: namespace.into(),
tables,
table_ids,
partition_key,
meta,
min_timestamp: stats.min.unwrap(),
max_timestamp: stats.max.unwrap(),
namespace_id,
}
}
@ -284,7 +317,13 @@ impl DmlWrite {
.iter()
.map(|(k, v)| std::mem::size_of_val(k) + k.capacity() + v.size())
.sum::<usize>()
+ self
.table_ids
.keys()
.map(|k| std::mem::size_of_val(k) + k.capacity() + std::mem::size_of::<TableId>())
.sum::<usize>()
+ self.meta.size()
+ std::mem::size_of::<NamespaceId>()
+ std::mem::size_of::<PartitionKey>()
- std::mem::size_of::<DmlMeta>()
}
@ -293,6 +332,28 @@ impl DmlWrite {
pub fn partition_key(&self) -> &PartitionKey {
&self.partition_key
}
/// Return the [`TableId`] for the given table name, if it is present in this write.
///
/// # Safety
///
/// Marked unsafe because of the critical invariant; Kafka consumers MUST NOT
/// utilise this method until this warning is removed. See [`DmlWrite`]
/// docs.
pub unsafe fn table_id(&self, name: &str) -> Option<TableId> {
self.table_ids.get(name).cloned()
}
/// Return the [`NamespaceId`] to which this [`DmlWrite`] should be applied.
///
/// # Safety
///
/// Marked unsafe because of the critical invariant; Kafka consumers MUST NOT
/// utilise this method until this warning is removed. See [`DmlWrite`]
/// docs.
pub unsafe fn namespace_id(&self) -> NamespaceId {
self.namespace_id
}
}
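As an aside, a minimal sketch of how a producer (router) might construct a `DmlWrite` during the transition period described above. The `TableId`/`NamespaceId` values, the `lines_to_batches` helper from `mutable_batch_lp`, and the `DmlMeta::unsequenced` constructor are illustrative assumptions, not taken from this diff:

```rust
use data_types::{NamespaceId, PartitionKey, TableId};
use dml::{DmlMeta, DmlWrite};
use hashbrown::HashMap;
use mutable_batch_lp::lines_to_batches;

fn build_transition_write() -> DmlWrite {
    // Per-table data, still keyed by table name as before.
    let tables = lines_to_batches("mem foo=1 10", 0).unwrap();

    // The parallel name -> catalog ID map the producer now also populates, so
    // both old (name-keyed) and new (ID-keyed) consumers can decode the write.
    let table_ids: HashMap<String, TableId> =
        [("mem".to_string(), TableId::new(42))].into_iter().collect();

    DmlWrite::new(
        "my_namespace",             // namespace name (still used by consumers)
        NamespaceId::new(1),        // namespace ID (ignored by consumers for now)
        tables,
        table_ids,
        PartitionKey::from("1970-01-01"),
        DmlMeta::unsequenced(None), // assumed constructor for an unsequenced write
    )
}
```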
/// A delete operation
@ -363,7 +424,7 @@ impl DmlDelete {
/// Test utilities
pub mod test_util {
use arrow_util::display::pretty_format_batches;
use schema::selection::Selection;
use schema::Projection;
use super::*;
@ -405,8 +466,8 @@ pub mod test_util {
let b_batch = b.table(table_name).expect("table not found");
assert_eq!(
pretty_format_batches(&[a_batch.to_arrow(Selection::All).unwrap()]).unwrap(),
pretty_format_batches(&[b_batch.to_arrow(Selection::All).unwrap()]).unwrap(),
pretty_format_batches(&[a_batch.to_arrow(Projection::All).unwrap()]).unwrap(),
pretty_format_batches(&[b_batch.to_arrow(Projection::All).unwrap()]).unwrap(),
"batches for table \"{}\" differ",
table_name
);


@ -1,6 +1,6 @@
# InfluxDB CLI cookbook
You can use the `influxdb_iox` command line tool to interact with the server in various ways
You can use the `influxdb_iox` command line tool to interact with the IOx server in various ways. This document contains a brief tour of the highlights; detailed information on each command can be found by passing `--help`.
## Ports
@ -12,65 +12,20 @@ To connect on a different port, use the `--host` argument:
$ influxdb_iox --host http://localhost:8083 <command>
```
## List all namespaces
## Getting data into IOx
You can load data in parallel using the `influxdb_iox` client by specifying one or more files on the command line.
This command uses the HTTP v2 endpoint, which typically runs on port 8080, rather than the default gRPC port 8082:
```shell
# Connects to port 8082 (gRPC by default)
$ influxdb_iox debug namespace list
[
{
"id": "1",
"name": "26f7e5a4b7be365b_917b97a92e883afc"
}
]
```
## List Schema in a Namespace
```shell
$ influxdb_iox debug schema get 26f7e5a4b7be365b_917b97a92e883afc
{
"id": "1",
"kafkaTopicId": "1",
"queryPoolId": "1",
"tables": {
"mem": {
"id": "2",
"columns": {
"time": {
"id": "10",
"columnType": 6
},
"host": {
"id": "16",
"columnType": 7
},
"available": {
"id": "17",
"columnType": 1
},
"wired": {
...
```
Alternately you can use `show tables` using SQL (see [sql cookbook](sql.md) for more details):
```shell
$ influxdb_iox query 26f7e5a4b7be365b_917b97a92e883afc 'show tables'
+---------------+--------------------+------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+------------+------------+
| public | iox | cpu | BASE TABLE |
| public | iox | disk | BASE TABLE |
| public | iox | diskio | BASE TABLE |
...
| public | information_schema | columns | VIEW |
+---------------+--------------------+------------+------------+
influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp
```
## Run Queries
### SQL
You can run an individual SQL query using the `query` command, providing the namespace and the SQL text. See the [sql cookbook](sql.md) for more detailed documentation on SQL.
```shell
$ influxdb_iox query 26f7e5a4b7be365b_917b97a92e883afc 'select count(*), cpu as cpu_num from cpu group by cpu'
@ -97,30 +52,9 @@ $ influxdb_iox query 26f7e5a4b7be365b_917b97a92e883afc 'select count(*), cpu as
+-----------------+-----------+
```
### InfluxRPC (used by Flux and InfluxQL)
### SQL REPL
```shell
TODO
```
### Ingester (used internally to IOx to query unpersisted data)
```shell
# Note you need to connect to the ingester (running on port 8083 in all in one mode)
$ influxdb_iox query-ingester --host http://localhost:8083 26f7e5a4b7be365b_917b97a92e883afc mem available_percent | head
+--------------------+
| available_percent |
+--------------------+
| 56.58011436462402 |
| 57.43834972381592 |
| 57.46076703071594 |
| 57.482320070266724 |
| 57.447218894958496 |
| 57.420217990875244 |
| 57.361191511154175 |
```
### SQL Repl
IOx comes with its own Read Evaluate Print Loop (REPL) for running SQL interactively. See the [sql cookbook](sql.md) for more detailed documentation.
```shell
$ influxdb_iox sql
@ -137,3 +71,135 @@ You are now in remote mode, querying database 26f7e5a4b7be365b_917b97a92e883afc
+-----------------+
Returned 1 row in 59.410821ms
```
## Getting data out of IOx
## Fetch the parquet files for a particular table
You can retrieve the parquet files used to store a particular table into a local directory:
```shell
$ influxdb_iox remote store get-table 26f7e5a4b7be365b_917b97a92e883afc mem
found 3 Parquet files, downloading...
downloading file 1 of 3 (1ce7e327-7b48-478f-b141-96e8d366ca12.5.parquet)...
downloading file 2 of 3 (fa45a0db-5e9e-4374-b3d3-8294b5e7ade0.5.parquet)...
downloading file 3 of 3 (ad5e47f6-b984-400b-99c2-f562151985d6.5.parquet)...
Done.
```
These are standard parquet files and can be read by any other tool that understands the parquet file format.
## Convert parquet files into line protocol
Parquet files created by IOx can be converted back into the Line Protocol format using metadata stored in the file:
```shell
$ influxdb_iox debug parquet-to-lp mem/1ce7e327-7b48-478f-b141-96e8d366ca12.5.parquet
disk,device=disk1s1s1,fstype=apfs,host=MacBook-Pro-8.local,mode=ro,path=/ free=89205854208i,inodes_free=871150920i,inodes_total=871652968i,inodes_used=502048i,total=1000240963584i,used=911035109376i,used_percent=91.0815635975992 1667300090000000000
disk,device=disk1s1,fstype=apfs,host=MacBook-Pro-8.local,mode=rw,path=/System/Volumes/Update/mnt1 free=89205854208i,inodes_free=871150920i,inodes_total=871652990i,inodes_used=502070i,total=1000240963584i,used=911035109376i,used_percent=91.0815635975992 1667300090000000000
...
```
Note that you can also write parquet files that came from IOx to another IOx instance using the `influxdb_iox write` command.
## Inspect The Catalog
## List all namespaces
```shell
# Connects to port 8082 (gRPC by default)
$ influxdb_iox namespace list
[
{
"id": "1",
"name": "26f7e5a4b7be365b_917b97a92e883afc"
}
]
```
## List Schema in a Namespace
```shell
$ influxdb_iox debug schema get 26f7e5a4b7be365b_917b97a92e883afc
{
"id": "1",
"topicId": "1",
"queryPoolId": "1",
"tables": {
"cpu": {
"id": "5",
"columns": {
"host": {
"id": "56",
"columnType": "COLUMN_TYPE_TAG"
},
"usage_nice": {
"id": "51",
"columnType": "COLUMN_TYPE_F64"
},
...
```
Alternatively, you can run `show tables` using SQL (see the [sql cookbook](sql.md) for more details):
```shell
$ influxdb_iox query 26f7e5a4b7be365b_917b97a92e883afc 'show tables'
+---------------+--------------------+------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+------------+------------+
| public | iox | cpu | BASE TABLE |
| public | iox | disk | BASE TABLE |
| public | iox | diskio | BASE TABLE |
...
| public | information_schema | columns | VIEW |
+---------------+--------------------+------------+------------+
```
## Advanced Querying
These CLI options are most often used for developing and debugging IOx and are not intended for end users.
### InfluxRPC (used by Flux and InfluxQL)
`influxrpc` is the name used to describe the protocol for talking to the Flux and InfluxQL services. There is limited CLI support for making such queries. For example, to run a `measurement-fields` request:
```shell
$ influxdb_iox storage 26f7e5a4b7be365b_917b97a92e883afc measurement-fields cpu
tag values: 10
+----------------------------------------------+
| values |
+----------------------------------------------+
| key: usage_guest, type: 0, timestamp: 0 |
| key: usage_guest_nice, type: 0, timestamp: 0 |
| key: usage_idle, type: 0, timestamp: 0 |
| key: usage_iowait, type: 0, timestamp: 0 |
| key: usage_irq, type: 0, timestamp: 0 |
| key: usage_nice, type: 0, timestamp: 0 |
| key: usage_softirq, type: 0, timestamp: 0 |
| key: usage_steal, type: 0, timestamp: 0 |
| key: usage_system, type: 0, timestamp: 0 |
| key: usage_user, type: 0, timestamp: 0 |
+----------------------------------------------+
```
### Ingester (used internally to IOx to query unpersisted data)
You can make direct queries to the ingester to see its unpersisted data using the `query-ingester` command. Note that you need to connect to the ingester directly (it runs on port 8083 in all-in-one mode):
```shell
$ influxdb_iox query-ingester --host http://localhost:8083 26f7e5a4b7be365b_917b97a92e883afc swap
+------------+---------------------+----+-----+----------------------+------------+------------+-------------------+
| free | host | in | out | time | total | used | used_percent |
+------------+---------------------+----+-----+----------------------+------------+------------+-------------------+
| 1496055808 | MacBook-Pro-8.local | | | 2022-11-01T10:08:40Z | 6442450944 | 4946395136 | 76.77815755208334 |
| | MacBook-Pro-8.local | 0 | 0 | 2022-11-01T10:08:40Z | | | |
| 1496055808 | MacBook-Pro-8.local | | | 2022-11-01T10:08:40Z | 6442450944 | 4946395136 | 76.77815755208334 |
| | MacBook-Pro-8.local | 0 | 0 | 2022-11-01T10:08:40Z | | | |
| 1496055808 | MacBook-Pro-8.local | | | 2022-11-01T10:08:50Z | 6442450944 | 4946395136 | 76.77815755208334 |
| | MacBook-Pro-8.local | 0 | 0 | 2022-11-01T10:08:50Z | | | |
| 1496055808 | MacBook-Pro-8.local | | | 2022-11-01T10:08:50Z | 6442450944 | 4946395136 | 76.77815755208334 |
...
```


@ -26,6 +26,6 @@ clap_blocks = { path = "../clap_blocks" }
data_types = { path = "../data_types" }
filetime = "0.2"
metric = { path = "../metric" }
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
parquet_file = { path = "../parquet_file" }
tempfile = "3"


@ -8,6 +8,9 @@ message DatabaseBatch {
// The destination database name / namespace for this write.
string database_name = 1;
// The catalog ID for this database / namespace.
int64 database_id = 4;
// An optional partition key for this batch.
//
// If specified, all batches in this write MUST map to this partition key.
@ -22,6 +25,9 @@ message DatabaseBatch {
message TableBatch {
string table_name = 1;
// The catalog ID for this table.
int64 table_id = 4;
// Data are represented here.
//
// Exactly one column named and typed "time" *must* exist,
@ -119,7 +125,7 @@ message Column {
}
// Note there used to be a service that would load this internal protobuf format.
// See https://github.com/influxdata/influxdb_iox/pull/5750 and
// See https://github.com/influxdata/influxdb_iox/pull/5750 and
// https://github.com/influxdata/influxdb_iox/issues/4866
// for rationale of why it was removed


@ -17,7 +17,7 @@ uuid = { version = "1", features = ["v4"] }
[dev-dependencies] # In alphabetical order
mockito = "0.31"
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
parking_lot = "0.12"
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
test_helpers = { path = "../test_helpers" }


@ -16,6 +16,7 @@ impl Client {
let response = self
.request(Method::POST, &create_bucket_url)
.header("Content-Type", "application/json")
.body(
serde_json::to_string(&post_bucket_request.unwrap_or_default())
.context(SerializingSnafu)?,
@ -47,6 +48,7 @@ mod tests {
let mock_server = mock("POST", "/api/v2/buckets")
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"orgID":"{}","name":"{}","retentionRules":[]}}"#,


@ -73,6 +73,7 @@ impl Client {
};
let response = self
.request(Method::POST, &create_label_url)
.header("Content-Type", "application/json")
.body(serde_json::to_string(&body).context(SerializingSnafu)?)
.send()
.await
@ -100,6 +101,7 @@ impl Client {
let body = LabelUpdate { name, properties };
let response = self
.request(Method::PATCH, &update_label_url)
.header("Content-Type", "application/json")
.body(serde_json::to_string(&body).context(SerializingSnafu)?)
.send()
.await
@ -198,6 +200,7 @@ mod tests {
let mock_server = mock("POST", BASE_PATH)
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"orgID":"{}","name":"{}","properties":{{"some-key":"some-value"}}}}"#,
@ -222,6 +225,7 @@ mod tests {
let mock_server = mock("POST", BASE_PATH)
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(format!(r#"{{"orgID":"{}","name":"{}"}}"#, org_id, name).as_str())
.create();
@ -242,6 +246,7 @@ mod tests {
let mock_server = mock("PATCH", format!("{}/{}", BASE_PATH, label_id).as_str())
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"name":"{}","properties":{{"some-key":"some-value"}}}}"#,
@ -267,6 +272,7 @@ mod tests {
let mock_server = mock("PATCH", format!("{}/{}", BASE_PATH, label_id).as_str())
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body("{}")
.create();


@ -54,6 +54,7 @@ impl Client {
let response = self
.request(Method::POST, &setup_init_url)
.header("Content-Type", "application/json")
.body(serde_json::to_string(&body).context(SerializingSnafu)?)
.send()
.await
@ -94,6 +95,7 @@ impl Client {
let response = self
.request(Method::POST, &setup_new_url)
.header("Content-Type", "application/json")
.body(serde_json::to_string(&body).context(SerializingSnafu)?)
.send()
.await
@ -138,6 +140,7 @@ mod tests {
let retention_period_hrs = 1;
let mock_server = mock("POST", "/api/v2/setup")
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"username":"{}","org":"{}","bucket":"{}","password":"{}","retentionPeriodHrs":{}}}"#,
@ -173,6 +176,7 @@ mod tests {
let mock_server = mock("POST", "/api/v2/setup/user")
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"username":"{}","org":"{}","bucket":"{}","password":"{}","retentionPeriodHrs":{}}}"#,
@ -204,6 +208,7 @@ mod tests {
let bucket = "some-bucket";
let mock_server = mock("POST", "/api/v2/setup")
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"username":"{}","org":"{}","bucket":"{}"}}"#,
@ -231,6 +236,7 @@ mod tests {
let mock_server = mock("POST", "/api/v2/setup/user")
.match_header("Authorization", format!("Token {}", token).as_str())
.match_header("Content-Type", "application/json")
.match_body(
format!(
r#"{{"username":"{}","org":"{}","bucket":"{}"}}"#,


@ -12,7 +12,7 @@ use nom::bytes::complete::tag;
use nom::character::complete::{char, multispace0};
use nom::combinator::{cut, map, opt, value};
use nom::multi::{many0, separated_list0};
use nom::sequence::{delimited, pair, preceded, separated_pair, tuple};
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
use std::fmt::{Display, Formatter, Write};
/// An InfluxQL arithmetic expression.
@ -316,11 +316,54 @@ where
)(i)
}
/// Parse a variable reference, which is an identifier followed by an optional cast expression.
/// Parse a segmented identifier
///
/// ```text
/// segmented_identifier ::= identifier |
/// ( identifier "." identifier ) |
/// ( identifier "." identifier? "." identifier )
/// ```
fn segmented_identifier(i: &str) -> ParseResult<&str, Identifier> {
let (remaining, (opt_prefix, name)) = pair(
opt(alt((
// ident2 "." ident1 "."
map(
pair(
terminated(identifier, tag(".")),
terminated(identifier, tag(".")),
),
|(ident2, ident1)| (Some(ident2), Some(ident1)),
),
// identifier ".."
map(terminated(identifier, tag("..")), |ident2| {
(Some(ident2), None)
}),
// identifier "."
map(terminated(identifier, tag(".")), |ident1| {
(None, Some(ident1))
}),
))),
identifier,
)(i)?;
Ok((
remaining,
match opt_prefix {
Some((None, Some(ident1))) => format!("{}.{}", ident1.0, name.0).into(),
Some((Some(ident2), None)) => format!("{}..{}", ident2.0, name.0).into(),
Some((Some(ident2), Some(ident1))) => {
format!("{}.{}.{}", ident2.0, ident1.0, name.0).into()
}
_ => name,
},
))
}
/// Parse a variable reference, which is a segmented identifier followed by an optional cast expression.
pub(crate) fn var_ref(i: &str) -> ParseResult<&str, Expr> {
map(
pair(
identifier,
segmented_identifier,
opt(preceded(
tag("::"),
expect(
@ -515,6 +558,19 @@ mod test {
let (_, got) = var_ref("foo").unwrap();
assert_eq!(got, var_ref!("foo"));
// Whilst this is parsed as a 3-part name, it is treated as a quoted string 🙄
// VarRefs are parsed as segmented identifiers
//
// * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2515-L2516
//
// and then the segments are joined as a single string
//
// * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2551
let (rem, got) = var_ref("db.rp.foo").unwrap();
assert_eq!(got, var_ref!("db.rp.foo"));
assert_eq!(format!("{}", got), r#""db.rp.foo""#);
assert_eq!(rem, "");
// with cast operator
let (_, got) = var_ref("foo::tag").unwrap();
assert_eq!(got, var_ref!("foo", Tag));
@ -539,6 +595,62 @@ mod test {
assert!(got.is_empty())
}
#[test]
fn test_segmented_identifier() {
// Unquoted
let (rem, id) = segmented_identifier("part0").unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "part0");
// id.id
let (rem, id) = segmented_identifier("part1.part0").unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "\"part1.part0\"");
// id..id
let (rem, id) = segmented_identifier("part2..part0").unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "\"part2..part0\"");
// id.id.id
let (rem, id) = segmented_identifier("part2.part1.part0").unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "\"part2.part1.part0\"");
// "id"."id".id
let (rem, id) = segmented_identifier(r#""part 2"."part 1".part0"#).unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "\"part 2.part 1.part0\"");
// Only parses 3 segments
let (rem, id) = segmented_identifier("part2.part1.part0.foo").unwrap();
assert_eq!(rem, ".foo");
assert_eq!(format!("{}", id), "\"part2.part1.part0\"");
// Quoted
let (rem, id) = segmented_identifier("\"part0\"").unwrap();
assert_eq!(rem, "");
assert_eq!(format!("{}", id), "part0");
// Additional test cases, with compatibility proven via https://go.dev/play/p/k2150CJocVl
let (rem, id) = segmented_identifier(r#""part" 2"."part 1".part0"#).unwrap();
assert_eq!(rem, r#" 2"."part 1".part0"#);
assert_eq!(format!("{}", id), "part");
let (rem, id) = segmented_identifier(r#""part" 2."part 1".part0"#).unwrap();
assert_eq!(rem, r#" 2."part 1".part0"#);
assert_eq!(format!("{}", id), "part");
let (rem, id) = segmented_identifier(r#""part "2"."part 1".part0"#).unwrap();
assert_eq!(rem, r#"2"."part 1".part0"#);
assert_eq!(format!("{}", id), r#""part ""#);
let (rem, id) = segmented_identifier(r#""part ""2"."part 1".part0"#).unwrap();
assert_eq!(rem, r#""2"."part 1".part0"#);
assert_eq!(format!("{}", id), r#""part ""#);
}
#[test]
fn test_display_expr() {
let (_, e) = arithmetic_expression("5 + 51").unwrap();


@ -249,15 +249,15 @@ fn single_duration(i: &str) -> ParseResult<&str, i64> {
pair(
integer,
alt((
value(Nanosecond, tag("ns")), // nanoseconds
value(Microsecond, tag("µs")), // microseconds
value(Microsecond, tag("us")), // microseconds
value(Millisecond, tag("ms")), // milliseconds
value(Second, tag("s")), // seconds
value(Minute, tag("m")), // minutes
value(Hour, tag("h")), // hours
value(Day, tag("d")), // days
value(Week, tag("w")), // weeks
value(Nanosecond, tag("ns")), // nanoseconds
value(Microsecond, tag("µ")), // microseconds
value(Microsecond, tag("u")), // microseconds
value(Millisecond, tag("ms")), // milliseconds
value(Second, tag("s")), // seconds
value(Minute, tag("m")), // minutes
value(Hour, tag("h")), // hours
value(Day, tag("d")), // days
value(Week, tag("w")), // weeks
)),
),
|(v, unit)| match unit {
@ -410,10 +410,14 @@ mod test {
let (_, got) = single_duration("38ns").unwrap();
assert_eq!(got, 38);
let (_, got) = single_duration("22us").unwrap();
let (_, got) = single_duration("22u").unwrap();
assert_eq!(got, 22 * NANOS_PER_MICRO);
let (_, got) = single_duration("7µs").unwrap();
let (rem, got) = single_duration("22us").unwrap();
assert_eq!(got, 22 * NANOS_PER_MICRO);
assert_eq!(rem, "s"); // prove that we ignore the trailing s
let (_, got) = single_duration("7µ").unwrap();
assert_eq!(got, 7 * NANOS_PER_MICRO);
let (_, got) = single_duration("15ms").unwrap();


@ -774,6 +774,12 @@ mod test {
select_statement("SELECT value FROM cpu WHERE time <= now()TZ('Australia/Hobart')")
.unwrap();
assert_eq!(rem, "");
// segmented var ref identifiers
let (rem, _) =
select_statement(r#"SELECT LAST("n.usage_user") FROM cpu WHERE n.usage_user > 0"#)
.unwrap();
assert_eq!(rem, "");
}
#[test]
@ -848,6 +854,16 @@ mod test {
}
);
// Parse expression with an alias and no unnecessary whitespace
let (_, got) = Field::parse("LAST(\"n.asks\")").unwrap();
assert_eq!(
got,
Field {
expr: call!("LAST", var_ref!("n.asks")),
alias: None
}
);
// Parse a call with a VarRef
let (_, got) = Field::parse("DISTINCT foo AS bar").unwrap();
assert_eq!(


@ -7,8 +7,8 @@
use crate::impl_tuple_clause;
use crate::internal::{expect, ParseError, ParseResult};
use nom::branch::alt;
use nom::bytes::complete::{is_not, tag};
use nom::character::complete::char;
use nom::bytes::complete::{is_not, tag, take_till};
use nom::character::complete::{anychar, char};
use nom::combinator::{map, value, verify};
use nom::error::Error;
use nom::multi::fold_many0;
@ -137,13 +137,24 @@ fn regex_literal(i: &str) -> ParseResult<&str, &str> {
loop {
// match everything except `\`, `/` or `\n`
let (_, match_i) = is_not("\\/\n")(remaining)?;
let (_, match_i) = take_till(|c| c == '\\' || c == '/' || c == '\n')(remaining)?;
consumed = &i[..(consumed.len() + match_i.len())];
remaining = &i[consumed.len()..];
// If we didn't consume anything, check whether it is a newline or regex delimiter,
// which signals we should leave this parser for outer processing.
if consumed.is_empty() {
is_not("/\n")(remaining)?;
}
// Try and consume '\' followed by a '/'
if let Ok((remaining_i, _)) = char::<_, Error<&str>>('\\')(remaining) {
if char::<_, Error<&str>>('/')(remaining_i).is_ok() {
// If we didn't consume anything, but we found a "\/" sequence,
// we need to return an error so the outer fold_many0 parser does not trigger
// an infinite recursion error.
anychar(consumed)?;
// We're escaping a '/' (a regex delimiter), so finish and let
// the outer parser match and unescape
return Ok((remaining, consumed));
@ -201,6 +212,10 @@ mod test {
let (_, got) = double_quoted_string(r#""quick draw""#).unwrap();
assert_eq!(got, "quick draw");
// ascii
let (_, got) = double_quoted_string(r#""n.asks""#).unwrap();
assert_eq!(got, "n.asks");
// unicode
let (_, got) = double_quoted_string("\"quick draw\u{1f47d}\"").unwrap();
assert_eq!(
@ -265,6 +280,9 @@ mod test {
let (_, got) = single_quoted_string(r#"'\n\''"#).unwrap();
assert_eq!(got, "\n'");
let (_, got) = single_quoted_string(r#"'\'hello\''"#).unwrap();
assert_eq!(got, "'hello'");
// literal tab
let (_, got) = single_quoted_string("'quick\tdraw'").unwrap();
assert_eq!(got, "quick\tdraw");
@ -300,13 +318,17 @@ mod test {
assert_eq!(got, "hello".into());
// handle escaped delimiters "\/"
let (_, got) = regex(r#"/this\/is\/a\/path/"#).unwrap();
assert_eq!(got, "this/is/a/path".into());
let (_, got) = regex(r#"/\/this\/is\/a\/path/"#).unwrap();
assert_eq!(got, "/this/is/a/path".into());
// ignores any other possible escape sequence
let (_, got) = regex(r#"/hello\n/"#).unwrap();
assert_eq!(got, "hello\\n".into());
// can parse possible escape sequence at beginning of regex
let (_, got) = regex(r#"/\w.*/"#).unwrap();
assert_eq!(got, "\\w.*".into());
// Empty regex
let (i, got) = regex("//").unwrap();
assert_eq!(i, "");


@ -57,7 +57,7 @@ humantime = "2.1.0"
itertools = "0.10.5"
libc = { version = "0.2" }
num_cpus = "1.13.0"
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
rustyline = { version = "10.0", default-features = false }
serde_json = "1.0.87"
snafu = "0.7"


@ -2,7 +2,6 @@ use futures::Future;
use influxdb_iox_client::connection::Connection;
use snafu::prelude::*;
mod namespace;
mod parquet_to_lp;
mod print_cpu;
mod schema;
@ -14,10 +13,6 @@ pub enum Error {
#[snafu(display("Error in schema subcommand: {}", source))]
Schema { source: schema::Error },
#[snafu(context(false))]
#[snafu(display("Error in namespace subcommand: {}", source))]
Namespace { source: namespace::Error },
#[snafu(context(false))]
#[snafu(display("Error in parquet_to_lp subcommand: {}", source))]
ParquetToLp { source: parquet_to_lp::Error },
@ -41,9 +36,6 @@ enum Command {
/// Prints what CPU features are used by the compiler by default.
PrintCpu,
/// Interrogate IOx namespaces
Namespace(namespace::Config),
/// Interrogate the schema of a namespace
Schema(schema::Config),
@ -61,10 +53,6 @@ where
{
match config.command {
Command::PrintCpu => print_cpu::main(),
Command::Namespace(config) => {
let connection = connection().await;
namespace::command(connection, config).await?
}
Command::Schema(config) => {
let connection = connection().await;
schema::command(connection, config).await?


@ -20,7 +20,7 @@ pub struct Config {
command: Command,
}
/// All possible subcommands for catalog
/// All possible subcommands for namespace
#[derive(Debug, clap::Parser)]
enum Command {
/// Fetch namespaces


@ -172,6 +172,7 @@ pub enum Format {
#[derive(Debug, clap::Parser)]
enum Command {
MeasurementFields(MeasurementFields),
MeasurementTagKeys(MeasurementTagKeys),
ReadFilter,
ReadGroup(ReadGroup),
ReadWindowAggregate(ReadWindowAggregate),
@ -184,6 +185,12 @@ struct MeasurementFields {
measurement: String,
}
#[derive(Debug, clap::Parser)]
struct MeasurementTagKeys {
#[clap(action)]
measurement: String,
}
#[derive(Debug, clap::Parser)]
struct ReadGroup {
#[clap(
@ -279,6 +286,22 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
Format::Quiet => {}
}
}
Command::MeasurementTagKeys(m) => {
let result = client
.measurement_tag_keys(request::measurement_tag_keys(
source,
m.measurement,
config.start,
config.stop,
predicate,
))
.await
.context(ServerSnafu)?;
match config.format {
Format::Pretty => response::pretty_print_strings(result).context(ResponseSnafu)?,
Format::Quiet => {}
}
}
Command::ReadFilter => {
let result = client
.read_filter(request::read_filter(


@ -33,6 +33,21 @@ pub fn measurement_fields(
}
}
pub fn measurement_tag_keys(
org_bucket: Any,
measurement: String,
start: i64,
stop: i64,
predicate: std::option::Option<Predicate>,
) -> MeasurementTagKeysRequest {
generated_types::MeasurementTagKeysRequest {
source: Some(org_bucket),
measurement,
range: Some(TimestampRange { start, end: stop }),
predicate,
}
}
pub fn read_filter(
org_bucket: Any,
start: i64,


@ -31,6 +31,7 @@ mod commands {
pub mod compactor;
pub mod debug;
pub mod import;
pub mod namespace;
pub mod query;
pub mod query_ingester;
pub mod remote;
@ -200,6 +201,9 @@ enum Command {
/// Commands related to the bulk ingest of data
Import(commands::import::Config),
/// Various commands for namespace manipulation
Namespace(commands::namespace::Config),
}
fn main() -> Result<(), std::io::Error> {
@ -349,6 +353,14 @@ fn main() -> Result<(), std::io::Error> {
std::process::exit(ReturnCode::Failure as _)
}
}
Some(Command::Namespace(config)) => {
let _tracing_guard = handle_init_logs(init_simple_logs(log_verbose_count));
let connection = connection().await;
if let Err(e) = commands::namespace::command(connection, config).await {
eprintln!("{}", e);
std::process::exit(ReturnCode::Failure as _)
}
}
}
});


@ -533,7 +533,7 @@ async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, e
);
}
/// Test the schema cli command
/// Test the namespace cli command
#[tokio::test]
async fn namespaces_cli() {
test_helpers::maybe_start_logging();
@ -556,7 +556,6 @@ async fn namespaces_cli() {
.unwrap()
.arg("-h")
.arg(&querier_addr)
.arg("debug")
.arg("namespace")
.arg("list")
.assert()


@ -177,6 +177,44 @@ async fn remote_store_get_table() {
}
.boxed()
})),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
// Test that we can download files from the querier (not just the router)
// to ensure it has the correct grpc services
let querier_addr = state.cluster().querier().querier_grpc_base().to_string();
let namespace = state.cluster().namespace().to_string();
// Ensure files are actually written to the filesystem
let dir = tempfile::tempdir().expect("could not get temporary directory");
Command::cargo_bin("influxdb_iox")
.unwrap()
.current_dir(&dir)
.arg("-h")
.arg(&querier_addr)
.arg("remote")
.arg("store")
.arg("get-table")
.arg(&namespace)
.arg(&table_name)
.assert()
.success();
let table_dir = dir.as_ref().join(&table_name);
// There should be a directory created that, by default, is named the same as
// the table
assert!(table_dir.is_dir());
let entries: Vec<_> = table_dir.read_dir().unwrap().flatten().collect();
// The two Parquet files for this table should be present
assert_eq!(
entries.len(),
2,
"Expected 2 files in the directory, got: {entries:?}"
);
}
.boxed()
})),
],
)
.run()


@ -7,8 +7,8 @@ license.workspace = true
[dependencies]
arrow = { workspace = true, features = ["prettyprint"] }
arrow-flight = { workspace = true }
arrow_util = { path = "../arrow_util" }
arrow-flight = { workspace = true }
async-trait = "0.1.58"
backoff = { path = "../backoff" }
bytes = "1.2"
@ -20,6 +20,7 @@ dml = { path = "../dml" }
flatbuffers = "2.1.2"
futures = "0.3"
generated_types = { path = "../generated_types" }
hashbrown = "0.12.3"
hyper = "0.14"
iox_catalog = { path = "../iox_catalog" }
iox_query = { path = "../iox_query" }

ingester/src/arcmap.rs Normal file

@ -0,0 +1,331 @@
//! A map key-value map where values are always wrapped in an [`Arc`], with
//! helper methods for exactly-once initialisation.
#![allow(dead_code)]
use std::{
borrow::Borrow,
hash::{BuildHasher, Hash, Hasher},
sync::Arc,
};
use hashbrown::{
hash_map::{DefaultHashBuilder, RawEntryMut},
HashMap,
};
use parking_lot::RwLock;
/// A key-value map where all values are wrapped in [`Arc`]'s and shared across
/// all readers of a given key.
///
/// Each key in an [`ArcMap`] is initialised exactly once, with subsequent
/// lookups being handed an [`Arc`] handle to the same instance.
#[derive(Debug)]
pub(crate) struct ArcMap<K, V, S = DefaultHashBuilder> {
map: RwLock<HashMap<K, Arc<V>, S>>,
hasher: S,
}
impl<K, V, S> std::ops::Deref for ArcMap<K, V, S> {
type Target = RwLock<HashMap<K, Arc<V>, S>>;
fn deref(&self) -> &Self::Target {
&self.map
}
}
impl<K, V> Default for ArcMap<K, V> {
fn default() -> Self {
Self {
map: Default::default(),
hasher: Default::default(),
}
}
}
impl<K, V, S> ArcMap<K, V, S>
where
K: Hash + Eq,
S: BuildHasher,
{
/// Fetch an [`Arc`]-wrapped `V` for `key`, or if this is the first lookup
/// for `key`, initialise the value with the provided `init` closure.
///
/// # Concurrency
///
/// This call is thread-safe - if two calls race, a value will be
/// initialised exactly once (one arbitrary caller's `init` closure will be
/// executed) and both callers will obtain a handle to the same instance of
/// `V`. Both threads will eagerly initialise V and race to "win" storing V
/// in the map.
///
/// # Performance
///
/// This method is biased towards read-heavy workloads, with many readers
/// progressing in parallel. If the value for `key` must be initialised, all
/// readers are blocked while `init` executes and the resulting `V` is
/// memorised.
pub(crate) fn get_or_else<Q, F>(&self, key: &Q, init: F) -> Arc<V>
where
Q: Hash + PartialEq<K> + ToOwned<Owned = K> + ?Sized,
F: FnOnce() -> Arc<V>,
{
// Memorise the hash outside of the lock.
//
// This allows the hash to be re-used (and not recomputed) if the value
// has to be inserted into the map after the existence check. It also
// obviously keeps the hashing outside of the lock.
let hash = self.compute_hash(key);
// First check if the entry exists already.
//
// This does NOT use an upgradable read lock, as readers waiting for an
// upgradeable read lock block other readers wanting an upgradeable read
// lock. If all readers do that, it's effectively an exclusive lock.
if let Some((_, v)) = self.map.read().raw_entry().from_hash(hash, |q| key == q) {
return Arc::clone(v);
}
// Otherwise acquire a write lock and insert the value if necessary (it
// is possible another thread initialised the value after the read check
// above, but before this write lock was granted).
let mut guard = self.map.write();
match guard.raw_entry_mut().from_hash(hash, |q| key == q) {
RawEntryMut::Occupied(v) => Arc::clone(v.get()),
RawEntryMut::Vacant(v) => {
Arc::clone(v.insert_hashed_nocheck(hash, key.to_owned(), init()).1)
}
}
}
/// A convenience method over [`Self::get_or_else()`] that initialises `V`
/// to the default value when `key` has no entry.
pub(crate) fn get_or_default<Q>(&self, key: &Q) -> Arc<V>
where
Q: Hash + PartialEq<K> + ToOwned<Owned = K> + ?Sized,
V: Default,
{
self.get_or_else(key, Default::default)
}
/// A getter for `key` that returns an [`Arc`]-wrapped `V`, or [`None`] if
/// `key` has not yet been initialised.
///
/// # Concurrency
///
/// This method is cheap, and multiple callers progress in parallel. Callers
/// are blocked by a call to [`Self::get_or_else()`] only when a `V` needs
/// to be initialised.
pub(crate) fn get<Q>(&self, key: &Q) -> Option<Arc<V>>
where
K: Borrow<Q>,
Q: Hash + PartialEq<K> + ?Sized,
{
let hash = self.compute_hash(key);
self.map
.read()
.raw_entry()
.from_hash(hash, |q| key == q)
.map(|(_k, v)| Arc::clone(v))
}
/// Insert `value` indexed by `key`.
///
/// # Panics
///
/// This method panics if a value already exists for `key`.
pub(crate) fn insert<Q>(&self, key: &Q, value: Arc<V>)
where
Q: Hash + PartialEq<K> + ToOwned<Owned = K> + ?Sized,
{
let hash = self.compute_hash(key);
match self
.map
.write()
.raw_entry_mut()
.from_hash(hash, |q| key == q)
{
RawEntryMut::Occupied(_) => panic!("inserting existing key into ArcMap"),
RawEntryMut::Vacant(view) => {
view.insert_hashed_nocheck(hash, key.to_owned(), value);
}
}
}
/// Return a state snapshot of all the values in this [`ArcMap`] in
/// arbitrary order.
///
/// # Concurrency
///
/// The snapshot generation is serialised w.r.t concurrent calls to mutate
/// `self` (that is, a new entry may appear immediately after the snapshot
/// is generated). Calls to [`Self::values`] and other "read" methods
/// proceed in parallel.
pub(crate) fn values(&self) -> Vec<Arc<V>> {
self.map.read().values().map(Arc::clone).collect()
}
fn compute_hash<Q: Hash + ?Sized>(&self, key: &Q) -> u64 {
let mut state = self.hasher.build_hasher();
key.hash(&mut state);
state.finish()
}
}
#[cfg(test)]
mod tests {
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Barrier,
};
use super::*;
#[test]
fn test_get() {
let map = ArcMap::<String, usize>::default();
let key: &str = "bananas";
assert!(map.get(key).is_none());
// Assert the value is initialised from the closure
let got: Arc<usize> = map.get_or_else(key, || Arc::new(42));
assert_eq!(*got, 42);
// Assert the same Arc is returned later.
let other = map.get(key).expect("should have been initialised");
assert!(Arc::ptr_eq(&got, &other));
}
#[test]
fn test_init_once() {
let map = ArcMap::<String, usize>::default();
let key: &str = "bananas";
// Assert the value is initialised from the closure
let got = map.get_or_else(key, || Arc::new(42));
assert_eq!(*got, 42);
// And subsequent calls observe the same value, regardless of the init
// closure
let got = map.get_or_else(key, || Arc::new(13));
assert_eq!(*got, 42);
let got = map.get_or_default(key);
assert_eq!(*got, 42);
}
#[test]
fn test_insert() {
let map = ArcMap::<String, usize>::default();
let key: &str = "bananas";
assert!(map.get(key).is_none());
// Insert the value directly
map.insert(key, Arc::new(42));
let got = map.get(key).unwrap();
assert_eq!(*got, 42);
// Assert the same Arc is returned later.
let other = map.get(key).expect("should have been initialised");
assert_eq!(*other, 42);
assert!(Arc::ptr_eq(&got, &other));
// And subsequent calls observe the same value, regardless of the init
// closure
let got = map.get_or_else(key, || Arc::new(13));
assert_eq!(*got, 42);
assert!(Arc::ptr_eq(&got, &other));
}
#[test]
fn test_values() {
let map = ArcMap::<usize, String>::default();
map.insert(&1, Arc::new("bananas".to_string()));
map.insert(&2, Arc::new("platanos".to_string()));
let mut got = map
.values()
.into_iter()
.map(|v| String::clone(&*v))
.collect::<Vec<_>>();
got.sort_unstable();
assert_eq!(got, &["bananas", "platanos"]);
}
#[test]
#[should_panic = "inserting existing key"]
fn test_insert_existing() {
let map = ArcMap::<String, usize>::default();
let key: &str = "bananas";
map.insert(key, Arc::new(42));
map.insert(key, Arc::new(42));
}
#[test]
#[allow(clippy::needless_collect)] // Only needless if you like deadlocks.
fn test_init_once_parallel() {
let map = Arc::new(ArcMap::<String, usize>::default());
const NUM_THREADS: usize = 10;
let barrier = Arc::new(Barrier::new(NUM_THREADS));
let init_count = Arc::new(AtomicUsize::new(0));
let key: &str = "bananas";
// Spawn NUM_THREADS and have all of them wait until all the threads
// have initialised before racing to initialise a V for key.
//
// Each thread tries to initialise V to a unique per-thread value, and
// this test asserts only one thread successfully initialises V to its
// unique value.
let handles = (0..NUM_THREADS)
.map(|i| {
let map = Arc::clone(&map);
let barrier = Arc::clone(&barrier);
let init_count = Arc::clone(&init_count);
std::thread::spawn(move || {
// Rendezvous with all threads before continuing to maximise
// the racy-ness.
barrier.wait();
let got = map.get_or_else(key, || {
init_count.fetch_add(1, Ordering::SeqCst);
Arc::new(i)
});
*got == i
})
})
.collect::<Vec<_>>();
let winners = handles
.into_iter()
.fold(0, |acc, h| if h.join().unwrap() { acc + 1 } else { acc });
assert_eq!(winners, 1); // Number of threads that observed their unique value
assert_eq!(init_count.load(Ordering::SeqCst), 1); // Number of init() calls
}
// Assert values can be "moved" due to FnOnce being used, vs. Fn.
//
// This is a compile-time assertion more than a runtime test.
#[test]
fn test_fn_once() {
let map = ArcMap::<String, String>::default();
// A non-copy value that is moved into the FnOnce
let v = "bananas".to_owned();
let v = map.get_or_else("platanos", move || Arc::new(v));
assert_eq!(*v, "bananas")
}
}

View File

@ -159,7 +159,7 @@ mod tests {
use arrow_util::assert_batches_eq;
use data_types::PartitionId;
use mutable_batch_lp::lines_to_batches;
use schema::selection::Selection;
use schema::Projection;
use super::*;
use crate::test_util::{
@ -182,7 +182,7 @@ mod tests {
.unwrap()
.get("cpu")
.unwrap()
.to_arrow(Selection::All)
.to_arrow(Projection::All)
.unwrap();
let batch = QueryAdaptor::new(

View File

@ -614,9 +614,11 @@ mod tests {
use dml::{DmlDelete, DmlMeta, DmlWrite};
use futures::TryStreamExt;
use iox_catalog::{mem::MemCatalog, validate_or_insert_schema};
use hashbrown::HashMap;
use iox_catalog::{interface::RepoCollection, mem::MemCatalog, validate_or_insert_schema};
use iox_time::Time;
use metric::{MetricObserver, Observation};
use mutable_batch::MutableBatch;
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
@ -664,9 +666,12 @@ mod tests {
let ignored_ts = Time::from_timestamp_millis(42);
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -705,9 +710,12 @@ mod tests {
.unwrap();
assert_matches!(action, DmlApplyAction::Applied(false));
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w2 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(&mut *catalog.repositories().await, namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(2)),
@ -750,16 +758,19 @@ mod tests {
Arc::clone(&catalog),
[(shard1.id, shard1.shard_index)],
Arc::new(Executor::new(1)),
Arc::new(CatalogPartitionResolver::new(catalog)),
Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))),
BackoffConfig::default(),
Arc::clone(&metrics),
));
let schema = NamespaceSchema::new(namespace.id, topic.id, query_pool.id, 100);
let batch = lines_to_batches("mem foo=1 10\nmem foo=1 11", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10\nmem foo=1 11", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -866,9 +877,12 @@ mod tests {
let ignored_ts = Time::from_timestamp_millis(42);
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -882,9 +896,12 @@ mod tests {
.unwrap()
.unwrap();
let batch = lines_to_batches("cpu foo=1 10", 1).unwrap();
let w2 = DmlWrite::new(
"foo",
lines_to_batches("cpu foo=1 10", 1).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(2), SequenceNumber::new(1)),
@ -900,9 +917,12 @@ mod tests {
// drop repos so the mem catalog won't deadlock.
std::mem::drop(repos);
let batch = lines_to_batches("mem foo=1 30", 2).unwrap();
let w3 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 30", 2).unwrap(),
namespace.id,
batch.clone(),
build_id_map(&mut *catalog.repositories().await, namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(2)),
@ -1131,7 +1151,7 @@ mod tests {
(shard2.id, shard2.shard_index),
],
Arc::new(Executor::new(1)),
Arc::new(CatalogPartitionResolver::new(catalog)),
Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))),
BackoffConfig::default(),
Arc::clone(&metrics),
));
@ -1141,9 +1161,12 @@ mod tests {
let ignored_ts = Time::from_timestamp_millis(42);
// write with sequence number 1
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -1158,9 +1181,12 @@ mod tests {
.unwrap();
// write with sequence number 2
let batch = lines_to_batches("mem foo=1 30\ncpu bar=1 20", 0).unwrap();
let w2 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 30\ncpu bar=1 20", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(2)),
@ -1256,9 +1282,12 @@ mod tests {
let ignored_ts = Time::from_timestamp_millis(42);
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -1267,9 +1296,12 @@ mod tests {
50,
),
);
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w2 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(2)),
@ -1434,7 +1466,7 @@ mod tests {
Arc::clone(&catalog),
[(shard1.id, shard_index)],
Arc::new(Executor::new(1)),
Arc::new(CatalogPartitionResolver::new(catalog)),
Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))),
BackoffConfig::default(),
Arc::clone(&metrics),
));
@ -1443,9 +1475,12 @@ mod tests {
let ignored_ts = Time::from_timestamp_millis(42);
let batch = lines_to_batches("mem foo=1 10", 0).unwrap();
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
namespace.id,
batch.clone(),
build_id_map(repos.deref_mut(), namespace.id, &batch).await,
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(1)),
@ -1515,4 +1550,28 @@ mod tests {
assert_eq!(progresses, expected_progresses);
}
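/// Upsert each table in `tables` into the catalog under `namespace_id`,
/// returning the resulting table name to `TableId` mapping (test helper).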
pub async fn build_id_map<R>(
catalog: &mut R,
namespace_id: NamespaceId,
tables: &HashMap<String, MutableBatch>,
) -> HashMap<String, TableId>
where
R: RepoCollection + ?Sized,
{
let mut ret = HashMap::with_capacity(tables.len());
for k in tables.keys() {
let id = catalog
.tables()
.create_or_get(k, namespace_id)
.await
.expect("table should create OK")
.id;
ret.insert(k.clone(), id);
}
ret
}
}

View File

@ -2,7 +2,7 @@ use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use schema::Projection;
/// A [`Buffer`] is an internal mutable buffer wrapper over a [`MutableBatch`]
/// for the [`BufferState`] FSM.
@ -42,7 +42,7 @@ impl Buffer {
pub(super) fn snapshot(self) -> Option<Arc<RecordBatch>> {
Some(Arc::new(
self.buffer?
.to_arrow(Selection::All)
.to_arrow(Projection::All)
.expect("failed to snapshot buffer data"),
))
}

View File

@ -135,7 +135,7 @@ mod tests {
use arrow_util::assert_batches_eq;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::selection::Selection;
use schema::Projection;
use super::*;
@ -267,7 +267,7 @@ mod tests {
// Generate the combined buffer from the original inputs to compare
// against.
mb1.extend_from(&mb2).unwrap();
let want = mb1.to_arrow(Selection::All).unwrap();
let want = mb1.to_arrow(Projection::All).unwrap();
assert_eq!(&**snapshot, &want);
}

View File

@ -4,7 +4,7 @@ use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use schema::Projection;
use crate::data::partition::buffer::{
mutable_buffer::Buffer,
@ -34,7 +34,7 @@ impl Queryable for Buffering {
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
let data = self.buffer.buffer().map(|v| {
Arc::new(
v.to_arrow(Selection::All)
v.to_arrow(Projection::All)
.expect("failed to snapshot buffer data"),
)
});

View File

@ -445,7 +445,7 @@ impl<T> Drop for IngestHandlerImpl<T> {
mod tests {
use std::{num::NonZeroU32, ops::DerefMut};
use data_types::{Namespace, NamespaceSchema, Sequence, SequenceNumber};
use data_types::{Namespace, NamespaceId, NamespaceSchema, Sequence, SequenceNumber, TableId};
use dml::{DmlMeta, DmlWrite};
use iox_catalog::{mem::MemCatalog, validate_or_insert_schema};
use iox_time::Time;
@ -600,7 +600,9 @@ mod tests {
let ingest_ts1 = Time::from_timestamp_millis(42);
let write_operations = vec![DmlWrite::new(
"foo",
NamespaceId::new(1),
lines_to_batches("cpu bar=2 20", 0).unwrap(),
[("cpu".to_string(), TableId::new(1))].into_iter().collect(),
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(10)),
@ -626,7 +628,9 @@ mod tests {
let ingest_ts1 = Time::from_timestamp_millis(42);
let write_operations = vec![DmlWrite::new(
"foo",
NamespaceId::new(1),
lines_to_batches("cpu bar=2 20", 0).unwrap(),
[("cpu".to_string(), TableId::new(1))].into_iter().collect(),
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(2)),
@ -652,7 +656,9 @@ mod tests {
let ingest_ts1 = Time::from_timestamp_millis(42);
let write_operations = vec![DmlWrite::new(
"foo",
NamespaceId::new(1),
lines_to_batches("cpu bar=2 20", 0).unwrap(),
[("cpu".to_string(), TableId::new(1))].into_iter().collect(),
"1970-01-01".into(),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(2)),

View File

@ -17,6 +17,7 @@
clippy::dbg_macro
)]
mod arcmap;
pub(crate) mod compact;
pub mod data;
pub mod handler;

View File

@ -10,7 +10,7 @@ use datafusion_util::MemoryStream;
use futures::{Stream, StreamExt, TryStreamExt};
use generated_types::ingester::IngesterQueryRequest;
use observability_deps::tracing::debug;
use schema::{merge::SchemaMerger, selection::Selection};
use schema::{merge::SchemaMerger, Projection};
use snafu::{ensure, Snafu};
use trace::span::{Span, SpanRecorder};
@ -344,9 +344,9 @@ pub async fn prepare_data_to_querier(
.map(String::as_str)
.collect::<Vec<_>>();
let selection = if columns.is_empty() {
Selection::All
Projection::All
} else {
Selection::Some(columns.as_ref())
Projection::Some(columns.as_ref())
};
let snapshots = batch.project_selection(selection).into_iter().map(|batch| {
@ -666,6 +666,6 @@ mod tests {
}
fn lp_to_batch(lp: &str) -> RecordBatch {
lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap()
lp_to_mutable_batch(lp).1.to_arrow(Projection::All).unwrap()
}
}

View File

@ -6,24 +6,16 @@ use std::{any::Any, sync::Arc};
use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary};
use datafusion::{
error::DataFusionError,
physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
SendableRecordBatchStream,
},
};
use datafusion::error::DataFusionError;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::{compute_timenanosecond_min_max, create_basic_summary},
QueryChunk, QueryChunkMeta,
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use once_cell::sync::OnceCell;
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use schema::{merge::merge_record_batch_schemas, sort::SortKey, Projection, Schema};
use snafu::Snafu;
use crate::data::table::TableName;
@ -109,7 +101,7 @@ impl QueryAdaptor {
}
}
pub(crate) fn project_selection(&self, selection: Selection<'_>) -> Vec<RecordBatch> {
pub(crate) fn project_selection(&self, selection: Projection<'_>) -> Vec<RecordBatch> {
// Project the column selection across all RecordBatch
self.data
.iter()
@ -119,8 +111,8 @@ impl QueryAdaptor {
// Apply selection to in-memory batch
match selection {
Selection::All => batch.clone(),
Selection::Some(columns) => {
Projection::All => batch.clone(),
Projection::Some(columns) => {
let projection = columns
.iter()
.flat_map(|&column_name| {
@ -148,8 +140,8 @@ impl QueryAdaptor {
}
impl QueryChunkMeta for QueryAdaptor {
fn summary(&self) -> Option<Arc<TableSummary>> {
Some(Arc::clone(self.summary.get_or_init(|| {
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(self.summary.get_or_init(|| {
let ts_min_max = compute_timenanosecond_min_max(self.data.iter().map(|b| b.as_ref()))
.expect("Should have time range");
@ -158,7 +150,7 @@ impl QueryChunkMeta for QueryAdaptor {
&self.schema(),
ts_min_max,
))
})))
}))
}
fn schema(&self) -> Arc<Schema> {
@ -211,7 +203,7 @@ impl QueryChunk for QueryAdaptor {
&self,
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
_columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -230,42 +222,15 @@ impl QueryChunk for QueryAdaptor {
Ok(None)
}
/// Provides access to raw `QueryChunk` data as an
/// asynchronous stream of `RecordBatch`es
fn read_filter(
&self,
mut ctx: IOxSessionContext,
_predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "ingester");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
fn data(&self) -> QueryChunkData {
let schema = self.schema().as_arrow();
let schema = self
.schema()
.select(selection)
.context(SchemaSnafu)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Apply the projection over all the data in self, ensuring each batch
// has the specified schema.
let batches = self
.project_selection(selection)
.into_iter()
.map(|batch| {
ensure_schema(&schema.as_arrow(), &batch)
.context(ConcatBatchesSnafu {})
.map(Arc::new)
})
.collect::<Result<Vec<_>, _>>()
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Return stream of data
let dummy_metrics = ExecutionPlanMetricsSet::new();
let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0);
let stream = SizedRecordBatchStream::new(schema.as_arrow(), batches, mem_metrics);
Ok(Box::pin(stream))
QueryChunkData::RecordBatches(
self.data
.iter()
.map(|b| ensure_schema(&schema, b).expect("schema handling broken"))
.collect(),
)
}
/// Returns chunk type

View File

@ -481,7 +481,7 @@ mod tests {
use data_types::PartitionId;
use futures::StreamExt;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::selection::Selection;
use schema::Projection;
use crate::querier_handler::PartitionStatus;
@ -496,7 +496,7 @@ mod tests {
async fn test_get_stream_all_types() {
let batch = lp_to_mutable_batch("table z=1 0")
.1
.to_arrow(Selection::All)
.to_arrow(Projection::All)
.unwrap();
let schema = batch.schema();
@ -572,7 +572,7 @@ mod tests {
async fn test_get_stream_dictionary_batches() {
let batch = lp_to_mutable_batch("table,x=\"foo\",y=\"bar\" z=1 0")
.1
.to_arrow(Selection::All)
.to_arrow(Projection::All)
.unwrap();
assert_get_stream(

View File

@ -514,7 +514,7 @@ mod tests {
use assert_matches::assert_matches;
use async_trait::async_trait;
use data_types::{DeletePredicate, Sequence, TimestampRange};
use data_types::{DeletePredicate, NamespaceId, Sequence, TableId, TimestampRange};
use dml::{DmlDelete, DmlMeta, DmlWrite};
use futures::stream::{self, BoxStream};
use iox_time::{SystemProvider, Time};
@ -539,6 +539,11 @@ mod tests {
// Return a DmlWrite with the given namespace and a single table.
fn make_write(name: impl Into<String>, write_time: u64) -> DmlWrite {
let tables = lines_to_batches("bananas level=42 4242", 0).unwrap();
let ids = tables
.keys()
.enumerate()
.map(|(i, v)| (v.clone(), TableId::new(i as _)))
.collect();
let sequence = DmlMeta::sequenced(
Sequence::new(ShardIndex::new(1), SequenceNumber::new(2)),
TEST_TIME
@ -547,7 +552,14 @@ mod tests {
None,
42,
);
DmlWrite::new(name, tables, "1970-01-01".into(), sequence)
DmlWrite::new(
name,
NamespaceId::new(42),
tables,
ids,
"1970-01-01".into(),
sequence,
)
}
// Return a DmlDelete with the given namespace.

View File

@ -239,7 +239,7 @@ mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use data_types::{Sequence, SequenceNumber};
use data_types::{NamespaceId, Sequence, SequenceNumber, TableId};
use dml::{DmlMeta, DmlWrite};
use iox_time::Time;
use metric::{Metric, MetricObserver, Observation};
@ -272,7 +272,19 @@ mod tests {
/// Return a DmlWrite with the given metadata and a single table.
fn make_write(meta: DmlMeta) -> DmlWrite {
let tables = lines_to_batches("bananas level=42 4242", 0).unwrap();
DmlWrite::new("bananas", tables, "1970-01-01".into(), meta)
let ids = tables
.keys()
.enumerate()
.map(|(i, v)| (v.clone(), TableId::new(i as _)))
.collect();
DmlWrite::new(
"bananas",
NamespaceId::new(42),
tables,
ids,
"1970-01-01".into(),
meta,
)
}
/// Extract the metric with the given name from `metrics`.

View File

@ -572,9 +572,17 @@ pub(crate) fn make_write_op(
sequence_number: i64,
lines: &str,
) -> DmlWrite {
let tables = lines_to_batches(lines, 0).unwrap();
let ids = tables
.keys()
.enumerate()
.map(|(i, v)| (v.clone(), TableId::new(i as _)))
.collect();
DmlWrite::new(
namespace.to_string(),
lines_to_batches(lines, 0).unwrap(),
NamespaceId::new(42),
tables,
ids,
partition_key.clone(),
DmlMeta::sequenced(
Sequence {

View File

@ -5,6 +5,7 @@ use data_types::{
ShardIndex, TopicId,
};
use dml::{DmlMeta, DmlWrite};
use futures::{stream::FuturesUnordered, StreamExt};
use generated_types::ingester::IngesterQueryRequest;
use ingester::{
handler::{IngestHandler, IngestHandlerImpl},
@ -261,9 +262,42 @@ impl TestContext {
partition_key: PartitionKey,
sequence_number: i64,
) -> SequenceNumber {
// Resolve the namespace ID needed to construct the DML op
let namespace_id = self
.namespaces
.get(namespace)
.expect("namespace does not exist")
.id;
// Build the TableName -> TableId map, upserting the tables in the
// process.
let ids = lines_to_batches(lp, 0)
.unwrap()
.keys()
.map(|v| {
let catalog = Arc::clone(&self.catalog);
async move {
let id = catalog
.repositories()
.await
.tables()
.create_or_get(v, namespace_id)
.await
.expect("table should create OK")
.id;
(v.clone(), id)
}
})
.collect::<FuturesUnordered<_>>()
.collect::<hashbrown::HashMap<_, _>>()
.await;
self.enqueue_write(DmlWrite::new(
namespace,
namespace_id,
lines_to_batches(lp, 0).unwrap(),
ids,
partition_key,
DmlMeta::sequenced(
Sequence::new(TEST_SHARD_INDEX, SequenceNumber::new(sequence_number)),

View File

@ -0,0 +1,6 @@
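-- Add a nullable retention_period_ns column to namespace and a nullable
-- to_delete column to partition.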
ALTER TABLE IF EXISTS namespace
ADD COLUMN IF NOT EXISTS retention_period_ns BIGINT DEFAULT NULL;
ALTER TABLE IF EXISTS partition
ADD COLUMN IF NOT EXISTS to_delete BIGINT DEFAULT NULL;

View File

@ -7,7 +7,7 @@ use futures::stream;
use influxdb2_client::models::WriteDataPoint;
use mutable_batch_lp::lines_to_batches;
use parquet_file::{metadata::IoxMetadata, serialize};
use schema::selection::Selection;
use schema::Projection;
use snafu::{ensure, ResultExt, Snafu};
#[cfg(test)]
use std::{collections::BTreeMap, sync::Arc};
@ -349,7 +349,7 @@ impl InnerPointsWriter {
for (measurement, batch) in batches_by_measurement {
let record_batch = batch
.to_arrow(Selection::All)
.to_arrow(Projection::All)
.context(ConvertToArrowSnafu)?;
let stream = Box::pin(MemoryStream::new(vec![record_batch]));

View File

@ -10,7 +10,6 @@ mod schema_pivot;
pub mod seriesset;
pub(crate) mod split;
pub mod stringset;
pub use context::{DEFAULT_CATALOG, DEFAULT_SCHEMA};
use executor::DedicatedExecutor;
use object_store::DynObjectStore;
use parquet_file::storage::StorageId;

View File

@ -27,7 +27,6 @@ use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
use datafusion::{
catalog::catalog::CatalogProvider,
config::OPT_COALESCE_TARGET_BATCH_SIZE,
execution::{
context::{QueryPlanner, SessionState, TaskContext},
runtime_env::RuntimeEnv,
@ -41,10 +40,10 @@ use datafusion::{
},
prelude::*,
};
use datafusion_util::config::{iox_session_config, DEFAULT_CATALOG};
use executor::DedicatedExecutor;
use futures::TryStreamExt;
use observability_deps::tracing::debug;
use parquet_file::serialize::ROW_GROUP_WRITE_SIZE;
use query_functions::selectors::register_selector_aggregates;
use std::{convert::TryInto, fmt, sync::Arc};
use trace::{
@ -55,11 +54,6 @@ use trace::{
// Reuse DataFusion error and Result types for this module
pub use datafusion::error::{DataFusionError as Error, Result};
// The default catalog name - this impacts what SQL queries use if not specified
pub const DEFAULT_CATALOG: &str = "public";
// The default schema name - this impacts what SQL queries use if not specified
pub const DEFAULT_SCHEMA: &str = "iox";
/// This structure implements the DataFusion notion of "query planner"
/// and is needed to create plans with the IOx extension nodes.
struct IOxQueryPlanner {}
@ -175,26 +169,9 @@ impl fmt::Debug for IOxSessionConfig {
}
}
const BATCH_SIZE: usize = 8 * 1024;
const COALESCE_BATCH_SIZE: usize = BATCH_SIZE / 2;
// ensure read and write work well together
// Skip clippy due to <https://github.com/rust-lang/rust-clippy/issues/8159>.
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(ROW_GROUP_WRITE_SIZE % BATCH_SIZE == 0);
impl IOxSessionConfig {
pub(super) fn new(exec: DedicatedExecutor, runtime: Arc<RuntimeEnv>) -> Self {
let session_config = SessionConfig::new()
.with_batch_size(BATCH_SIZE)
// TODO add function in SessionCofig
.set_u64(
OPT_COALESCE_TARGET_BATCH_SIZE,
COALESCE_BATCH_SIZE.try_into().unwrap(),
)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA);
let session_config = iox_session_config();
Self {
exec,

View File

@ -31,7 +31,7 @@ use query_functions::{
make_window_bound_expr,
selectors::{selector_first, selector_last, selector_max, selector_min, SelectorOutput},
};
use schema::{selection::Selection, InfluxColumnType, Schema, TIME_COLUMN_NAME};
use schema::{InfluxColumnType, Projection, Schema, TIME_COLUMN_NAME};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::collections::HashSet as StdHashSet;
use std::{cmp::Reverse, collections::BTreeSet, sync::Arc};
@ -387,7 +387,7 @@ impl InfluxRpcPlanner {
.map(|f| f.name().as_str())
.collect::<Vec<&str>>();
let selection = Selection::Some(&column_names);
let selection = Projection::Some(&column_names);
// If there are delete predicates, we need to scan (or do full plan) the data to eliminate
// deleted data before getting tag keys

View File

@ -10,17 +10,18 @@
clippy::dbg_macro
)]
use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
use data_types::{ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion::{error::DataFusionError, prelude::SessionContext};
use exec::{stringset::StringSet, IOxSessionContext};
use hashbrown::HashMap;
use observability_deps::tracing::{debug, trace};
use parquet_file::storage::ParquetExecInput;
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate, PredicateMatch};
use schema::{
selection::Selection,
sort::{SortKey, SortKeyBuilder},
Schema, TIME_COLUMN_NAME,
Projection, Schema, TIME_COLUMN_NAME,
};
use std::{any::Any, collections::BTreeSet, fmt::Debug, iter::FromIterator, sync::Arc};
@ -32,7 +33,6 @@ pub mod pruning;
pub mod statistics;
pub mod util;
pub use exec::context::{DEFAULT_CATALOG, DEFAULT_SCHEMA};
pub use frontend::common::ScanPlanBuilder;
pub use query_functions::group_by::{Aggregate, WindowDuration};
@ -40,7 +40,7 @@ pub use query_functions::group_by::{Aggregate, WindowDuration};
/// metadata
pub trait QueryChunkMeta {
/// Return a summary of the data
fn summary(&self) -> Option<Arc<TableSummary>>;
fn summary(&self) -> Arc<TableSummary>;
/// Return a reference to the schema of the data held in this chunk
fn schema(&self) -> Arc<Schema>;
@ -174,6 +174,37 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
fn as_meta(&self) -> &dyn QueryDatabaseMeta;
}
/// Raw data of a [`QueryChunk`].
#[derive(Debug)]
pub enum QueryChunkData {
/// In-memory record batches.
///
/// **IMPORTANT: All batches MUST have the schema that the [chunk reports](QueryChunkMeta::schema).**
RecordBatches(Vec<RecordBatch>),
/// Parquet file.
///
/// See [`ParquetExecInput`] for details.
Parquet(ParquetExecInput),
}
impl QueryChunkData {
/// Read data into [`RecordBatch`]es. This is mostly meant for testing!
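///
/// Illustrative use from a test (`chunk` is a `QueryChunk`, `ctx` an
/// `IOxSessionContext`):
///
/// ```ignore
/// let batches = chunk.data().read_to_batches(chunk.schema(), ctx.inner()).await;
/// ```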
pub async fn read_to_batches(
self,
schema: Arc<Schema>,
session_ctx: &SessionContext,
) -> Vec<RecordBatch> {
match self {
Self::RecordBatches(batches) => batches,
Self::Parquet(exec_input) => exec_input
.read_to_batches(schema.as_arrow(), Projection::All, session_ctx)
.await
.unwrap(),
}
}
}
/// Collection of data that shares the same partition key
pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
/// returns the Id of this chunk. Ids are unique within a
@ -197,10 +228,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, DataFusionError> {
Ok(self
.summary()
.map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow()))
.unwrap_or(PredicateMatch::Unknown))
Ok(predicate.apply_to_table_summary(&self.summary(), self.schema().as_arrow()))
}
/// Returns a set of Strings with column names from the specified
@ -211,7 +239,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
&self,
ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError>;
/// Return a set of Strings containing the distinct values in the
@ -226,25 +254,10 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError>;
/// Provides access to raw `QueryChunk` data as an
/// asynchronous stream of `RecordBatch`es filtered by a *required*
/// predicate. Note that not all chunks can evaluate all types of
/// predicates and this function will return an error
/// if requested to evaluate with a predicate that is not supported
/// Provides access to raw [`QueryChunk`] data.
///
/// This is the analog of the `TableProvider` in DataFusion
///
/// The reason we can't simply use the `TableProvider` trait
/// directly is that the data for a particular Table lives in
/// several chunks within a partition, so there needs to be an
/// implementation of `TableProvider` that stitches together the
/// streams from several different `QueryChunk`s.
fn read_filter(
&self,
ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, DataFusionError>;
/// The engine assumes that minimal work is required to gather the `QueryChunkData`.
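///
/// A minimal in-memory sketch (the `batches` field is illustrative; compare
/// `TestChunk::data` for a concrete test implementation):
///
/// ```ignore
/// fn data(&self) -> QueryChunkData {
///     // Batches must already match `QueryChunkMeta::schema`.
///     QueryChunkData::RecordBatches(self.batches.clone())
/// }
/// ```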
fn data(&self) -> QueryChunkData;
/// Returns chunk type. Useful in tests and debug logs.
fn chunk_type(&self) -> &str;
@ -261,7 +274,7 @@ impl<P> QueryChunkMeta for Arc<P>
where
P: QueryChunkMeta,
{
fn summary(&self) -> Option<Arc<TableSummary>> {
fn summary(&self) -> Arc<TableSummary> {
self.as_ref().summary()
}
@ -290,7 +303,7 @@ where
/// Implement ChunkMeta for Arc<dyn QueryChunk>
impl QueryChunkMeta for Arc<dyn QueryChunk> {
fn summary(&self) -> Option<Arc<TableSummary>> {
fn summary(&self) -> Arc<TableSummary> {
self.as_ref().summary()
}
@ -317,26 +330,32 @@ impl QueryChunkMeta for Arc<dyn QueryChunk> {
}
}
/// return true if all the chunks include statistics
pub fn chunks_have_stats<'a>(chunks: impl IntoIterator<Item = &'a Arc<dyn QueryChunk>>) -> bool {
/// return true if all the chunks include distinct counts for all columns.
pub fn chunks_have_distinct_counts<'a>(
chunks: impl IntoIterator<Item = &'a Arc<dyn QueryChunk>>,
) -> bool {
// If at least one of the provided chunks cannot provide distinct
// counts, we do not need to compute potential duplicates; we treat
// all of them as potentially containing duplicates.
chunks.into_iter().all(|c| c.summary().is_some())
chunks.into_iter().all(|chunk| {
chunk
.summary()
.columns
.iter()
.all(|col| col.stats.distinct_count().is_some())
})
}
pub fn compute_sort_key_for_chunks<'a>(
schema: &Schema,
chunks: impl Copy + IntoIterator<Item = &'a Arc<dyn QueryChunk>>,
) -> SortKey {
if !chunks_have_stats(chunks) {
if !chunks_have_distinct_counts(chunks) {
// chunks do not have enough stats; return the primary key sorted
// lexicographically, with the time column always last
SortKey::from_columns(schema.primary_key())
} else {
let summaries = chunks
.into_iter()
.map(|x| x.summary().expect("Chunk should have summary"));
let summaries = chunks.into_iter().map(|x| x.summary());
compute_sort_key(summaries)
}
}

View File

@ -37,12 +37,15 @@ use snafu::{ResultExt, Snafu};
mod adapter;
mod deduplicate;
mod metrics;
pub mod overlap;
mod physical;
use self::overlap::group_potential_duplicates;
pub use deduplicate::{DeduplicateExec, RecordBatchDeduplicator};
pub(crate) use physical::IOxReadFilterNode;
pub use metrics::parquet_metrics;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display(

View File

@ -0,0 +1,36 @@
use std::sync::Arc;
use datafusion::physical_plan::{file_format::ParquetExec, metrics::MetricsSet, ExecutionPlan};
use super::IOxReadFilterNode;
/// Recursively retrieve metrics from all `ParquetExec`s in `plan`
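///
/// Illustrative use (`physical_plan` is any `Arc<dyn ExecutionPlan>`):
///
/// ```ignore
/// let all_parquet_metrics = parquet_metrics(Arc::clone(&physical_plan));
/// ```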
pub fn parquet_metrics(plan: Arc<dyn ExecutionPlan>) -> Vec<MetricsSet> {
let mut output = vec![];
parquet_metrics_impl(plan, &mut output);
output
}
fn parquet_metrics_impl(plan: Arc<dyn ExecutionPlan>, output: &mut Vec<MetricsSet>) {
// Temporarily need to special-case `IOxReadFilterNode` as it
// may create `ParquetExec` during execution.
//
// This can be removed when
// <https://github.com/influxdata/influxdb_iox/issues/5897> is
// completed
if let Some(iox_read_node) = plan.as_any().downcast_ref::<IOxReadFilterNode>() {
if let Some(metrics) = iox_read_node.metrics() {
output.push(metrics)
}
}
if let Some(parquet) = plan.as_any().downcast_ref::<ParquetExec>() {
if let Some(metrics) = parquet.metrics() {
output.push(metrics)
}
}
for child in plan.children() {
parquet_metrics_impl(child, output)
}
}

View File

@ -112,7 +112,7 @@ pub fn group_potential_duplicates(
}
fn timestamp_min_max(chunk: &dyn QueryChunk) -> Option<TimestampMinMax> {
chunk.summary().and_then(|summary| summary.time_range())
chunk.summary().time_range()
}
#[cfg(test)]

View File

@ -1,22 +1,29 @@
//! Implementation of a DataFusion PhysicalPlan node across partition chunks
use super::adapter::SchemaAdapterStream;
use crate::{exec::IOxSessionContext, QueryChunk};
use crate::{exec::IOxSessionContext, QueryChunk, QueryChunkData};
use arrow::datatypes::SchemaRef;
use data_types::TableSummary;
use datafusion::{
datasource::listing::PartitionedFile,
error::DataFusionError,
execution::context::TaskContext,
physical_plan::{
execute_stream,
expressions::PhysicalSortExpr,
file_format::{FileScanConfig, ParquetExec},
memory::MemoryStream,
metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
stream::RecordBatchStreamAdapter,
DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics,
},
};
use futures::TryStreamExt;
use observability_deps::tracing::trace;
use parking_lot::Mutex;
use predicate::Predicate;
use schema::{selection::Selection, Schema};
use std::{fmt, sync::Arc};
use schema::Schema;
use std::{collections::HashSet, fmt, sync::Arc};
/// Implements the DataFusion physical plan interface
#[derive(Debug)]
@ -27,9 +34,18 @@ pub(crate) struct IOxReadFilterNode {
iox_schema: Arc<Schema>,
chunks: Vec<Arc<dyn QueryChunk>>,
predicate: Predicate,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
/// remember all ParquetExecs created by this node so we can pass
/// along metrics.
///
/// When we use ParquetExec directly (rather
/// than an IOxReadFilterNode) the metric will be directly
/// available: <https://github.com/influxdata/influxdb_iox/issues/5897>
parquet_execs: Mutex<Vec<Arc<ParquetExec>>>,
// execution context used for tracing
ctx: IOxSessionContext,
}
@ -46,14 +62,21 @@ impl IOxReadFilterNode {
predicate: Predicate,
) -> Self {
Self {
ctx,
table_name,
iox_schema,
chunks,
predicate,
metrics: ExecutionPlanMetricsSet::new(),
parquet_execs: Mutex::new(vec![]),
ctx,
}
}
// Meant for testing -- provide access to the inner parquet execs
// that were created
fn parquet_execs(&self) -> Vec<Arc<ParquetExec>> {
self.parquet_execs.lock().to_vec()
}
}
impl ExecutionPlan for IOxReadFilterNode {
@ -95,6 +118,7 @@ impl ExecutionPlan for IOxReadFilterNode {
iox_schema: Arc::clone(&self.iox_schema),
chunks,
predicate: self.predicate.clone(),
parquet_execs: Mutex::new(self.parquet_execs()),
metrics: ExecutionPlanMetricsSet::new(),
};
@ -104,16 +128,13 @@ impl ExecutionPlan for IOxReadFilterNode {
fn execute(
&self,
partition: usize,
_context: Arc<TaskContext>,
context: Arc<TaskContext>,
) -> datafusion::error::Result<SendableRecordBatchStream> {
trace!(partition, "Start IOxReadFilterNode::execute");
let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
let timer = baseline_metrics.elapsed_compute().timer();
let schema = self.schema();
let fields = schema.fields();
let selection_cols = fields.iter().map(|f| f.name() as &str).collect::<Vec<_>>();
let chunk = Arc::clone(&self.chunks[partition]);
@ -125,32 +146,88 @@ impl ExecutionPlan for IOxReadFilterNode {
// restrict the requested selection to the actual columns
// available, and use SchemaAdapterStream to pad the rest of
// the columns with NULLs if necessary
let selection_cols = restrict_selection(selection_cols, &chunk_table_schema);
let selection = Selection::Some(&selection_cols);
let final_output_column_names: HashSet<_> =
schema.fields().iter().map(|f| f.name()).collect();
let projection: Vec<_> = chunk_table_schema
.iter()
.enumerate()
.filter(|(_idx, (_t, field))| final_output_column_names.contains(field.name()))
.map(|(idx, _)| idx)
.collect();
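// Only pass a projection when it actually drops or reorders columns; a
// full, in-order projection is equivalent to `None`.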
let projection = (!((projection.len() == chunk_table_schema.len())
&& (projection.iter().enumerate().all(|(a, b)| a == *b))))
.then_some(projection);
let incomplete_output_schema = projection
.as_ref()
.map(|projection| {
Arc::new(
chunk_table_schema
.as_arrow()
.project(projection)
.expect("projection broken"),
)
})
.unwrap_or_else(|| chunk_table_schema.as_arrow());
let stream = chunk
.read_filter(
self.ctx.child_ctx("chunk read_filter"),
&self.predicate,
selection,
)
.map_err(|e| {
DataFusionError::Execution(format!(
"Error creating scan for table {} chunk {}: {}",
self.table_name,
chunk.id(),
e
))
})?;
let stream = match chunk.data() {
QueryChunkData::RecordBatches(batches) => {
let stream = Box::pin(MemoryStream::try_new(
batches,
incomplete_output_schema,
projection,
)?);
let adapter = SchemaAdapterStream::try_new(stream, schema, baseline_metrics)
.map_err(|e| DataFusionError::Internal(e.to_string()))?;
Box::pin(adapter) as SendableRecordBatchStream
}
QueryChunkData::Parquet(exec_input) => {
let base_config = FileScanConfig {
object_store_url: exec_input.object_store_url,
file_schema: Arc::clone(&schema),
file_groups: vec![vec![PartitionedFile {
object_meta: exec_input.object_meta,
partition_values: vec![],
range: None,
extensions: None,
}]],
statistics: Statistics::default(),
projection: None,
limit: None,
table_partition_cols: vec![],
config_options: context.session_config().config_options(),
};
let delete_predicates: Vec<_> = chunk
.delete_predicates()
.iter()
.map(|pred| Arc::new(pred.as_ref().clone().into()))
.collect();
let predicate = self
.predicate
.clone()
.with_delete_predicates(&delete_predicates);
let metadata_size_hint = None;
// all CPU time is now done, pass in baseline metrics to adapter
timer.done();
let exec = Arc::new(ParquetExec::new(
base_config,
predicate.filter_expr(),
metadata_size_hint,
));
let adapter = SchemaAdapterStream::try_new(stream, schema, baseline_metrics)
.map_err(|e| DataFusionError::Internal(e.to_string()))?;
self.parquet_execs.lock().push(Arc::clone(&exec));
let stream = RecordBatchStreamAdapter::new(
schema,
futures::stream::once(execute_stream(exec, context)).try_flatten(),
);
// Note: No SchemaAdapterStream required here because `ParquetExec` already creates NULL columns for us.
Box::pin(stream)
}
};
trace!(partition, "End IOxReadFilterNode::execute");
Ok(Box::pin(adapter))
Ok(stream)
}
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@ -168,23 +245,27 @@ impl ExecutionPlan for IOxReadFilterNode {
}
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
let mut metrics = self.metrics.clone_inner();
// copy all metrics from the child parquet_execs
for exec in self.parquet_execs() {
if let Some(parquet_metrics) = exec.metrics() {
for m in parquet_metrics.iter() {
metrics.push(Arc::clone(m))
}
}
}
Some(metrics)
}
fn statistics(&self) -> Statistics {
let mut combined_summary_option: Option<TableSummary> = None;
for chunk in &self.chunks {
combined_summary_option = match combined_summary_option {
None => Some(
chunk
.summary()
.expect("Chunk should have summary")
.as_ref()
.clone(),
),
None => Some(chunk.summary().as_ref().clone()),
Some(mut combined_summary) => {
combined_summary
.update_from(&chunk.summary().expect("Chunk should have summary"));
combined_summary.update_from(&chunk.summary());
Some(combined_summary)
}
}
@ -197,17 +278,3 @@ impl ExecutionPlan for IOxReadFilterNode {
.unwrap_or_default()
}
}
/// Removes any columns that are not present in schema, returning a possibly
/// restricted set of columns
fn restrict_selection<'a>(
selection_cols: Vec<&'a str>,
chunk_table_schema: &'a Schema,
) -> Vec<&'a str> {
let arrow_schema = chunk_table_schema.as_arrow();
selection_cols
.into_iter()
.filter(|col| arrow_schema.fields().iter().any(|f| f.name() == col))
.collect()
}

View File

@ -87,7 +87,7 @@ pub fn prune_chunks(
/// `false` for every single row.
pub fn prune_summaries(
table_schema: Arc<Schema>,
summaries: &Vec<Option<Arc<TableSummary>>>,
summaries: &Vec<Arc<TableSummary>>,
predicate: &Predicate,
) -> Result<Vec<bool>, NotPrunedReason> {
let filter_expr = match predicate.filter_expr() {
@ -127,7 +127,7 @@ pub fn prune_summaries(
/// interface required by [`PruningPredicate`]
struct ChunkPruningStatistics<'a> {
table_schema: &'a Schema,
summaries: &'a Vec<Option<Arc<TableSummary>>>,
summaries: &'a Vec<Arc<TableSummary>>,
}
impl<'a> ChunkPruningStatistics<'a> {
@ -143,10 +143,9 @@ impl<'a> ChunkPruningStatistics<'a> {
&'c self,
column: &'b Column,
) -> impl Iterator<Item = Option<Statistics>> + 'a {
self.summaries.iter().map(|summary| match summary {
Some(summary) => Some(summary.column(&column.name)?.stats.clone()),
None => None,
})
self.summaries
.iter()
.map(|summary| Some(summary.column(&column.name)?.stats.clone()))
}
}

View File

@ -8,15 +8,14 @@ use crate::{
stringset::{StringSet, StringSetRef},
ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext,
},
Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase,
QueryText,
Predicate, PredicateMatch, QueryChunk, QueryChunkData, QueryChunkMeta, QueryCompletedToken,
QueryDatabase, QueryText,
};
use arrow::{
array::{
ArrayRef, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray, UInt64Array,
},
datatypes::{DataType, Int32Type, TimeUnit},
error::ArrowError,
record_batch::RecordBatch,
};
use async_trait::async_trait;
@ -24,16 +23,14 @@ use data_types::{
ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues,
Statistics, TableSummary,
};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion_util::stream_from_batches;
use futures::StreamExt;
use datafusion::error::DataFusionError;
use hashbrown::HashSet;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use predicate::rpc_predicate::QueryDatabaseMeta;
use schema::{
builder::SchemaBuilder, merge::SchemaMerger, selection::Selection, sort::SortKey,
InfluxColumnType, Schema, TIME_COLUMN_NAME,
builder::SchemaBuilder, merge::SchemaMerger, sort::SortKey, InfluxColumnType, Projection,
Schema, TIME_COLUMN_NAME,
};
use std::{any::Any, collections::BTreeMap, fmt, num::NonZeroU64, sync::Arc};
use trace::ctx::SpanContext;
@ -949,34 +946,8 @@ impl QueryChunk for TestChunk {
self.may_contain_pk_duplicates
}
fn read_filter(
&self,
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, DataFusionError> {
self.check_error()?;
// save the predicate
self.predicates.lock().push(predicate.clone());
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.table_data.clone(),
Some(projection) => self
.table_data
.iter()
.map(|batch| {
let batch = batch.project(&projection)?;
Ok(Arc::new(batch))
})
.collect::<std::result::Result<Vec<_>, ArrowError>>()?,
};
Ok(stream_from_batches(self.schema().as_arrow(), batches))
fn data(&self) -> QueryChunkData {
QueryChunkData::RecordBatches(self.table_data.iter().map(|b| b.as_ref().clone()).collect())
}
fn chunk_type(&self) -> &str {
@ -1014,7 +985,7 @@ impl QueryChunk for TestChunk {
&self,
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
selection: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
self.check_error()?;
@ -1023,8 +994,8 @@ impl QueryChunk for TestChunk {
// only return columns specified in selection
let column_names = match selection {
Selection::All => self.all_column_names(),
Selection::Some(cols) => self.specific_column_names_selection(cols),
Projection::All => self.all_column_names(),
Projection::Some(cols) => self.specific_column_names_selection(cols),
};
Ok(Some(column_names))
@ -1040,8 +1011,8 @@ impl QueryChunk for TestChunk {
}
impl QueryChunkMeta for TestChunk {
fn summary(&self) -> Option<Arc<TableSummary>> {
Some(Arc::new(self.table_summary.clone()))
fn summary(&self) -> Arc<TableSummary> {
Arc::new(self.table_summary.clone())
}
fn schema(&self) -> Arc<Schema> {
@ -1071,17 +1042,10 @@ impl QueryChunkMeta for TestChunk {
/// Return the raw data from the list of chunks
pub async fn raw_data(chunks: &[Arc<dyn QueryChunk>]) -> Vec<RecordBatch> {
let ctx = IOxSessionContext::with_testing();
let mut batches = vec![];
for c in chunks {
let pred = Predicate::default();
let selection = Selection::All;
let mut stream = c
.read_filter(IOxSessionContext::with_testing(), &pred, selection)
.expect("Error in read_filter");
while let Some(b) = stream.next().await {
let b = b.expect("Error in stream");
batches.push(b)
}
batches.append(&mut c.data().read_to_batches(c.schema(), ctx.inner()).await);
}
batches
}

View File

@ -25,7 +25,7 @@ use datafusion::{
logical_expr::{
expr_rewriter::ExprRewriter, BinaryExpr, ExprSchemable, LogicalPlan, LogicalPlanBuilder,
},
optimizer::expr_simplifier::{ExprSimplifier, SimplifyContext},
optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext},
physical_expr::create_physical_expr,
physical_plan::{
expressions::{col as physical_col, PhysicalSortExpr},

View File

@ -18,7 +18,7 @@ metric = { path = "../metric" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
parquet_file = { path = "../parquet_file" }
predicate = { path = "../predicate" }
iox_query = { path = "../iox_query" }

View File

@ -30,11 +30,9 @@ use parquet_file::{
metadata::IoxMetadata,
storage::{ParquetStorage, StorageId},
};
use predicate::Predicate;
use schema::{
selection::Selection,
sort::{adjust_sort_key_columns, compute_sort_key, SortKey},
Schema,
Projection, Schema,
};
use std::{collections::HashMap, sync::Arc};
use uuid::Uuid;
@ -389,14 +387,13 @@ impl TestTable {
Arc::new(schema),
self.catalog.parquet_store.clone(),
);
let rx = chunk
.read_filter(
&Predicate::default(),
Selection::All,
chunk
.parquet_exec_input()
.read_to_batches(
chunk.schema().as_arrow(),
Projection::All,
&chunk.store().test_df_context(),
)
.unwrap();
datafusion::physical_plan::common::collect(rx)
.await
.unwrap()
}
@ -753,8 +750,8 @@ impl TestParquetFileBuilder {
pub fn with_line_protocol(self, line_protocol: &str) -> Self {
let (table, batch) = lp_to_mutable_batch(line_protocol);
let schema = batch.schema(Selection::All).unwrap();
let record_batch = batch.to_arrow(Selection::All).unwrap();
let schema = batch.schema(Projection::All).unwrap();
let record_batch = batch.to_arrow(Projection::All).unwrap();
self.with_record_batch(record_batch)
.with_table(table)

View File

@ -95,6 +95,7 @@ impl<C: QuerierHandler + std::fmt::Debug + 'static> ServerType for QuerierServer
);
add_service!(builder, self.server.handler().schema_service());
add_service!(builder, self.server.handler().catalog_service());
add_service!(builder, self.server.handler().object_store_service());
serve_builder!(builder);
@ -204,7 +205,11 @@ pub async fn create_querier_server_type(
)
.await?,
);
let querier_handler = Arc::new(QuerierHandlerImpl::new(args.catalog, Arc::clone(&database)));
let querier_handler = Arc::new(QuerierHandlerImpl::new(
args.catalog,
Arc::clone(&database),
Arc::clone(&args.object_store),
));
let querier = QuerierServer::new(args.metric_registry, querier_handler);
Ok(Arc::new(QuerierServerType::new(

View File

@ -289,12 +289,11 @@ pub async fn create_router_server_type(
let shard_service = init_shard_service(sharder, write_buffer_config, catalog).await?;
// Initialise the API delegates
let handler_stack = Arc::new(handler_stack);
let http = HttpDelegate::new(
common_state.run_config().max_http_request_size,
request_limit,
namespace_resolver,
Arc::clone(&handler_stack),
handler_stack,
&metrics,
);
let grpc = GrpcDelegate::new(schema_catalog, object_store, shard_service);

View File

@ -12,7 +12,7 @@ tracing-subscriber = "0.3"
workspace-hack = { path = "../workspace-hack"}
[dev-dependencies] # In alphabetical order
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
parking_lot = "0.12"
regex = "1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

View File

@ -22,7 +22,7 @@ use arrow::record_batch::RecordBatch;
use data_types::StatValues;
use hashbrown::HashMap;
use iox_time::Time;
use schema::selection::Selection;
use schema::Projection;
use schema::{builder::SchemaBuilder, Schema, TIME_COLUMN_NAME};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{collections::BTreeSet, ops::Range};
@ -85,10 +85,10 @@ impl MutableBatch {
/// Returns the schema for a given selection
///
/// If Projection::All the returned columns are sorted by name
pub fn schema(&self, selection: Selection<'_>) -> Result<Schema> {
pub fn schema(&self, selection: Projection<'_>) -> Result<Schema> {
let mut schema_builder = SchemaBuilder::new();
let schema = match selection {
Selection::All => {
Projection::All => {
for (column_name, column_idx) in self.column_names.iter() {
let column = &self.columns[*column_idx];
schema_builder.influx_column(column_name, column.influx_type());
@ -99,7 +99,7 @@ impl MutableBatch {
.context(InternalSchemaSnafu)?
.sort_fields_by_name()
}
Selection::Some(cols) => {
Projection::Some(cols) => {
for col in cols {
let column = self.column(col)?;
schema_builder.influx_column(col, column.influx_type());
@ -112,7 +112,7 @@ impl MutableBatch {
}
/// Convert all the data in this `MutableBatch` into a `RecordBatch`
pub fn to_arrow(&self, selection: Selection<'_>) -> Result<RecordBatch> {
pub fn to_arrow(&self, selection: Projection<'_>) -> Result<RecordBatch> {
let schema = self.schema(selection)?;
let columns = schema
.iter()

View File

@ -1,7 +1,7 @@
use arrow_util::assert_batches_eq;
use data_types::{StatValues, Statistics};
use mutable_batch::{writer::Writer, MutableBatch};
use schema::selection::Selection;
use schema::Projection;
use std::{collections::BTreeMap, num::NonZeroU64};
#[test]
@ -56,7 +56,7 @@ fn test_extend() {
writer.commit();
let a_before = a.to_arrow(Selection::All).unwrap();
let a_before = a.to_arrow(Projection::All).unwrap();
a.extend_from(&b).unwrap();
@ -90,7 +90,7 @@ fn test_extend() {
"| v1 | v5 | 1970-01-01T00:00:00.000000012Z |",
"+------+------+--------------------------------+",
],
&[b.to_arrow(Selection::All).unwrap()]
&[b.to_arrow(Projection::All).unwrap()]
);
assert_batches_eq!(
@ -113,7 +113,7 @@ fn test_extend() {
"| v1 | | v5 | 1970-01-01T00:00:00.000000012Z |",
"+------+------+------+--------------------------------+",
],
&[a.to_arrow(Selection::All).unwrap()]
&[a.to_arrow(Projection::All).unwrap()]
);
let stats: BTreeMap<_, _> = a.columns().map(|(k, v)| (k.as_str(), v.stats())).collect();

View File

@ -1,7 +1,7 @@
use arrow_util::assert_batches_eq;
use data_types::{StatValues, Statistics};
use mutable_batch::{writer::Writer, MutableBatch};
use schema::selection::Selection;
use schema::Projection;
use std::{collections::BTreeMap, num::NonZeroU64};
#[test]
@ -68,7 +68,7 @@ fn test_extend_range() {
"| | v2 | 1970-01-01T00:00:00.000000004Z |",
"+-----+------+--------------------------------+",
],
&[a.to_arrow(Selection::All).unwrap()]
&[a.to_arrow(Projection::All).unwrap()]
);
assert_batches_eq!(
@ -86,7 +86,7 @@ fn test_extend_range() {
"| | v1 | v2 | 1970-01-01T00:00:00.000000012Z |",
"+-------+------+------+--------------------------------+",
],
&[b.to_arrow(Selection::All).unwrap()]
&[b.to_arrow(Projection::All).unwrap()]
);
a.extend_from_range(&b, 1..4).unwrap();
@ -106,7 +106,7 @@ fn test_extend_range() {
"| | | | v1 | 1970-01-01T00:00:00.000000008Z |",
"+-------+-----+------+------+--------------------------------+",
],
&[a.to_arrow(Selection::All).unwrap()]
&[a.to_arrow(Projection::All).unwrap()]
);
let stats: BTreeMap<_, _> = a.columns().map(|(k, v)| (k.as_str(), v.stats())).collect();

View File

@ -1,7 +1,7 @@
use arrow_util::assert_batches_eq;
use data_types::{StatValues, Statistics};
use mutable_batch::{writer::Writer, MutableBatch, TimestampSummary};
use schema::selection::Selection;
use schema::Projection;
use std::num::NonZeroU64;
fn get_stats(batch: &MutableBatch) -> Vec<(&str, Statistics)> {
@ -158,7 +158,7 @@ fn test_basic() {
),
];
assert_batches_eq!(expected_data, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]);
assert_eq!(stats, expected_stats);
let mut writer = Writer::new(&mut batch, 4);
@ -175,7 +175,7 @@ fn test_basic() {
let stats: Vec<_> = get_stats(&batch);
// Writer dropped, should not impact stats or data
assert_batches_eq!(expected_data, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]);
assert_eq!(stats, expected_stats);
let err = Writer::new(&mut batch, 1)
@ -208,7 +208,7 @@ fn test_basic() {
let stats: Vec<_> = get_stats(&batch);
// Writer not committed, should not impact stats or data
assert_batches_eq!(expected_data, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]);
assert_eq!(stats, expected_stats);
let mut writer = Writer::new(&mut batch, 17);
@ -330,7 +330,7 @@ fn test_basic() {
),
];
assert_batches_eq!(expected_data, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]);
assert_eq!(stats, expected_stats);
let mut expected_timestamps = TimestampSummary::default();

View File

@ -1,7 +1,7 @@
use arrow_util::assert_batches_eq;
use mutable_batch::writer::Writer;
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use schema::Projection;
#[test]
fn test_new_column() {
@ -23,7 +23,7 @@ fn test_new_column() {
"+-------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
let mut writer = Writer::new(&mut batch, 1);
writer
@ -33,5 +33,5 @@ fn test_new_column() {
std::mem::drop(writer);
// Should not include tag1 column
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}

View File

@ -18,7 +18,7 @@ use data_types::{IsNan, PartitionTemplate, StatValues, Statistics, TemplatePart}
use hashbrown::HashSet;
use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload};
use rand::prelude::*;
use schema::selection::Selection;
use schema::Projection;
use std::{collections::BTreeMap, num::NonZeroU64, ops::Range, sync::Arc};
fn make_rng() -> StdRng {
@ -391,7 +391,7 @@ fn test_writer_fuzz() {
expected.concat(&ret.filter(&ranges));
}
let actual = batch.to_arrow(Selection::All).unwrap();
let actual = batch.to_arrow(Projection::All).unwrap();
assert_eq!(
arrow_util::display::pretty_format_batches(&[actual]).unwrap(),

View File

@ -327,7 +327,7 @@ mod tests {
use super::*;
use arrow_util::assert_batches_eq;
use assert_matches::assert_matches;
use schema::selection::Selection;
use schema::Projection;
#[test]
fn test_basic() {
@ -353,7 +353,7 @@ mod tests {
"| 2 | v1 | v2 | 1970-01-01T00:00:00.000000005Z | |",
"+------+------+------+--------------------------------+-----+",
],
&[batches["cpu"].to_arrow(Selection::All).unwrap()]
&[batches["cpu"].to_arrow(Projection::All).unwrap()]
);
assert_batches_eq!(
@ -365,7 +365,7 @@ mod tests {
"| 2 | v5 | 1970-01-01T00:00:00.000000001Z |",
"+------+------+--------------------------------+",
],
&[batches["mem"].to_arrow(Selection::All).unwrap()]
&[batches["mem"].to_arrow(Projection::All).unwrap()]
);
}
@ -378,7 +378,7 @@ m f1=10i 1639612800000000000
let batches = lines_to_batches(lp, 5).unwrap();
assert_eq!(batches.len(), 1);
let batch = batches["m"].to_arrow(Selection::All).unwrap();
let batch = batches["m"].to_arrow(Projection::All).unwrap();
assert_batches_eq!(
&[
"+-----+----+----------------------+",
@ -412,7 +412,7 @@ m b=t 1639612800000000000
let batches = lines_to_batches(lp, 5).unwrap();
assert_eq!(batches.len(), 1);
let batch = batches["m"].to_arrow(Selection::All).unwrap();
let batch = batches["m"].to_arrow(Projection::All).unwrap();
assert_batches_eq!(
&[
"+------+---+----------------------+---+",
@ -464,7 +464,7 @@ m b=t 1639612800000000000
"| 1970-01-01T00:00:00Z | 2 |",
"+----------------------+-----+",
],
&[batches["m1"].to_arrow(Selection::All).unwrap()]
&[batches["m1"].to_arrow(Projection::All).unwrap()]
);
}
@ -484,7 +484,7 @@ m b=t 1639612800000000000
"| 1970-01-01T00:00:00Z | 2 |",
"+----------------------+-----+",
],
&[batches["m1"].to_arrow(Selection::All).unwrap()]
&[batches["m1"].to_arrow(Projection::All).unwrap()]
);
}

View File

@ -60,16 +60,21 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Decodes a [`DatabaseBatch`] to a map of [`MutableBatch`] keyed by table
/// name, plus a map of table ID to table name
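///
/// Illustrative call (variable names are placeholders):
///
/// ```ignore
/// let (tables, id_to_name) = decode_database_batch(&database_batch)?;
/// ```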
pub fn decode_database_batch(
database_batch: &DatabaseBatch,
) -> Result<HashMap<String, MutableBatch>> {
let mut ret = HashMap::with_capacity(database_batch.table_batches.len());
) -> Result<(HashMap<String, MutableBatch>, HashMap<i64, String>)> {
let mut name_to_data = HashMap::with_capacity(database_batch.table_batches.len());
let mut id_to_name = HashMap::with_capacity(database_batch.table_batches.len());
for table_batch in &database_batch.table_batches {
let (_, batch) = ret
let (_, batch) = name_to_data
.raw_entry_mut()
.from_key(table_batch.table_name.as_str())
.or_insert_with(|| (table_batch.table_name.clone(), MutableBatch::new()));
id_to_name.insert(table_batch.table_id, table_batch.table_name.clone());
write_table_batch(batch, table_batch)?;
}
Ok(ret)
Ok((name_to_data, id_to_name))
}
/// Writes the provided [`TableBatch`] to a [`MutableBatch`] on error any changes made
@ -432,7 +437,7 @@ fn pb_value_type(column: &str, values: &PbValues) -> Result<InfluxFieldType> {
mod tests {
use arrow_util::assert_batches_eq;
use generated_types::influxdata::pbdata::v1::InternedStrings;
use schema::selection::Selection;
use schema::Projection;
use super::*;
@ -620,6 +625,7 @@ mod tests {
),
],
row_count: 5,
table_id: 42,
};
let mut batch = MutableBatch::new();
@ -638,7 +644,7 @@ mod tests {
"+-----+-----+------+------+--------------------------------+-----+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
table_batch.columns.push(table_batch.columns[0].clone());
@ -658,7 +664,7 @@ mod tests {
.to_string();
assert_eq!(err, "table batch must contain time column");
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// Nulls in time column -> error
time.null_mask = vec![1];
@ -669,7 +675,7 @@ mod tests {
.to_string();
assert_eq!(err, "time column must not contain nulls");
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// Missing values -> error
table_batch.columns[0].values.take().unwrap();
@ -679,7 +685,7 @@ mod tests {
.to_string();
assert_eq!(err, "column with no values: tag1");
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// No data -> error
table_batch.columns[0].values = Some(PbValues {
@ -698,7 +704,7 @@ mod tests {
.to_string();
assert_eq!(err, "column with no values: tag1");
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}
#[test]
@ -759,6 +765,7 @@ mod tests {
),
],
row_count: 6,
table_id: 42,
};
let mut batch = MutableBatch::new();
@ -777,7 +784,7 @@ mod tests {
"+----------+----+--------+-------+------+--------------------------------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// Try to write 6 rows expecting an error
let mut try_write = |other: PbColumn, expected_err: &str| {
@ -792,6 +799,7 @@ mod tests {
other,
],
row_count: 6,
table_id: 42,
};
let err = write_table_batch(&mut batch, &table_batch)
@ -799,7 +807,7 @@ mod tests {
.to_string();
assert_eq!(err, expected_err);
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
};
try_write(
@ -899,6 +907,7 @@ mod tests {
),
],
row_count: 10,
table_id: 42,
};
let mut batch = MutableBatch::new();
@ -922,7 +931,7 @@ mod tests {
"+-----+--------------------------------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}
#[test]
@ -936,6 +945,7 @@ mod tests {
vec![],
)],
row_count: 9,
table_id: 42,
};
let mut batch = MutableBatch::new();
@ -958,7 +968,7 @@ mod tests {
"+--------------------------------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}
#[test]
@ -1038,6 +1048,7 @@ mod tests {
with_i64(column("time", SemanticType::Time), vec![1, 2, 3], vec![]),
],
row_count: 9,
table_id: 42,
};
let mut batch = MutableBatch::new();
@ -1060,13 +1071,14 @@ mod tests {
"+-------+-----+-----+-----+-----+-----+-----+-----+-----+--------------------------------+-----+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// we need at least one value though
let table_batch = TableBatch {
table_name: "table".to_string(),
columns: vec![with_i64(column("time", SemanticType::Time), vec![], vec![])],
row_count: 9,
table_id: 42,
};
let mut batch = MutableBatch::new();

View File

@ -12,19 +12,38 @@ use mutable_batch::MutableBatch;
use schema::InfluxColumnType;
/// Convert a [`DmlWrite`] to a [`DatabaseBatch`]
pub fn encode_write(db_name: &str, write: &DmlWrite) -> DatabaseBatch {
pub fn encode_write(db_name: &str, database_id: i64, write: &DmlWrite) -> DatabaseBatch {
DatabaseBatch {
database_name: db_name.to_string(),
table_batches: write
.tables()
.map(|(table_name, batch)| encode_batch(table_name, batch))
.map(|(table_name, batch)| {
// Temporary code.
//
// Once only IDs are pushed over the network this extra lookup
// can be removed.
//
// Safety: this code path is invoked only in the producer, and
// therefore accessing the table IDs is acceptable. See
// DmlWrite for context.
let table_id = unsafe {
write.table_id(table_name).unwrap_or_else(|| {
panic!(
"no table ID mapping found for {} table {}",
db_name, table_name
)
})
};
encode_batch(table_name, table_id.get(), batch)
})
.collect(),
partition_key: write.partition_key().to_string(),
database_id,
}
}
/// Convert a [`MutableBatch`] to [`TableBatch`]
pub fn encode_batch(table_name: &str, batch: &MutableBatch) -> TableBatch {
pub fn encode_batch(table_name: &str, table_id: i64, batch: &MutableBatch) -> TableBatch {
TableBatch {
table_name: table_name.to_string(),
columns: batch
@ -45,6 +64,7 @@ pub fn encode_batch(table_name: &str, batch: &MutableBatch) -> TableBatch {
})
.collect(),
row_count: batch.rows() as u32,
table_id,
}
}
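Since `encode_write` now needs a table ID per batch, producers have to carry a name-to-ID map alongside the batches. The sketch below shows that pairing step in isolation, with stub types standing in for the real batch and protobuf types; the panic-on-missing-mapping behaviour mirrors the temporary lookup above.
// --- illustrative sketch (stub types, not the mutable_batch_pb API) ---
use std::collections::HashMap;

#[derive(Debug)]
struct EncodedTable {
    table_name: String,
    table_id: i64,
}

// Pair every table batch with its catalog-assigned ID before encoding,
// panicking when a mapping is missing, like the temporary code above.
fn encode_all(batches: &HashMap<String, Vec<u8>>, ids: &HashMap<String, i64>) -> Vec<EncodedTable> {
    batches
        .keys()
        .map(|table_name| EncodedTable {
            table_name: table_name.clone(),
            table_id: *ids
                .get(table_name)
                .unwrap_or_else(|| panic!("no table ID mapping found for table {table_name}")),
        })
        .collect()
}

fn main() {
    let batches = HashMap::from([("cpu".to_string(), vec![1_u8, 2, 3])]);
    let ids = HashMap::from([("cpu".to_string(), 42_i64)]);
    println!("{:?}", encode_all(&batches, &ids));
}
// --- end sketch ---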

View File

@ -2,7 +2,7 @@ use arrow_util::assert_batches_eq;
use data_types::{PartitionTemplate, TemplatePart};
use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload};
use mutable_batch_pb::{decode::write_table_batch, encode::encode_batch};
use schema::selection::Selection;
use schema::Projection;
#[test]
fn test_encode_decode() {
@ -28,14 +28,15 @@ fn test_encode_decode() {
"+-------+------+-------+-----+------+--------------------------------+-----+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
let encoded = encode_batch("foo", &batch);
let encoded = encode_batch("foo", 42, &batch);
assert_eq!(encoded.table_id, 42);
let mut batch = MutableBatch::new();
write_table_batch(&mut batch, &encoded).unwrap();
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}
// This test asserts columns containing no values do not prevent an encoded
@ -139,7 +140,9 @@ fn test_encode_decode_null_columns_issue_4272() {
.write_to_batch(&mut got)
.expect("should write");
let encoded = encode_batch("bananas", &got);
let encoded = encode_batch("bananas", 24, &got);
assert_eq!(encoded.table_id, 24);
let mut batch = MutableBatch::new();
// Without the fix for #4272 this deserialisation call would fail.
write_table_batch(&mut batch, &encoded).unwrap();
@ -151,7 +154,7 @@ fn test_encode_decode_null_columns_issue_4272() {
"| 1 | 1970-01-01T00:00:00.000000160Z |",
"+---+--------------------------------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
// And finally assert the "1970-07-05" round-trip
let mut got = MutableBatch::default();
@ -161,7 +164,9 @@ fn test_encode_decode_null_columns_issue_4272() {
.write_to_batch(&mut got)
.expect("should write");
let encoded = encode_batch("bananas", &got);
let encoded = encode_batch("bananas", 42, &got);
assert_eq!(encoded.table_id, 42);
let mut batch = MutableBatch::new();
// Without the fix for #4272 this deserialisation call would fail.
write_table_batch(&mut batch, &encoded).unwrap();
@ -173,5 +178,5 @@ fn test_encode_decode_null_columns_issue_4272() {
"| 1 | 1970-07-05T06:32:41.568756160Z |",
"+---+--------------------------------+",
];
assert_batches_eq!(expected, &[batch.to_arrow(Selection::All).unwrap()]);
assert_batches_eq!(expected, &[batch.to_arrow(Projection::All).unwrap()]);
}

View File

@ -18,6 +18,7 @@ prost = "0.11"
[dev-dependencies]
bytes = "1.2"
criterion = { version = "0.4", default-features = false, features = ["rayon"]}
data_types = { path = "../data_types", default-features = false }
[[bench]]
name = "write_lp"

View File

@ -1,5 +1,6 @@
use bytes::{Bytes, BytesMut};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use data_types::{NamespaceId, TableId};
use dml::DmlWrite;
use generated_types::influxdata::pbdata::v1::DatabaseBatch;
use mutable_batch::MutableBatch;
@ -12,8 +13,21 @@ fn generate_pbdata_bytes() -> Vec<(String, (usize, Bytes))> {
.into_iter()
.map(|(bench, lp)| {
let batches = lines_to_batches(&lp, 0).unwrap();
let write = DmlWrite::new("test_db", batches, "bananas".into(), Default::default());
let database_batch = mutable_batch_pb::encode::encode_write("db", &write);
let ids = batches
.keys()
.enumerate()
.map(|(i, name)| (name.clone(), TableId::new(i as _)))
.collect();
let write = DmlWrite::new(
"test_db",
NamespaceId::new(42),
batches,
ids,
"bananas".into(),
Default::default(),
);
let database_batch = mutable_batch_pb::encode::encode_write("db", 42, &write);
let mut bytes = BytesMut::new();
database_batch.encode(&mut bytes).unwrap();

View File

@ -1,11 +1,12 @@
//! A metadata summary of a Parquet file in object storage, with the ability to
//! download & execute a scan.
use crate::{storage::ParquetStorage, ParquetFilePath};
use crate::{
storage::{ParquetExecInput, ParquetStorage},
ParquetFilePath,
};
use data_types::{ParquetFile, TimestampMinMax};
use datafusion::{physical_plan::SendableRecordBatchStream, prelude::SessionContext};
use predicate::Predicate;
use schema::{selection::Selection, Schema};
use schema::{Projection, Schema};
use std::{collections::BTreeSet, mem, sync::Arc};
use uuid::Uuid;
@ -60,11 +61,11 @@ impl ParquetChunk {
}
/// Return the column names that belong to the given column selection
pub fn column_names(&self, selection: Selection<'_>) -> Option<BTreeSet<String>> {
pub fn column_names(&self, selection: Projection<'_>) -> Option<BTreeSet<String>> {
let fields = self.schema.inner().fields().iter();
Some(match selection {
Selection::Some(cols) => fields
Projection::Some(cols) => fields
.filter_map(|x| {
if cols.contains(&x.name().as_str()) {
Some(x.name().clone())
@ -73,26 +74,19 @@ impl ParquetChunk {
}
})
.collect(),
Selection::All => fields.map(|x| x.name().clone()).collect(),
Projection::All => fields.map(|x| x.name().clone()).collect(),
})
}
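For readers unfamiliar with the renamed type, the `Projection` selection is simply an all-or-some enum that consumers such as `column_names` match on. Below is a small, self-contained approximation using a local enum, not the `schema` crate's definition.
// --- illustrative sketch (local enum, not the schema crate) ---
use std::collections::BTreeSet;

enum Projection<'a> {
    All,
    Some(&'a [&'a str]),
}

// Resolve a projection against a list of field names, as column_names does above.
fn column_names(fields: &[&str], selection: Projection<'_>) -> BTreeSet<String> {
    match selection {
        Projection::All => fields.iter().copied().map(String::from).collect(),
        Projection::Some(cols) => fields
            .iter()
            .copied()
            .filter(|f| cols.contains(f))
            .map(String::from)
            .collect(),
    }
}

fn main() {
    let fields = ["tag1", "field1", "time"];
    assert_eq!(column_names(&fields, Projection::All).len(), 3);
    assert_eq!(column_names(&fields, Projection::Some(&["time", "absent"])).len(), 1);
}
// --- end sketch ---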
/// Return stream of data read from parquet file
pub fn read_filter(
&self,
predicate: &Predicate,
selection: Selection<'_>,
session_ctx: &SessionContext,
) -> Result<SendableRecordBatchStream, crate::storage::ReadError> {
/// Inputs for [`ParquetExec`].
///
/// See [`ParquetExecInput`] for more information.
///
/// [`ParquetExec`]: datafusion::physical_plan::file_format::ParquetExec
pub fn parquet_exec_input(&self) -> ParquetExecInput {
let path: ParquetFilePath = self.parquet_file.as_ref().into();
self.store.read_filter(
predicate,
selection,
Arc::clone(&self.schema.as_arrow()),
&path,
self.file_size_bytes(),
session_ctx,
)
self.store.parquet_exec_input(&path, self.file_size_bytes())
}
/// The total number of rows in all row groups in this chunk.

View File

@ -6,6 +6,7 @@ use std::{io::Write, sync::Arc};
use arrow::error::ArrowError;
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::config::BATCH_SIZE;
use futures::{pin_mut, TryStreamExt};
use observability_deps::tracing::{debug, trace, warn};
use parquet::{
@ -21,6 +22,11 @@ use crate::metadata::{IoxMetadata, METADATA_KEY};
/// Parquet row group write size
pub const ROW_GROUP_WRITE_SIZE: usize = 1024 * 1024;
/// ensure read and write work well together
/// Skip clippy due to <https://github.com/rust-lang/rust-clippy/issues/8159>.
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(ROW_GROUP_WRITE_SIZE % BATCH_SIZE == 0);
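The `const _: () = assert!(…)` line above is a compile-time guard: if the two constants ever drift out of step, the crate fails to build instead of failing at runtime. A standalone sketch of the same pattern, with made-up constant values, follows.
// --- illustrative sketch (made-up constants, same const-assert pattern) ---
const BATCH_SIZE: usize = 8 * 1024;
const ROW_GROUP_WRITE_SIZE: usize = 1024 * 1024;

// Evaluated at compile time: an incompatible pair of sizes breaks the build.
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(ROW_GROUP_WRITE_SIZE % BATCH_SIZE == 0);

fn main() {
    println!("{} batches per row group", ROW_GROUP_WRITE_SIZE / BATCH_SIZE);
}
// --- end sketch ---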
/// [`RecordBatch`] to Parquet serialisation errors.
///
/// [`RecordBatch`]: arrow::record_batch::RecordBatch

View File

@ -6,26 +6,26 @@ use crate::{
serialize::{self, CodecError},
ParquetFilePath,
};
use arrow::datatypes::{Field, SchemaRef};
use arrow::{
datatypes::{Field, SchemaRef},
record_batch::RecordBatch,
};
use bytes::Bytes;
use datafusion::{
datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl},
error::DataFusionError,
execution::context::TaskContext,
physical_plan::{
execute_stream,
file_format::{FileScanConfig, ParquetExec},
stream::RecordBatchStreamAdapter,
SendableRecordBatchStream, Statistics,
ExecutionPlan, SendableRecordBatchStream, Statistics,
},
prelude::SessionContext,
};
use futures::TryStreamExt;
use datafusion_util::config::iox_session_config;
use object_store::{DynObjectStore, ObjectMeta};
use observability_deps::tracing::*;
use predicate::Predicate;
use schema::selection::{select_schema, Selection};
use schema::Projection;
use std::{
num::TryFromIntError,
sync::Arc,
time::{Duration, Instant},
};
@ -52,38 +52,6 @@ pub enum UploadError {
Upload(#[from] object_store::Error),
}
/// Errors during Parquet file download & scan.
#[derive(Debug, Error)]
#[allow(clippy::large_enum_variant)]
pub enum ReadError {
/// Error writing the bytes fetched from object store to the temporary
/// parquet file on disk.
#[error("i/o error writing downloaded parquet: {0}")]
IO(#[from] std::io::Error),
/// An error fetching Parquet file bytes from object store.
#[error("failed to read data from object store: {0}")]
ObjectStore(#[from] object_store::Error),
/// An error reading the downloaded Parquet file.
#[error("invalid parquet file: {0}")]
Parquet(#[from] parquet::errors::ParquetError),
/// Schema mismatch
#[error("Schema mismatch (expected VS actual parquet file) for file '{path}': {source}")]
SchemaMismatch {
/// Path of the affected parquet file.
path: object_store::path::Path,
/// Source error
source: ProjectionError,
},
/// Malformed integer data for row count
#[error("Malformed row count integer")]
MalformedRowCount(#[from] TryFromIntError),
}
/// ID for an object store hooked up into DataFusion.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub struct StorageId(&'static str);
@ -106,6 +74,69 @@ impl std::fmt::Display for StorageId {
}
}
/// Inputs required to build a [`ParquetExec`] for one or multiple files.
///
/// The files shall be grouped by [`object_store_url`](Self::object_store_url). For each object store, you shall
/// create one [`ParquetExec`] and put each file into its own "file group".
///
/// [`ParquetExec`]: datafusion::physical_plan::file_format::ParquetExec
#[derive(Debug)]
pub struct ParquetExecInput {
/// Store where the file is located.
pub object_store_url: ObjectStoreUrl,
/// Object metadata.
pub object_meta: ObjectMeta,
}
impl ParquetExecInput {
/// Read parquet file into [`RecordBatch`]es.
///
/// This should only be used for testing purposes.
pub async fn read_to_batches(
&self,
schema: SchemaRef,
projection: Projection<'_>,
session_ctx: &SessionContext,
) -> Result<Vec<RecordBatch>, DataFusionError> {
// Compute final (output) schema after selection
let schema = Arc::new(
projection
.project_schema(&schema)
.as_ref()
.clone()
.with_metadata(Default::default()),
);
let base_config = FileScanConfig {
object_store_url: self.object_store_url.clone(),
file_schema: schema,
file_groups: vec![vec![PartitionedFile {
object_meta: self.object_meta.clone(),
partition_values: vec![],
range: None,
extensions: None,
}]],
statistics: Statistics::default(),
projection: None,
limit: None,
table_partition_cols: vec![],
// TODO avoid this `copied_config` when config_options are directly available on context
config_options: session_ctx.copied_config().config_options(),
};
let exec = ParquetExec::new(base_config, None, None);
let exec_schema = exec.schema();
datafusion::physical_plan::collect(Arc::new(exec), session_ctx.task_ctx())
.await
.map(|batches| {
for batch in &batches {
assert_eq!(batch.schema(), exec_schema);
}
batches
})
}
}
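The grouping rule in the `ParquetExecInput` docs (one `ParquetExec` per object store, each file in its own file group) can be sketched without DataFusion types at all. The snippet below uses plain stand-in structs to show the shape of that grouping; it is not the DataFusion `FileScanConfig` API.
// --- illustrative sketch (stub types, not the DataFusion API) ---
use std::collections::HashMap;

#[derive(Debug, Clone)]
struct ExecInput {
    object_store_url: String, // stands in for ObjectStoreUrl
    location: String,         // stands in for ObjectMeta::location
}

// One ParquetExec per object store URL; within it, each file is its own group.
fn group_by_store(inputs: Vec<ExecInput>) -> HashMap<String, Vec<Vec<ExecInput>>> {
    let mut grouped: HashMap<String, Vec<Vec<ExecInput>>> = HashMap::new();
    for input in inputs {
        grouped
            .entry(input.object_store_url.clone())
            .or_default()
            .push(vec![input]);
    }
    grouped
}

fn main() {
    let inputs = vec![
        ExecInput { object_store_url: "iox://iox/".into(), location: "1/a.parquet".into() },
        ExecInput { object_store_url: "iox://iox/".into(), location: "1/b.parquet".into() },
    ];
    for (url, file_groups) in group_by_store(inputs) {
        let files: Vec<_> = file_groups.iter().map(|g| g[0].location.as_str()).collect();
        println!("{url}: one exec, file groups {files:?}");
    }
}
// --- end sketch ---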
/// The [`ParquetStorage`] type encapsulates [`RecordBatch`] persistence to an
/// underlying [`ObjectStore`].
///
@ -147,7 +178,7 @@ impl ParquetStorage {
pub fn test_df_context(&self) -> SessionContext {
// set up "fake" DataFusion session
let object_store = Arc::clone(&self.object_store);
let session_ctx = SessionContext::new();
let session_ctx = SessionContext::with_config(iox_session_config());
let task_ctx = Arc::new(TaskContext::from(&session_ctx));
task_ctx
.runtime_env()
@ -219,72 +250,22 @@ impl ParquetStorage {
Ok((parquet_meta, file_size))
}
/// Pull the Parquet-encoded [`RecordBatch`] at the file path derived from
/// the provided [`ParquetFilePath`].
/// Inputs for [`ParquetExec`].
///
/// The `selection` projection is pushed down to the Parquet deserializer.
/// See [`ParquetExecInput`] for more information.
///
/// This impl fetches the associated Parquet file bytes from object storage,
/// temporarily persisting them to a local temp file to feed to the arrow
/// reader.
///
/// No caching is performed by `read_filter()`, and each call to
/// `read_filter()` will re-download the parquet file unless the underlying
/// object store impl caches the fetched bytes.
///
/// [`RecordBatch`]: arrow::record_batch::RecordBatch
pub fn read_filter(
&self,
predicate: &Predicate,
selection: Selection<'_>,
schema: SchemaRef,
path: &ParquetFilePath,
file_size: usize,
session_ctx: &SessionContext,
) -> Result<SendableRecordBatchStream, ReadError> {
let path = path.object_store_path();
trace!(path=?path, "fetching parquet data for filtered read");
// Compute final (output) schema after selection
let schema = Arc::new(
select_schema(selection, &schema)
.as_ref()
.clone()
.with_metadata(Default::default()),
);
// create ParquetExec node
let object_meta = ObjectMeta {
location: path,
// we don't care about the "last modified" field
last_modified: Default::default(),
size: file_size,
};
let expr = predicate.filter_expr();
let base_config = FileScanConfig {
/// [`ParquetExec`]: datafusion::physical_plan::file_format::ParquetExec
pub fn parquet_exec_input(&self, path: &ParquetFilePath, file_size: usize) -> ParquetExecInput {
ParquetExecInput {
object_store_url: ObjectStoreUrl::parse(format!("iox://{}/", self.id))
.expect("valid object store URL"),
file_schema: Arc::clone(&schema),
file_groups: vec![vec![PartitionedFile {
object_meta,
partition_values: vec![],
range: None,
extensions: None,
}]],
statistics: Statistics::default(),
projection: None,
limit: None,
table_partition_cols: vec![],
// TODO avoid this `copied_config` when config_options are directly available on context
config_options: session_ctx.copied_config().config_options(),
};
let exec = ParquetExec::new(base_config, expr, None);
Ok(Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::once(execute_stream(Arc::new(exec), session_ctx.task_ctx()))
.try_flatten(),
)))
object_meta: ObjectMeta {
location: path.object_store_path(),
// we don't care about the "last modified" field
last_modified: Default::default(),
size: file_size,
},
}
}
}
@ -348,7 +329,7 @@ mod tests {
let batch = RecordBatch::try_from_iter([("a", to_string_array(&["value"]))]).unwrap();
let schema = batch.schema();
assert_roundtrip(batch.clone(), Selection::All, schema, batch).await;
assert_roundtrip(batch.clone(), Projection::All, schema, batch).await;
}
#[tokio::test]
@ -367,7 +348,7 @@ mod tests {
("c", to_string_array(&["foo"])),
])
.unwrap();
assert_roundtrip(batch, Selection::Some(&["d", "c"]), schema, expected_batch).await;
assert_roundtrip(batch, Projection::Some(&["d", "c"]), schema, expected_batch).await;
}
#[tokio::test]
@ -380,7 +361,7 @@ mod tests {
let schema = batch.schema();
let expected_batch = RecordBatch::try_from_iter([("b", to_int_array(&[1]))]).unwrap();
assert_roundtrip(batch, Selection::Some(&["b", "c"]), schema, expected_batch).await;
assert_roundtrip(batch, Projection::Some(&["b", "c"]), schema, expected_batch).await;
}
#[tokio::test]
@ -396,7 +377,7 @@ mod tests {
])
.unwrap();
let schema = schema_batch.schema();
assert_roundtrip(file_batch, Selection::All, schema, schema_batch).await;
assert_roundtrip(file_batch, Projection::All, schema, schema_batch).await;
}
#[tokio::test]
@ -422,7 +403,7 @@ mod tests {
("c", to_string_array(&["foo"])),
])
.unwrap();
assert_roundtrip(batch, Selection::Some(&["d", "c"]), schema, expected_batch).await;
assert_roundtrip(batch, Projection::Some(&["d", "c"]), schema, expected_batch).await;
}
#[tokio::test]
@ -485,7 +466,7 @@ mod tests {
.clone()
.with_metadata(HashMap::from([(String::from("foo"), String::from("bar"))])),
);
download(&store, &meta, Selection::All, schema, file_size)
download(&store, &meta, Projection::All, schema, file_size)
.await
.unwrap();
}
@ -514,7 +495,7 @@ mod tests {
// Serialize & upload the record batches.
let (_iox_md, file_size) = upload(&store, &meta, batch).await;
download(&store, &meta, Selection::All, schema, file_size)
download(&store, &meta, Projection::All, schema, file_size)
.await
.unwrap();
}
@ -529,7 +510,7 @@ mod tests {
let expected_batch =
RecordBatch::try_from_iter([("a", to_string_array(&["value"]))]).unwrap();
let schema = expected_batch.schema();
assert_roundtrip(file_batch, Selection::All, schema, expected_batch).await;
assert_roundtrip(file_batch, Projection::All, schema, expected_batch).await;
}
#[tokio::test]
@ -547,7 +528,7 @@ mod tests {
let schema = schema_batch.schema();
let expected_batch =
RecordBatch::try_from_iter([("a", to_string_array(&["value"]))]).unwrap();
assert_roundtrip(file_batch, Selection::Some(&["a"]), schema, expected_batch).await;
assert_roundtrip(file_batch, Projection::Some(&["a"]), schema, expected_batch).await;
}
fn to_string_array(strs: &[&str]) -> ArrayRef {
@ -592,35 +573,24 @@ mod tests {
async fn download<'a>(
store: &ParquetStorage,
meta: &IoxMetadata,
selection: Selection<'_>,
selection: Projection<'_>,
expected_schema: SchemaRef,
file_size: usize,
) -> Result<RecordBatch, DataFusionError> {
let path: ParquetFilePath = meta.into();
let rx = store
.read_filter(
&Predicate::default(),
selection,
expected_schema,
&path,
file_size,
&store.test_df_context(),
)
.expect("should read record batches from object store");
let schema = rx.schema();
datafusion::physical_plan::common::collect(rx)
store
.parquet_exec_input(&path, file_size)
.read_to_batches(expected_schema, selection, &store.test_df_context())
.await
.map(|mut batches| {
assert_eq!(batches.len(), 1);
let batch = batches.remove(0);
assert_eq!(batch.schema(), schema);
batch
batches.remove(0)
})
}
async fn assert_roundtrip(
upload_batch: RecordBatch,
selection: Selection<'_>,
selection: Projection<'_>,
expected_schema: SchemaRef,
expected_batch: RecordBatch,
) {
@ -651,7 +621,7 @@ mod tests {
let meta = meta();
let (_iox_md, file_size) = upload(&store, &meta, persisted_batch).await;
let err = download(&store, &meta, Selection::All, expected_schema, file_size)
let err = download(&store, &meta, Projection::All, expected_schema, file_size)
.await
.unwrap_err();

View File

@ -180,7 +180,7 @@ fn timestamp_value<'a>(
mod tests {
use super::*;
use mutable_batch_lp::lines_to_batches;
use schema::selection::Selection;
use schema::Projection;
#[test]
fn basic() {
@ -228,7 +228,7 @@ m,tag2=multi_field bool_field=false,str_field="blargh" 610
);
let (table_name, mutable_batch) = mutable_batches.into_iter().next().unwrap();
let selection = Selection::All;
let selection = Projection::All;
let record_batch = mutable_batch.to_arrow(selection).unwrap();
let iox_schema = mutable_batch.schema(selection).unwrap();

View File

@ -20,6 +20,7 @@ pub(crate) fn expr_to_df(expr: DeleteExpr) -> Expr {
}
#[derive(Debug, Snafu)]
#[allow(clippy::large_enum_variant)]
pub enum DataFusionToExprError {
#[snafu(display("unsupported expression: {:?}", expr))]
UnsupportedExpression { expr: Expr },

View File

@ -11,7 +11,7 @@ use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::expr_rewriter::ExprRewritable;
use datafusion::logical_expr::ExprSchemable;
use datafusion::optimizer::expr_simplifier::{ExprSimplifier, SimplifyInfo};
use datafusion::optimizer::simplify_expressions::{ExprSimplifier, SimplifyInfo};
use datafusion::prelude::{lit, Column, Expr};
use observability_deps::tracing::{debug, trace};
use schema::Schema;

View File

@ -32,6 +32,7 @@ rand = "0.8.3"
service_common = { path = "../service_common" }
service_grpc_catalog = { path = "../service_grpc_catalog"}
service_grpc_schema = { path = "../service_grpc_schema" }
service_grpc_object_store = { path = "../service_grpc_object_store" }
schema = { path = "../schema" }
sharder = { path = "../sharder" }
snafu = "0.7"

View File

@ -340,14 +340,13 @@ pub mod tests {
use arrow::{datatypes::DataType, record_batch::RecordBatch};
use arrow_util::assert_batches_eq;
use data_types::{ColumnType, NamespaceSchema};
use futures::StreamExt;
use iox_query::{
exec::{ExecutorType, IOxSessionContext},
QueryChunk, QueryChunkMeta,
};
use iox_tests::util::{TestCatalog, TestNamespace, TestParquetFileBuilder};
use metric::{Attributes, Observation, RawReporter};
use schema::{builder::SchemaBuilder, selection::Selection, sort::SortKeyBuilder};
use schema::{builder::SchemaBuilder, sort::SortKeyBuilder};
use test_helpers::maybe_start_logging;
use tokio::runtime::Handle;
@ -373,7 +372,7 @@ pub mod tests {
assert_sort_key(&chunk);
// back up table summary
let table_summary_1 = chunk.summary().unwrap();
let table_summary_1 = chunk.summary();
// check if chunk can be queried
assert_content(&chunk, &test_data).await;
@ -382,7 +381,7 @@ pub mod tests {
assert_eq!(chunk.chunk_type(), "parquet");
// summary has NOT changed
let table_summary_2 = chunk.summary().unwrap();
let table_summary_2 = chunk.summary();
assert_eq!(table_summary_1, table_summary_2);
// retrieving the chunk again should not require any catalog requests
@ -397,13 +396,9 @@ pub mod tests {
ctx: IOxSessionContext,
) -> Vec<RecordBatch> {
chunk
.read_filter(ctx, &Default::default(), Selection::All)
.unwrap()
.collect::<Vec<_>>()
.data()
.read_to_batches(chunk.schema(), ctx.inner())
.await
.into_iter()
.map(Result::unwrap)
.collect()
}
struct TestData {

View File

@ -1,39 +1,17 @@
use crate::chunk::QuerierChunk;
use data_types::{ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion::error::DataFusionError;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkMeta,
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use observability_deps::tracing::debug;
use predicate::Predicate;
use schema::{selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use schema::{sort::SortKey, Projection, Schema};
use std::{any::Any, sync::Arc};
use trace::span::SpanRecorder;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Parquet File Error in chunk {}: {}", chunk_id, source))]
ParquetFileChunk {
source: Box<parquet_file::storage::ReadError>,
chunk_id: ChunkId,
},
#[snafu(display(
"Could not find column name '{}' in read buffer column_values results for chunk {}",
column_name,
chunk_id,
))]
ColumnNameNotFound {
column_name: String,
chunk_id: ChunkId,
},
}
impl QueryChunkMeta for QuerierChunk {
fn summary(&self) -> Option<Arc<TableSummary>> {
Some(Arc::clone(&self.table_summary))
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(&self.table_summary)
}
fn schema(&self) -> Arc<Schema> {
@ -74,7 +52,7 @@ impl QueryChunk for QuerierChunk {
&self,
mut ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("projection", format!("{}", columns));
ctx.set_metadata("predicate", format!("{}", &predicate));
@ -103,42 +81,8 @@ impl QueryChunk for QuerierChunk {
Ok(None)
}
fn read_filter(
&self,
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, DataFusionError> {
let span_recorder = SpanRecorder::new(
ctx.span()
.map(|span| span.child("QuerierChunk::read_filter")),
);
let delete_predicates: Vec<_> = self
.delete_predicates()
.iter()
.map(|pred| Arc::new(pred.as_ref().clone().into()))
.collect();
ctx.set_metadata("delete_predicates", delete_predicates.len() as i64);
// merge the negated delete predicates into the select predicate
let pred_with_deleted_exprs = predicate.clone().with_delete_predicates(&delete_predicates);
debug!(?pred_with_deleted_exprs, "Merged negated predicate");
ctx.set_metadata("predicate", format!("{}", &pred_with_deleted_exprs));
ctx.set_metadata("projection", format!("{}", selection));
ctx.set_metadata("storage", "parquet");
let chunk_id = self.id();
debug!(?predicate, "parquet read_filter");
// TODO(marco): propagate span all the way down to the object store cache access
let _span_recorder = span_recorder;
self.parquet_chunk
.read_filter(&pred_with_deleted_exprs, selection, ctx.inner())
.map_err(Box::new)
.context(ParquetFileChunkSnafu { chunk_id })
.map_err(|e| DataFusionError::External(Box::new(e)))
fn data(&self) -> QueryChunkData {
QueryChunkData::Parquet(self.parquet_chunk.parquet_exec_input())
}
fn chunk_type(&self) -> &str {
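With `read_filter` gone, a chunk only reports what kind of data it holds (`QueryChunkData::Parquet` here, `QueryChunkData::RecordBatches` for ingester chunks further down) and the query layer decides how to scan it. The enum below is a stripped-down stand-in used to show that shape; it is not the `iox_query` definition.
// --- illustrative sketch (stand-in enum, not iox_query::QueryChunkData) ---
#[derive(Debug)]
enum ChunkData {
    RecordBatches(Vec<String>), // in-memory data, e.g. ingester chunks
    Parquet(String),            // inputs for building a ParquetExec
}

// The consumer, not the chunk, now picks the physical scan to build.
fn plan_scan(data: &ChunkData) -> String {
    match data {
        ChunkData::RecordBatches(batches) => {
            format!("memory scan over {} batches", batches.len())
        }
        ChunkData::Parquet(input) => format!("ParquetExec over {input}"),
    }
}

fn main() {
    println!("{}", plan_scan(&ChunkData::RecordBatches(vec!["b0".into()])));
    println!("{}", plan_scan(&ChunkData::Parquet("iox://iox/1/a.parquet".into())));
}
// --- end sketch ---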

View File

@ -9,10 +9,13 @@ use futures::{
use influxdb_iox_client::{
catalog::generated_types::catalog_service_server::CatalogServiceServer,
schema::generated_types::schema_service_server::SchemaServiceServer,
store::generated_types::object_store_service_server::ObjectStoreServiceServer,
};
use iox_catalog::interface::Catalog;
use object_store::ObjectStore;
use observability_deps::tracing::warn;
use service_grpc_catalog::CatalogService;
use service_grpc_object_store::ObjectStoreService;
use service_grpc_schema::SchemaService;
use std::sync::Arc;
use thiserror::Error;
@ -28,16 +31,15 @@ pub enum Error {}
/// The [`QuerierHandler`] does nothing at this point
#[async_trait]
pub trait QuerierHandler: Send + Sync {
/// Acquire a [`SchemaService`] gRPC service implementation.
///
/// [`SchemaService`]: generated_types::influxdata::iox::schema::v1::schema_service_server::SchemaService.
/// Acquire a [`SchemaServiceServer`] gRPC service implementation.
fn schema_service(&self) -> SchemaServiceServer<SchemaService>;
/// Acquire a [`CatalogService`] gRPC service implementation.
///
/// [`CatalogService`]: generated_types::influxdata::iox::catalog::v1::catalog_service_server::CatalogService.
/// Acquire a [`CatalogServiceServer`] gRPC service implementation.
fn catalog_service(&self) -> CatalogServiceServer<CatalogService>;
/// Acquire an [`ObjectStoreServiceServer`] gRPC service implementation.
fn object_store_service(&self) -> ObjectStoreServiceServer<ObjectStoreService>;
/// Wait until the handler has finished shutting down.
///
/// Use [`shutdown`](Self::shutdown) to trigger a shutdown.
@ -65,6 +67,9 @@ pub struct QuerierHandlerImpl {
/// Database that handles query operation
database: Arc<QuerierDatabase>,
/// The object store
object_store: Arc<dyn ObjectStore>,
/// Future that resolves when the background worker exits
join_handles: Vec<(String, SharedJoinHandle)>,
@ -78,7 +83,11 @@ pub struct QuerierHandlerImpl {
impl QuerierHandlerImpl {
/// Initialize the Querier
pub fn new(catalog: Arc<dyn Catalog>, database: Arc<QuerierDatabase>) -> Self {
pub fn new(
catalog: Arc<dyn Catalog>,
database: Arc<QuerierDatabase>,
object_store: Arc<dyn ObjectStore>,
) -> Self {
let shutdown = CancellationToken::new();
let poison_cabinet = Arc::new(PoisonCabinet::new());
@ -86,6 +95,7 @@ impl QuerierHandlerImpl {
Self {
catalog,
database,
object_store,
join_handles,
shutdown,
poison_cabinet,
@ -103,6 +113,13 @@ impl QuerierHandler for QuerierHandlerImpl {
CatalogServiceServer::new(CatalogService::new(Arc::clone(&self.catalog)))
}
fn object_store_service(&self) -> ObjectStoreServiceServer<ObjectStoreService> {
ObjectStoreServiceServer::new(ObjectStoreService::new(
Arc::clone(&self.catalog),
Arc::clone(&self.object_store),
))
}
async fn join(&self) {
// Need to poll handlers unordered to detect early exits of any worker in the list.
let mut unordered: FuturesUnordered<_> = self
@ -176,14 +193,15 @@ mod tests {
async fn new() -> Self {
let metric_registry = Arc::new(metric::Registry::new());
let catalog = Arc::new(MemCatalog::new(Arc::clone(&metric_registry))) as _;
let object_store = Arc::new(InMemory::new());
let object_store = Arc::new(InMemory::new()) as _;
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let exec = Arc::new(Executor::new(1));
let catalog_cache = Arc::new(CatalogCache::new_testing(
Arc::clone(&catalog),
time_provider,
Arc::clone(&metric_registry),
Arc::clone(&object_store) as _,
Arc::clone(&object_store),
&Handle::current(),
));
// QuerierDatabase::new returns an error if there are no shards in the catalog
@ -211,7 +229,7 @@ mod tests {
.await
.unwrap(),
);
let querier = QuerierHandlerImpl::new(catalog, database);
let querier = QuerierHandlerImpl::new(catalog, database, object_store);
Self { querier }
}

View File

@ -5,7 +5,7 @@ use influxdb_iox_client::flight::{
generated_types as proto,
low_level::{Client as LowLevelFlightClient, LowLevelMessage, PerformQuery},
};
use observability_deps::tracing::debug;
use observability_deps::tracing::{debug, warn};
use snafu::{ResultExt, Snafu};
use std::{collections::HashMap, fmt::Debug, ops::DerefMut, sync::Arc};
use trace::ctx::SpanContext;
@ -100,14 +100,35 @@ impl FlightClient for FlightClientImpl {
LowLevelFlightClient::<proto::IngesterQueryRequest>::new(connection, span_context);
debug!(%ingester_addr, ?request, "Sending request to ingester");
let request: proto::IngesterQueryRequest =
request.try_into().context(CreatingRequestSnafu)?;
let request = serialize_ingester_query_request(request)?;
let perform_query = client.perform_query(request).await.context(FlightSnafu)?;
Ok(Box::new(perform_query))
}
}
/// Tries to serialize the request to the ingester
///
/// Note: if the predicate is too "complicated" to be serialized, simply
/// ask for all the data from the ingester. More details:
/// <https://github.com/apache/arrow-datafusion/issues/3968>
fn serialize_ingester_query_request(
mut request: IngesterQueryRequest,
) -> Result<proto::IngesterQueryRequest, Error> {
match request.clone().try_into() {
Ok(proto) => Ok(proto),
Err(e) if (e.field == "exprs") && (e.description.contains("recursion limit reached")) => {
warn!(
predicate=?request.predicate,
"Cannot serialize predicate due to recursion limit, stripping it",
);
request.predicate = None;
request.try_into().context(CreatingRequestSnafu)
}
Err(e) => Err(Error::CreatingRequest { source: e }),
}
}
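The fallback above is a general "degrade rather than fail" pattern: try the full request, and on the one known-recoverable serialization error retry without the optional predicate. A self-contained sketch of that control flow follows; the types and the error text are illustrative only.
// --- illustrative sketch (made-up types; only the control flow matches) ---
#[derive(Clone, Debug)]
struct Request {
    predicate: Option<String>,
}

#[derive(Debug)]
struct EncodeError(String);

// Pretend encoder: very long predicates hit a "recursion limit".
fn encode(req: &Request) -> Result<String, EncodeError> {
    match &req.predicate {
        Some(p) if p.len() > 20 => Err(EncodeError("recursion limit reached".into())),
        Some(p) => Ok(format!("query WHERE {p}")),
        None => Ok("query (unfiltered)".into()),
    }
}

// On the known-recoverable error, strip the predicate and retry.
fn encode_with_fallback(mut req: Request) -> Result<String, EncodeError> {
    match encode(&req) {
        Err(e) if e.0.contains("recursion limit reached") => {
            eprintln!("cannot serialize predicate, stripping it: {e:?}");
            req.predicate = None;
            encode(&req)
        }
        other => other,
    }
}

fn main() {
    let req = Request { predicate: Some("a < 5 AND a < 5 AND a < 5".into()) };
    println!("{}", encode_with_fallback(req).unwrap());
}
// --- end sketch ---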
/// Data that is returned by an ingester gRPC query.
///
/// This is mostly the same as [`PerformQuery`] but allows some easier mocking.
@ -189,3 +210,48 @@ impl CachedConnection {
}
}
}
#[cfg(test)]
mod tests {
use datafusion::prelude::{col, lit};
use predicate::Predicate;
use super::*;
#[test]
fn serialize_deeply_nested_predicate() {
// see https://github.com/influxdata/influxdb_iox/issues/5974
// we need more stack space so this doesn't overflow in dev builds
std::thread::Builder::new().stack_size(10_000_000).spawn(|| {
// don't know what "too much" is, so let's slowly try to increase complexity
let n_max = 100;
for n in [1, 2, n_max] {
println!("testing: {n}");
let expr_base = col("a").lt(lit(5i32));
let expr = (0..n).fold(expr_base.clone(), |expr, _| expr.and(expr_base.clone()));
let predicate = Predicate {exprs: vec![expr], ..Default::default()};
let request = IngesterQueryRequest {
namespace: String::from("ns"),
table: String::from("table"),
columns: vec![String::from("col1"), String::from("col2")],
predicate: Some(predicate),
};
let proto = serialize_ingester_query_request(request.clone()).expect("serialization");
let request2 = IngesterQueryRequest::try_from(proto).expect("deserialization");
if request2.predicate.is_none() {
assert!(n > 2, "not really deeply nested");
return;
}
}
panic!("did not find a 'too deeply nested' expression, tested up to a depth of {n_max}")
}).expect("spawning thread").join().expect("joining thread");
}
}

View File

@ -12,7 +12,6 @@ use data_types::{
TableSummary, TimestampMinMax,
};
use datafusion::error::DataFusionError;
use datafusion_util::MemoryStream;
use futures::{stream::FuturesUnordered, TryStreamExt};
use generated_types::{
influxdata::iox::ingester::v1::GetWriteInfoResponse,
@ -25,13 +24,13 @@ use influxdb_iox_client::flight::{
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::{compute_timenanosecond_min_max, create_basic_summary},
QueryChunk, QueryChunkMeta,
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use iox_time::{Time, TimeProvider};
use metric::{DurationHistogram, Metric};
use observability_deps::tracing::{debug, trace, warn};
use predicate::Predicate;
use schema::{selection::Selection, sort::SortKey, Schema};
use schema::{sort::SortKey, Projection, Schema};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{
any::Any,
@ -1050,8 +1049,8 @@ impl IngesterChunk {
}
impl QueryChunkMeta for IngesterChunk {
fn summary(&self) -> Option<Arc<TableSummary>> {
Some(Arc::clone(&self.summary))
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(&self.summary)
}
fn schema(&self) -> Arc<Schema> {
@ -1095,7 +1094,7 @@ impl QueryChunk for IngesterChunk {
&self,
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
_columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
@ -1111,30 +1110,8 @@ impl QueryChunk for IngesterChunk {
Ok(None)
}
fn read_filter(
&self,
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<datafusion::physical_plan::SendableRecordBatchStream, DataFusionError> {
trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data");
// Apply selection to in-memory batch
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.batches.clone(),
Some(projection) => self
.batches
.iter()
.map(|batch| batch.project(&projection))
.collect::<std::result::Result<Vec<_>, ArrowError>>()?,
};
trace!(?predicate, ?selection, output_batches=?batches, input_batches=?self.batches, "Reading data");
Ok(Box::pin(MemoryStream::new(batches)))
fn data(&self) -> QueryChunkData {
QueryChunkData::RecordBatches(self.batches.clone())
}
fn chunk_type(&self) -> &str {
@ -1806,7 +1783,7 @@ mod tests {
}
fn lp_to_record_batch(lp: &str) -> RecordBatch {
lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap()
lp_to_mutable_batch(lp).1.to_arrow(Projection::All).unwrap()
}
#[derive(Debug)]

View File

@ -1,12 +1,10 @@
use super::IngesterConnection;
use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
use data_types::ShardIndex;
use futures::StreamExt;
use generated_types::influxdata::iox::ingester::v1::GetWriteInfoResponse;
use iox_query::{exec::IOxSessionContext, util::create_basic_summary, QueryChunk};
use iox_query::util::create_basic_summary;
use parking_lot::Mutex;
use schema::selection::Selection;
use schema::Projection;
use schema::Schema as IOxSchema;
use std::{any::Any, sync::Arc};
use trace::span::Span;
@ -38,17 +36,17 @@ impl IngesterConnection for MockIngesterConnection {
_namespace_name: Arc<str>,
_table_name: Arc<str>,
columns: Vec<String>,
predicate: &predicate::Predicate,
_predicate: &predicate::Predicate,
_expected_schema: Arc<schema::Schema>,
_span: Option<Span>,
) -> super::Result<Vec<super::IngesterPartition>> {
// see if we want to do projection pushdown
let mut prune_columns = true;
let cols: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
let selection = Selection::Some(&cols);
let selection = Projection::Some(&cols);
match selection {
Selection::All => prune_columns = false,
Selection::Some(val) => {
Projection::All => prune_columns = false,
Projection::Some(val) => {
if val.is_empty() {
prune_columns = false;
}
@ -77,14 +75,14 @@ impl IngesterConnection for MockIngesterConnection {
.chunks
.into_iter()
.map(|ic| async move {
let mut batches: Vec<RecordBatch> = vec![];
let mut stream = ic
.read_filter(IOxSessionContext::with_testing(), predicate, selection)
.expect("Error in read_filter");
while let Some(b) = stream.next().await {
let b = b.expect("Error in stream");
batches.push(b)
}
let batches: Vec<_> = ic
.batches
.iter()
.map(|batch| match ic.schema.df_projection(selection).unwrap() {
Some(projection) => batch.project(&projection).unwrap(),
None => batch.clone(),
})
.collect();
assert!(!batches.is_empty(), "Error: empty batches");
let new_schema = IOxSchema::try_from(batches[0].schema()).unwrap();

View File

@ -13,9 +13,10 @@ use datafusion::{
datasource::TableProvider,
error::DataFusionError,
};
use datafusion_util::config::DEFAULT_SCHEMA;
use iox_query::{
exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext},
QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA,
QueryChunk, QueryCompletedToken, QueryDatabase, QueryText,
};
use observability_deps::tracing::{debug, trace};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};

View File

@ -312,7 +312,6 @@ impl QuerierTable {
},
))
})
.map(Some)
.collect();
// Prune on the most basic summary data (timestamps and column names) before trying to fully load the chunks
@ -521,7 +520,7 @@ mod tests {
use iox_query::exec::IOxSessionContext;
use iox_tests::util::{TestCatalog, TestParquetFileBuilder, TestTable};
use predicate::Predicate;
use schema::{builder::SchemaBuilder, selection::Selection, InfluxFieldType};
use schema::{builder::SchemaBuilder, InfluxFieldType};
use std::sync::Arc;
use test_helpers::maybe_start_logging;
use trace::{span::SpanStatus, RingBufferTraceCollector};
@ -712,8 +711,8 @@ mod tests {
.await
.unwrap();
assert_eq!(chunks.len(), 1);
let chunk = &chunks[0];
assert_eq!(chunk.chunk_type(), "IngesterPartition");
// verify chunk schema
let schema = chunk.schema();
@ -740,17 +739,9 @@ mod tests {
// verify chunk data
let batches = chunk
.read_filter(
IOxSessionContext::with_testing(),
&Default::default(),
Selection::All,
)
.unwrap()
.collect::<Vec<_>>()
.await
.into_iter()
.map(Result::unwrap)
.collect::<Vec<_>>();
.data()
.read_to_batches(chunk.schema(), IOxSessionContext::with_testing().inner())
.await;
let expected = vec![
"+-----+------+------+--------------------------------+",
"| foo | tag1 | tag2 | time |",

View File

@ -8,7 +8,7 @@ use data_types::{ChunkId, SequenceNumber, ShardIndex};
use iox_catalog::interface::get_schema_by_name;
use iox_tests::util::{TestCatalog, TestPartition, TestShard, TestTable};
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::{selection::Selection, sort::SortKey, Schema};
use schema::{sort::SortKey, Projection, Schema};
use sharder::JumpHash;
use std::sync::Arc;
use tokio::runtime::Handle;
@ -49,7 +49,7 @@ pub async fn querier_table(catalog: &Arc<TestCatalog>, table: &Arc<TestTable>) -
/// Convert the line protocol in `lp `to a RecordBatch
pub(crate) fn lp_to_record_batch(lp: &str) -> RecordBatch {
lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap()
lp_to_mutable_batch(lp).1.to_arrow(Projection::All).unwrap()
}
/// Helper for creating IngesterPartitions

View File

@ -25,7 +25,7 @@ iox_tests = { path = "../iox_tests" }
itertools = "0.10"
mutable_batch = { path = "../mutable_batch" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
once_cell = { version = "1.15.0", features = ["parking_lot"] }
once_cell = { version = "1.16.0", features = ["parking_lot"] }
parquet_file = { version = "0.1.0", path = "../parquet_file" }
predicate = { path = "../predicate" }
querier = { path = "../querier" }

View File

@ -15,6 +15,8 @@ mod runner;
#[cfg(test)]
pub mod sql;
#[cfg(test)]
pub mod sql_metrics;
#[cfg(test)]
pub mod table_schema;
pub mod db;

View File

@ -30,7 +30,7 @@ use querier::{
IngesterConnectionImpl, IngesterFlightClient, IngesterFlightClientError,
IngesterFlightClientQueryData, QuerierCatalogCache, QuerierNamespace,
};
use schema::selection::Selection;
use schema::Projection;
use sharder::JumpHash;
use std::{
cmp::Ordering,
@ -806,11 +806,17 @@ impl MockIngester {
.await;
partition_ids.push(partition.partition.id);
}
let ids = tables
.iter()
.map(|v| (v.table.name.clone(), v.table.id))
.collect();
for table in tables {
let schema = mutable_batches
.get(&table.table.name)
.unwrap()
.schema(Selection::All)
.schema(Projection::All)
.unwrap();
for (t, field) in schema.iter() {
@ -829,7 +835,9 @@ impl MockIngester {
);
let op = DmlOperation::Write(DmlWrite::new(
self.ns.namespace.name.clone(),
self.ns.namespace.id,
mutable_batches,
ids,
PartitionKey::from(partition_key),
meta,
));

View File

@ -0,0 +1,99 @@
use std::sync::Arc;
use crate::scenarios::{DbScenario, DbSetup, OneMeasurementFourChunksWithDuplicatesParquetOnly};
use arrow::record_batch::RecordBatch;
use arrow_util::assert_batches_sorted_eq;
use datafusion::physical_plan::{
display::DisplayableExecutionPlan,
metrics::{MetricValue, MetricsSet},
};
use iox_query::{frontend::sql::SqlQueryPlanner, provider::parquet_metrics};
#[tokio::test]
async fn sql_predicate_pushdown() {
test_helpers::maybe_start_logging();
// parquet pushdown is only relevant for parquet
let db_setup = OneMeasurementFourChunksWithDuplicatesParquetOnly {};
// This predicate should result in rows being pruned, and we verify this with metrics
let sql = "SELECT * from h2o where state = 'MA'".to_string();
let expected = vec![
"+------+---------+----------+----------+-------+--------------------------------+",
"| area | city | max_temp | min_temp | state | time |",
"+------+---------+----------+----------+-------+--------------------------------+",
"| | Andover | 69.2 | | MA | 1970-01-01T00:00:00.000000250Z |",
"| | Boston | | 67.4 | MA | 1970-01-01T00:00:00.000000600Z |",
"| | Boston | | 70.4 | MA | 1970-01-01T00:00:00.000000050Z |",
"| | Boston | 75.4 | 65.4 | MA | 1970-01-01T00:00:00.000000250Z |",
"| | Boston | 82.67 | 65.4 | MA | 1970-01-01T00:00:00.000000400Z |",
"| | Reading | | 53.4 | MA | 1970-01-01T00:00:00.000000250Z |",
"| | Reading | | 60.4 | MA | 1970-01-01T00:00:00.000000600Z |",
"| 742 | Bedford | 78.75 | 71.59 | MA | 1970-01-01T00:00:00.000000150Z |",
"| 742 | Bedford | 88.75 | | MA | 1970-01-01T00:00:00.000000600Z |",
"| 750 | Bedford | 80.75 | 65.22 | MA | 1970-01-01T00:00:00.000000400Z |",
"+------+---------+----------+----------+-------+--------------------------------+",
];
for scenario in db_setup.make().await {
let DbScenario {
scenario_name, db, ..
} = scenario;
println!("Running scenario '{}'", scenario_name);
println!("SQL: '{:#?}'", sql);
let planner = SqlQueryPlanner::default();
let ctx = db.new_query_context(None);
let physical_plan = planner
.query(&sql, &ctx)
.await
.expect("built plan successfully");
let results: Vec<RecordBatch> = ctx
.collect(Arc::clone(&physical_plan))
.await
.expect("Running plan");
assert_batches_sorted_eq!(expected, &results);
println!(
"Physical plan:\n\n{}",
DisplayableExecutionPlan::new(physical_plan.as_ref()).indent()
);
// verify that pushdown was enabled and that it filtered rows
let metrics = parquet_metrics(physical_plan);
assert_eq!(
metric_value_sum(&metrics, "pushdown_rows_filtered"),
8,
"Unexpected number of rows filtered in:\n\n{:#?}",
metrics
);
}
}
/// Returns the sum of all the metrics with the specified name across
/// the provided metric sets.
///
/// For `Count` metrics, the count value is returned.
///
/// Panics if no such metric is found.
fn metric_value_sum(metrics: &[MetricsSet], metric_name: &str) -> usize {
metrics.iter().map(|m| metric_value(m, metric_name)).sum()
}
fn metric_value(metrics: &MetricsSet, metric_name: &str) -> usize {
let sum = metrics
.sum(|m| matches!(m.value(), MetricValue::Count { name, .. } if name == metric_name));
match sum {
Some(MetricValue::Count { count, .. }) => count.value(),
_ => {
panic!(
"Expected metric not found. Looking for '{}' in\n\n{:#?}",
metric_name, metrics
);
}
}
}
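A tiny self-contained version of these metric-summing helpers is shown below, using a plain map in place of DataFusion's `MetricsSet`, just to make the sum-then-panic-if-absent behaviour concrete.
// --- illustrative sketch (plain maps, not DataFusion's MetricsSet) ---
use std::collections::HashMap;

type MetricsSet = HashMap<String, usize>;

// Sum one named counter across all per-operator metric sets.
fn metric_value_sum(metrics: &[MetricsSet], metric_name: &str) -> usize {
    metrics.iter().map(|m| metric_value(m, metric_name)).sum()
}

fn metric_value(metrics: &MetricsSet, metric_name: &str) -> usize {
    *metrics
        .get(metric_name)
        .unwrap_or_else(|| panic!("Expected metric not found: '{metric_name}' in {metrics:?}"))
}

fn main() {
    let per_operator = vec![
        MetricsSet::from([("pushdown_rows_filtered".to_string(), 3)]),
        MetricsSet::from([("pushdown_rows_filtered".to_string(), 5)]),
    ];
    assert_eq!(metric_value_sum(&per_operator, "pushdown_rows_filtered"), 8);
}
// --- end sketch ---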

View File

@ -2,7 +2,7 @@
use arrow::datatypes::DataType;
use iox_query::QueryChunk;
use schema::selection::Selection;
use schema::Projection;
use schema::{builder::SchemaBuilder, sort::SortKey, Schema, TIME_COLUMN_NAME};
use super::scenarios::*;
@ -14,7 +14,7 @@ use super::scenarios::*;
/// output
async fn run_table_schema_test_case<D>(
db_setup: D,
selection: Selection<'_>,
selection: Projection<'_>,
table_name: &str,
expected_schema: Schema,
expected_sort_key: Option<&SortKey>,
@ -91,7 +91,7 @@ async fn list_schema_cpu_all() {
run_table_schema_test_case(
TwoMeasurements {},
Selection::All,
Projection::All,
"cpu",
expected_schema,
Some(&sort_key),
@ -114,7 +114,7 @@ async fn list_schema_cpu_all_set_sort_key() {
run_table_schema_test_case(
TwoMeasurements {},
Selection::All,
Projection::All,
"cpu",
expected_schema,
Some(&sort_key),
@ -137,7 +137,7 @@ async fn list_schema_disk_all() {
run_table_schema_test_case(
TwoMeasurements {},
Selection::All,
Projection::All,
"disk",
expected_schema,
None,
@ -155,7 +155,7 @@ async fn list_schema_cpu_selection() {
.unwrap();
// Pick an order that is not lexicographic
let selection = Selection::Some(&["user", "region"]);
let selection = Projection::Some(&["user", "region"]);
run_table_schema_test_case(TwoMeasurements {}, selection, "cpu", expected_schema, None).await;
}
@ -171,7 +171,7 @@ async fn list_schema_disk_selection() {
.unwrap();
// Pick an order that is not lexicographic
let selection = Selection::Some(&["time", "bytes"]);
let selection = Projection::Some(&["time", "bytes"]);
run_table_schema_test_case(TwoMeasurements {}, selection, "disk", expected_schema, None).await;
}
@ -189,7 +189,7 @@ async fn list_schema_location_all() {
run_table_schema_test_case(
TwoMeasurementsUnsignedType {},
Selection::All,
Projection::All,
"restaurant",
expected_schema,
None,

View File

@ -13,7 +13,7 @@ use router::{
dml_handlers::{DmlHandler, SchemaValidator},
namespace_cache::{MemoryNamespaceCache, ShardedCache},
};
use schema::selection::Selection;
use schema::Projection;
use tokio::runtime::Runtime;
static NAMESPACE: Lazy<DatabaseName<'static>> = Lazy::new(|| "bananas".try_into().unwrap());
@ -55,7 +55,7 @@ fn bench(group: &mut BenchmarkGroup<WallTime>, tables: usize, columns_per_table:
let write = lp_to_writes(&generate_lp(tables, columns_per_table));
let column_count = write
.values()
.fold(0, |acc, b| acc + b.schema(Selection::All).unwrap().len());
.fold(0, |acc, b| acc + b.schema(Projection::All).unwrap().len());
group.throughput(Throughput::Elements(column_count as _));
group.bench_function(format!("{tables}x{columns_per_table}"), |b| {

View File

@ -79,17 +79,24 @@ where
async fn delete(
&self,
namespace: &DatabaseName<'static>,
namespace_id: NamespaceId,
table_name: &str,
predicate: &DeletePredicate,
span_ctx: Option<SpanContext>,
) -> Result<(), Self::DeleteError> {
self.first
.delete(namespace, table_name, predicate, span_ctx.clone())
.delete(
namespace,
namespace_id,
table_name,
predicate,
span_ctx.clone(),
)
.await
.map_err(Into::into)?;
self.second
.delete(namespace, table_name, predicate, span_ctx)
.delete(namespace, namespace_id, table_name, predicate, span_ctx)
.await
.map_err(Into::into)
}
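The namespace ID is now threaded through every layer of the delete path. As a shape reference only, the sketch below shows a two-stage chained handler forwarding the extra argument; the trait here is hypothetical and synchronous, unlike the async `DmlHandler`.
// --- illustrative sketch (hypothetical synchronous trait, not DmlHandler) ---
type NamespaceId = i64;

trait DeleteHandler {
    fn delete(&self, namespace: &str, namespace_id: NamespaceId, table: &str) -> Result<(), String>;
}

// A two-stage chain: both halves must observe the same namespace ID.
struct Chain<A, B> {
    first: A,
    second: B,
}

impl<A: DeleteHandler, B: DeleteHandler> DeleteHandler for Chain<A, B> {
    fn delete(&self, namespace: &str, namespace_id: NamespaceId, table: &str) -> Result<(), String> {
        self.first.delete(namespace, namespace_id, table)?;
        self.second.delete(namespace, namespace_id, table)
    }
}

struct Logger;

impl DeleteHandler for Logger {
    fn delete(&self, namespace: &str, namespace_id: NamespaceId, table: &str) -> Result<(), String> {
        println!("delete {namespace} ({namespace_id}) {table}");
        Ok(())
    }
}

fn main() {
    let chain = Chain { first: Logger, second: Logger };
    chain.delete("bananas", 42, "a table").unwrap();
}
// --- end sketch ---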

View File

@ -75,12 +75,13 @@ where
async fn delete(
&self,
namespace: &DatabaseName<'static>,
namespace_id: NamespaceId,
table_name: &str,
predicate: &DeletePredicate,
span_ctx: Option<SpanContext>,
) -> Result<(), Self::DeleteError> {
self.inner
.delete(namespace, table_name, predicate, span_ctx)
.delete(namespace, namespace_id, table_name, predicate, span_ctx)
.await
}
}

View File

@ -105,6 +105,7 @@ where
async fn delete(
&self,
namespace: &DatabaseName<'static>,
namespace_id: NamespaceId,
table_name: &str,
predicate: &DeletePredicate,
span_ctx: Option<SpanContext>,
@ -116,7 +117,7 @@ where
let res = self
.inner
.delete(namespace, table_name, predicate, span_ctx)
.delete(namespace, namespace_id, table_name, predicate, span_ctx)
.await;
// Avoid exploding if time goes backwards - simply drop the measurement
@ -256,7 +257,7 @@ mod tests {
};
decorator
.delete(&ns, "a table", &pred, Some(span))
.delete(&ns, NamespaceId::new(42), "a table", &pred, Some(span))
.await
.expect("inner handler configured to succeed");
@ -284,7 +285,7 @@ mod tests {
};
decorator
.delete(&ns, "a table", &pred, Some(span))
.delete(&ns, NamespaceId::new(42), "a table", &pred, Some(span))
.await
.expect_err("inner handler configured to fail");

View File

@ -19,6 +19,7 @@ pub enum MockDmlHandlerCall<W> {
},
Delete {
namespace: String,
namespace_id: NamespaceId,
table: String,
predicate: DeletePredicate,
},
@ -121,6 +122,7 @@ where
async fn delete(
&self,
namespace: &DatabaseName<'static>,
namespace_id: NamespaceId,
table_name: &str,
predicate: &DeletePredicate,
_span_ctx: Option<SpanContext>,
@ -129,6 +131,7 @@ where
self,
MockDmlHandlerCall::Delete {
namespace: namespace.into(),
namespace_id,
table: table_name.to_owned(),
predicate: predicate.clone(),
},

View File

@ -43,11 +43,12 @@ where
async fn delete(
&self,
namespace: &DatabaseName<'static>,
namespace_id: NamespaceId,
table_name: &str,
predicate: &DeletePredicate,
_span_ctx: Option<SpanContext>,
) -> Result<(), Self::DeleteError> {
info!(%namespace, %table_name, ?predicate, "dropping delete operation");
info!(%namespace, %namespace_id, %table_name, ?predicate, "dropping delete operation");
Ok(())
}
}

View File

@ -1,5 +1,7 @@
use async_trait::async_trait;
use data_types::{DatabaseName, DeletePredicate, NamespaceId, PartitionKey, PartitionTemplate};
use data_types::{
DatabaseName, DeletePredicate, NamespaceId, PartitionKey, PartitionTemplate, TableId,
};
use hashbrown::HashMap;
use mutable_batch::{MutableBatch, PartitionWrite, WritePayload};
use observability_deps::tracing::*;
@ -64,7 +66,7 @@ impl DmlHandler for Partitioner {
type WriteError = PartitionError;
type DeleteError = PartitionError;
type WriteInput = HashMap<String, MutableBatch>;
type WriteInput = HashMap<TableId, (String, MutableBatch)>;
type WriteOutput = Vec<Partitioned<Self::WriteInput>>;
/// Partition the per-table [`MutableBatch`].
@ -76,9 +78,10 @@ impl DmlHandler for Partitioner {
_span_ctx: Option<SpanContext>,
) -> Result<Self::WriteOutput, Self::WriteError> {
// A collection of partition-keyed, per-table MutableBatch instances.
let mut partitions: HashMap<PartitionKey, HashMap<_, MutableBatch>> = HashMap::default();
let mut partitions: HashMap<PartitionKey, HashMap<_, (String, MutableBatch)>> =
HashMap::default();
for (table_name, batch) in batch {
for (table_id, (table_name, batch)) in batch {
// Partition the table batch according to the configured partition
// template and write it into the partition-keyed map.
for (partition_key, partition_payload) in
@ -87,10 +90,12 @@ impl DmlHandler for Partitioner {
let partition = partitions.entry(partition_key).or_default();
let table_batch = partition
.raw_entry_mut()
.from_key(&table_name)
.or_insert_with(|| (table_name.to_owned(), MutableBatch::default()));
.from_key(&table_id)
.or_insert_with(|| {
(table_id, (table_name.to_owned(), MutableBatch::default()))
});
partition_payload.write_to_batch(table_batch.1)?;
partition_payload.write_to_batch(&mut table_batch.1 .1)?;
}
}
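The partitioner now keys its nested map by `TableId` and carries the table name alongside each batch. The sketch below reproduces just that data-shaping step, with plain strings standing in for `MutableBatch`; `TableId` is a bare `i64` alias here rather than the `data_types` newtype.
// --- illustrative sketch (stub payloads, not the router's Partitioner) ---
use std::collections::HashMap;

type TableId = i64;

// partition key -> table ID -> (table name, accumulated rows)
fn partition(
    input: HashMap<TableId, (String, Vec<&'static str>)>,
    partition_key_of: impl Fn(&str) -> String,
) -> HashMap<String, HashMap<TableId, (String, Vec<&'static str>)>> {
    let mut partitions: HashMap<String, HashMap<TableId, (String, Vec<&'static str>)>> =
        HashMap::new();
    for (table_id, (table_name, rows)) in input {
        for row in rows {
            // fan each row out into its partition's per-table accumulator
            let entry = partitions
                .entry(partition_key_of(row))
                .or_default()
                .entry(table_id)
                .or_insert_with(|| (table_name.clone(), Vec::new()));
            entry.1.push(row);
        }
    }
    partitions
}

fn main() {
    let input = HashMap::from([(1_i64, ("cpu".to_string(), vec!["2022-01-01 a", "2022-01-02 b"]))]);
    let out = partition(input, |row: &str| row.split(' ').next().unwrap().to_string());
    assert_eq!(out.len(), 2); // two partition keys -> two per-table maps
}
// --- end sketch ---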
@ -104,6 +109,7 @@ impl DmlHandler for Partitioner {
async fn delete(
&self,
_namespace: &DatabaseName<'static>,
_namespace_id: NamespaceId,
_table_name: &str,
_predicate: &DeletePredicate,
_span_ctx: Option<SpanContext>,
@ -119,9 +125,17 @@ mod tests {
use super::*;
/// The default timestamp applied to test LP if the write does not specify
/// one.
const DEFAULT_TIMESTAMP_NANOS: i64 = 42000000000000000;
// Parse `lp` into a table-keyed MutableBatch map.
fn lp_to_writes(lp: &str) -> HashMap<TableId, (String, MutableBatch)> {
let (writes, _) = mutable_batch_lp::lines_to_batches_stats(lp, 42)
.expect("failed to build test writes from LP");
writes
.into_iter()
.enumerate()
.map(|(i, (name, data))| (TableId::new(i as _), (name, data)))
.collect()
}
// Generate a test case that partitions "lp".
//
@ -144,7 +158,7 @@ mod tests {
let partitioner = Partitioner::new(partition_template);
let ns = DatabaseName::new("bananas").expect("valid db name");
let (writes, _) = mutable_batch_lp::lines_to_batches_stats($lp, DEFAULT_TIMESTAMP_NANOS).expect("failed to parse test LP");
let writes = lp_to_writes($lp);
let handler_ret = partitioner.write(&ns, NamespaceId::new(42), writes, None).await;
assert_matches!(handler_ret, $($want_handler_ret)+);
@ -156,8 +170,7 @@ mod tests {
// Extract the table names in this partition
let mut tables = partition
.payload
.keys()
.cloned()
.values().map(|v| v.0.clone())
.collect::<Vec<String>>();
tables.sort();

Some files were not shown because too many files have changed in this diff.