diff --git a/Cargo.lock b/Cargo.lock index 80a1918e25..5e5870e339 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1318,13 +1318,13 @@ dependencies = [ "data_types", "futures", "google_types", - "influxdb_line_protocol", "num_cpus", "observability_deps", + "pbjson", + "pbjson_build", "proc-macro2", "prost", "prost-build", - "prost-types", "regex", "serde", "serde_json", @@ -1392,6 +1392,8 @@ version = "0.1.0" dependencies = [ "bytes", "chrono", + "pbjson", + "pbjson_build", "prost", "prost-build", "serde", @@ -2896,6 +2898,39 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" +[[package]] +name = "pbjson" +version = "0.1.0" +dependencies = [ + "base64 0.13.0", + "bytes", + "serde", +] + +[[package]] +name = "pbjson_build" +version = "0.1.0" +dependencies = [ + "heck", + "itertools 0.10.1", + "pbjson_test", + "prost", + "prost-types", + "tempfile", +] + +[[package]] +name = "pbjson_test" +version = "0.1.0" +dependencies = [ + "pbjson", + "pbjson_build", + "prost", + "prost-build", + "serde", + "serde_json", +] + [[package]] name = "peeking_take_while" version = "0.1.2" diff --git a/Cargo.toml b/Cargo.toml index a676c7c180..f2b5bd2b10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,9 @@ members = [ "observability_deps", "packers", "panic_logging", + "pbjson", + "pbjson_build", + "pbjson_test", "persistence_windows", "predicate", "query", diff --git a/data_types/src/job.rs b/data_types/src/job.rs index 2918c96e1c..33ca1165be 100644 --- a/data_types/src/job.rs +++ b/data_types/src/job.rs @@ -117,45 +117,3 @@ impl Job { } } } - -/// The status of a running operation -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] -pub enum OperationStatus { - /// A task associated with the operation is running - Running, - /// All tasks associated with the operation have finished successfully - Success, - /// The operation was cancelled and no associated tasks are running - Cancelled, - /// An operation error was returned - Errored, -} - -/// A group of asynchronous tasks being performed by an IOx server -/// -/// TODO: Temporary until prost adds JSON support -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Operation { - /// ID of the running operation - pub id: usize, - // The total number of created tasks - pub total_count: u64, - // The number of pending tasks - pub pending_count: u64, - // The number of tasks that completed successfully - pub success_count: u64, - // The number of tasks that returned an error - pub error_count: u64, - // The number of tasks that were cancelled - pub cancelled_count: u64, - // The number of tasks that did not run to completion (e.g. 
panic) - pub dropped_count: u64, - /// Wall time spent executing this operation - pub wall_time: std::time::Duration, - /// CPU time spent executing this operation - pub cpu_time: std::time::Duration, - /// Additional job metadata - pub job: Option, - /// The status of the running operation - pub status: OperationStatus, -} diff --git a/generated_types/Cargo.toml b/generated_types/Cargo.toml index 7158d32911..9def5d286e 100644 --- a/generated_types/Cargo.toml +++ b/generated_types/Cargo.toml @@ -7,16 +7,12 @@ edition = "2018" [dependencies] # In alphabetical order bytes = { version = "1.0", features = ["serde"] } data_types = { path = "../data_types" } -# See docs/regenerating_flatbuffers.md about updating generated code when updating the -# version of the flatbuffers crate -#flatbuffers = "2" futures = "0.3" google_types = { path = "../google_types" } -influxdb_line_protocol = { path = "../influxdb_line_protocol" } -observability_deps = { path = "../observability_deps" } num_cpus = "1.13.0" +observability_deps = { path = "../observability_deps" } +pbjson = { path = "../pbjson" } prost = "0.8" -prost-types = "0.8" regex = "1.4" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.67" @@ -31,3 +27,4 @@ chrono = { version = "0.4", features = ["serde"] } proc-macro2 = "=1.0.27" tonic-build = "0.5" prost-build = "0.8" +pbjson_build = { path = "../pbjson_build" } diff --git a/generated_types/build.rs b/generated_types/build.rs index 276e289e71..ff248f19b0 100644 --- a/generated_types/build.rs +++ b/generated_types/build.rs @@ -64,12 +64,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> { config .compile_well_known_types() .disable_comments(&[".google"]) - // approximates jsonpb. This is still not enough to deal with the special cases like Any - // Tracking issue for proper jsonpb support in prost: https://github.com/danburkert/prost/issues/277 - .type_attribute( - ".", - "#[derive(serde::Serialize,serde::Deserialize)] #[serde(rename_all = \"camelCase\")]", - ) .extern_path(".google.protobuf", "::google_types::protobuf") .bytes(&[".influxdata.iox.catalog.v1.AddParquet.metadata"]); @@ -79,5 +73,11 @@ fn generate_grpc_types(root: &Path) -> Result<()> { .format(true) .compile_with_config(config, &proto_files, &[root.into()])?; + let descriptor_set = std::fs::read(descriptor_path)?; + + pbjson_build::Builder::new() + .register_descriptors(&descriptor_set)? + .build(&[".influxdata", ".google.longrunning", ".google.rpc"])?; + Ok(()) } diff --git a/generated_types/protos/influxdata/iox/management/v1/jobs.proto b/generated_types/protos/influxdata/iox/management/v1/jobs.proto index 2e5985f144..5a021c3d1b 100644 --- a/generated_types/protos/influxdata/iox/management/v1/jobs.proto +++ b/generated_types/protos/influxdata/iox/management/v1/jobs.proto @@ -27,12 +27,11 @@ message OperationMetadata { // The number of tasks that did not run to completion (e.g. panic) uint64 dropped_count = 16; + reserved 6; + // What kind of job is it? 
   oneof job {
     Dummy dummy = 5;
-    /* historical artifact
-    PersistSegment persist_segment = 6;
-    */
     CloseChunk close_chunk = 7;
     WriteChunk write_chunk = 8;
     WipePreservedCatalog wipe_preserved_catalog = 9;
diff --git a/generated_types/src/google.rs b/generated_types/src/google.rs
index bc9211d0ee..9ab7287fef 100644
--- a/generated_types/src/google.rs
+++ b/generated_types/src/google.rs
@@ -5,10 +5,17 @@ pub use google_types::*;
 
 pub mod rpc {
     include!(concat!(env!("OUT_DIR"), "/google.rpc.rs"));
+    include!(concat!(env!("OUT_DIR"), "/google.rpc.serde.rs"));
 }
 
 pub mod longrunning {
     include!(concat!(env!("OUT_DIR"), "/google.longrunning.rs"));
+    include!(concat!(env!("OUT_DIR"), "/google.longrunning.serde.rs"));
+
+    use crate::google::{FieldViolation, FieldViolationExt};
+    use crate::influxdata::iox::management::v1::{OperationMetadata, OPERATION_METADATA};
+    use prost::{bytes::Bytes, Message};
+    use std::convert::TryFrom;
 
     impl Operation {
         /// Return the IOx operation `id`. This `id` can
@@ -19,6 +26,46 @@ pub mod longrunning {
                 .parse()
                 .expect("Internal error: id returned from server was not an integer")
         }
+
+        /// Decodes an IOx `OperationMetadata` metadata payload
+        pub fn iox_metadata(&self) -> Result<OperationMetadata, FieldViolation> {
+            let metadata = self
+                .metadata
+                .as_ref()
+                .ok_or_else(|| FieldViolation::required("metadata"))?;
+
+            if !crate::protobuf_type_url_eq(&metadata.type_url, OPERATION_METADATA) {
+                return Err(FieldViolation {
+                    field: "metadata.type_url".to_string(),
+                    description: "Unexpected field type".to_string(),
+                });
+            }
+
+            Message::decode(Bytes::clone(&metadata.value)).field("metadata.value")
+        }
+    }
+
+    /// Groups together an `Operation` with a decoded `OperationMetadata`
+    ///
+    /// When serialized this will serialize the encoded Any field on `Operation` along
+    /// with its decoded representation as `OperationMetadata`
+    #[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
+    pub struct IoxOperation {
+        /// The `Operation` message returned from the API
+        pub operation: Operation,
+        /// The decoded `Operation::metadata` contained within `IoxOperation::operation`
+        pub metadata: OperationMetadata,
+    }
+
+    impl TryFrom<Operation> for IoxOperation {
+        type Error = FieldViolation;
+
+        fn try_from(operation: Operation) -> Result<Self, Self::Error> {
+            Ok(Self {
+                metadata: operation.iox_metadata()?,
+                operation,
+            })
+        }
     }
 }
diff --git a/generated_types/src/job.rs b/generated_types/src/job.rs
index 499826f455..d3d1429875 100644
--- a/generated_types/src/job.rs
+++ b/generated_types/src/job.rs
@@ -1,11 +1,5 @@
-use crate::google::{longrunning, protobuf::Any, FieldViolation, FieldViolationExt};
 use crate::influxdata::iox::management::v1 as management;
-use crate::protobuf_type_url_eq;
-use data_types::chunk_metadata::ChunkAddr;
-use data_types::job::{Job, OperationStatus};
-use data_types::partition_metadata::PartitionAddr;
-use std::convert::TryFrom;
-use std::sync::Arc;
+use data_types::job::Job;
 
 impl From<Job> for management::operation_metadata::Job {
     fn from(job: Job) -> Self {
@@ -61,142 +55,3 @@ impl From<Job> for management::operation_metadata::Job {
         }
     }
 }
-
-impl From<management::operation_metadata::Job> for Job {
-    fn from(value: management::operation_metadata::Job) -> Self {
-        use management::operation_metadata::Job;
-        match value {
-            Job::Dummy(management::Dummy { nanos, db_name }) => Self::Dummy {
-                nanos,
-                db_name: (!db_name.is_empty()).then(|| Arc::from(db_name.as_str())),
-            },
-            Job::CloseChunk(management::CloseChunk {
-                db_name,
-                partition_key,
-                table_name,
-                chunk_id,
-            }) => Self::CompactChunk {
-                chunk: ChunkAddr {
-                    db_name:
Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - chunk_id, - }, - }, - Job::WriteChunk(management::WriteChunk { - db_name, - partition_key, - table_name, - chunk_id, - }) => Self::WriteChunk { - chunk: ChunkAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - chunk_id, - }, - }, - Job::WipePreservedCatalog(management::WipePreservedCatalog { db_name }) => { - Self::WipePreservedCatalog { - db_name: Arc::from(db_name.as_str()), - } - } - Job::CompactChunks(management::CompactChunks { - db_name, - partition_key, - table_name, - chunks, - }) => Self::CompactChunks { - partition: PartitionAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - }, - chunks, - }, - Job::PersistChunks(management::PersistChunks { - db_name, - partition_key, - table_name, - chunks, - }) => Self::PersistChunks { - partition: PartitionAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - }, - chunks, - }, - Job::DropChunk(management::DropChunk { - db_name, - partition_key, - table_name, - chunk_id, - }) => Self::DropChunk { - chunk: ChunkAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - chunk_id, - }, - }, - Job::DropPartition(management::DropPartition { - db_name, - partition_key, - table_name, - }) => Self::DropPartition { - partition: PartitionAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from(table_name.as_str()), - partition_key: Arc::from(partition_key.as_str()), - }, - }, - } - } -} - -impl TryFrom for data_types::job::Operation { - type Error = FieldViolation; - - fn try_from(operation: longrunning::Operation) -> Result { - let metadata: Any = operation - .metadata - .ok_or_else(|| FieldViolation::required("metadata"))?; - - if !protobuf_type_url_eq(&metadata.type_url, management::OPERATION_METADATA) { - return Err(FieldViolation { - field: "metadata.type_url".to_string(), - description: "Unexpected field type".to_string(), - }); - } - - let meta: management::OperationMetadata = - prost::Message::decode(metadata.value).field("metadata.value")?; - - let status = match &operation.result { - None => OperationStatus::Running, - Some(longrunning::operation::Result::Response(_)) => OperationStatus::Success, - Some(longrunning::operation::Result::Error(status)) => { - if status.code == tonic::Code::Cancelled as i32 { - OperationStatus::Cancelled - } else { - OperationStatus::Errored - } - } - }; - - Ok(Self { - id: operation.name.parse().field("name")?, - total_count: meta.total_count, - pending_count: meta.pending_count, - success_count: meta.success_count, - error_count: meta.error_count, - cancelled_count: meta.cancelled_count, - dropped_count: meta.dropped_count, - wall_time: std::time::Duration::from_nanos(meta.wall_nanos), - cpu_time: std::time::Duration::from_nanos(meta.cpu_nanos), - job: meta.job.map(Into::into), - status, - }) - } -} diff --git a/generated_types/src/lib.rs b/generated_types/src/lib.rs index 56f376a368..0176d23b84 100644 --- a/generated_types/src/lib.rs +++ b/generated_types/src/lib.rs @@ -10,6 +10,10 @@ pub mod influxdata { pub mod platform { pub mod storage { include!(concat!(env!("OUT_DIR"), 
"/influxdata.platform.storage.rs")); + include!(concat!( + env!("OUT_DIR"), + "/influxdata.platform.storage.serde.rs" + )); // Can't implement `Default` because `prost::Message` implements `Default` impl TimestampRange { @@ -27,6 +31,10 @@ pub mod influxdata { pub mod catalog { pub mod v1 { include!(concat!(env!("OUT_DIR"), "/influxdata.iox.catalog.v1.rs")); + include!(concat!( + env!("OUT_DIR"), + "/influxdata.iox.catalog.v1.serde.rs" + )); } } @@ -37,12 +45,20 @@ pub mod influxdata { "influxdata.iox.management.v1.OperationMetadata"; include!(concat!(env!("OUT_DIR"), "/influxdata.iox.management.v1.rs")); + include!(concat!( + env!("OUT_DIR"), + "/influxdata.iox.management.v1.serde.rs" + )); } } pub mod write { pub mod v1 { include!(concat!(env!("OUT_DIR"), "/influxdata.iox.write.v1.rs")); + include!(concat!( + env!("OUT_DIR"), + "/influxdata.iox.write.v1.serde.rs" + )); } } } @@ -50,6 +66,7 @@ pub mod influxdata { pub mod pbdata { pub mod v1 { include!(concat!(env!("OUT_DIR"), "/influxdata.pbdata.v1.rs")); + include!(concat!(env!("OUT_DIR"), "/influxdata.pbdata.v1.serde.rs")); } } } diff --git a/google_types/Cargo.toml b/google_types/Cargo.toml index cca54af075..1c18f2c3d7 100644 --- a/google_types/Cargo.toml +++ b/google_types/Cargo.toml @@ -8,8 +8,10 @@ edition = "2018" [dependencies] # In alphabetical order bytes = { version = "1.0", features = ["serde"] } chrono = "0.4" +pbjson = { path = "../pbjson" } prost = "0.8" serde = { version = "1.0", features = ["derive"] } [build-dependencies] # In alphabetical order prost-build = "0.8" +pbjson_build = { path = "../pbjson_build" } diff --git a/google_types/build.rs b/google_types/build.rs index df67311be4..28818c1e0d 100644 --- a/google_types/build.rs +++ b/google_types/build.rs @@ -1,6 +1,7 @@ //! Compiles Protocol Buffers and FlatBuffers schema definitions into //! native Rust types. +use std::env; use std::path::PathBuf; type Error = Box; @@ -16,16 +17,18 @@ fn main() -> Result<()> { println!("cargo:rerun-if-changed={}", proto_file.display()); } + let descriptor_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin"); prost_build::Config::new() + .file_descriptor_set_path(&descriptor_path) .compile_well_known_types() .disable_comments(&["."]) - // approximates jsonpb. This is still not enough to deal with the special cases like Any. - .type_attribute( - ".google", - "#[derive(serde::Serialize,serde::Deserialize)] #[serde(rename_all = \"camelCase\")]", - ) .bytes(&[".google"]) .compile_protos(&proto_files, &[root])?; + let descriptor_set = std::fs::read(descriptor_path)?; + pbjson_build::Builder::new() + .register_descriptors(&descriptor_set)? 
+        .build(&[".google"])?;
+
     Ok(())
 }
diff --git a/google_types/src/lib.rs b/google_types/src/lib.rs
index 13d3fad7e2..078916f103 100644
--- a/google_types/src/lib.rs
+++ b/google_types/src/lib.rs
@@ -17,6 +17,7 @@ mod pb {
     use std::convert::{TryFrom, TryInto};
 
     include!(concat!(env!("OUT_DIR"), "/google.protobuf.rs"));
+    include!(concat!(env!("OUT_DIR"), "/google.protobuf.serde.rs"));
 
     impl TryFrom<Duration> for std::time::Duration {
         type Error = std::num::TryFromIntError;
diff --git a/influxdb_iox_client/src/client/management.rs b/influxdb_iox_client/src/client/management.rs
index 5b6392281e..8a1ea54f9c 100644
--- a/influxdb_iox_client/src/client/management.rs
+++ b/influxdb_iox_client/src/client/management.rs
@@ -3,8 +3,8 @@ use thiserror::Error;
 
 use self::generated_types::{management_service_client::ManagementServiceClient, *};
 use crate::connection::Connection;
-use ::generated_types::google::longrunning::Operation;
+use crate::google::{longrunning::IoxOperation, FieldViolation};
 
 use std::convert::TryInto;
 use std::num::NonZeroU32;
@@ -200,6 +200,10 @@ pub enum CreateDummyJobError {
     #[error("Server returned an empty response")]
     EmptyResponse,
 
+    /// Response payload was invalid
+    #[error("Invalid response: {0}")]
+    InvalidResponse(#[from] FieldViolation),
+
     /// Client received an unexpected error from the server
     #[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
     ServerError(tonic::Status),
@@ -284,6 +288,10 @@ pub enum ClosePartitionChunkError {
     #[error("Server unavailable: {}", .0.message())]
     Unavailable(tonic::Status),
 
+    /// Response payload was invalid
+    #[error("Invalid response: {0}")]
+    InvalidResponse(#[from] FieldViolation),
+
     /// Client received an unexpected error from the server
     #[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
     ServerError(tonic::Status),
@@ -336,6 +344,10 @@ pub enum WipePersistedCatalogError {
     #[error("Server returned an empty response")]
     EmptyResponse,
 
+    /// Response payload was invalid
+    #[error("Invalid response: {0}")]
+    InvalidResponse(#[from] FieldViolation),
+
     /// Client received an unexpected error from the server
     #[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
     ServerError(tonic::Status),
@@ -842,7 +854,7 @@ impl Client {
     pub async fn create_dummy_job(
         &mut self,
         nanos: Vec<u64>,
-    ) -> Result<Operation, CreateDummyJobError> {
+    ) -> Result<IoxOperation, CreateDummyJobError> {
         let response = self
             .inner
             .create_dummy_job(CreateDummyJobRequest { nanos })
@@ -852,7 +864,8 @@ impl Client {
         Ok(response
             .into_inner()
             .operation
-            .ok_or(CreateDummyJobError::EmptyResponse)?)
+            .ok_or(CreateDummyJobError::EmptyResponse)?
+            .try_into()?)
     }
 
     /// Closes the specified chunk in the specified partition and
@@ -865,7 +878,7 @@ impl Client {
         table_name: impl Into<String> + Send,
         partition_key: impl Into<String> + Send,
         chunk_id: u32,
-    ) -> Result<Operation, ClosePartitionChunkError> {
+    ) -> Result<IoxOperation, ClosePartitionChunkError> {
         let db_name = db_name.into();
         let partition_key = partition_key.into();
         let table_name = table_name.into();
@@ -888,7 +901,8 @@ impl Client {
         Ok(response
             .into_inner()
             .operation
-            .ok_or(ClosePartitionChunkError::EmptyResponse)?)
+            .ok_or(ClosePartitionChunkError::EmptyResponse)?
+            .try_into()?)
     }
 
     /// Unload chunk from read buffer but keep it in object store.
@@ -929,7 +943,7 @@ impl Client {
     pub async fn wipe_persisted_catalog(
         &mut self,
         db_name: impl Into<String> + Send,
-    ) -> Result<Operation, WipePersistedCatalogError> {
+    ) -> Result<IoxOperation, WipePersistedCatalogError> {
         let db_name = db_name.into();
 
         let response = self
@@ -947,7 +961,8 @@ impl Client {
         Ok(response
             .into_inner()
             .operation
-            .ok_or(WipePersistedCatalogError::EmptyResponse)?)
+            .ok_or(WipePersistedCatalogError::EmptyResponse)?
+            .try_into()?)
     }
 
     /// Skip replay of an uninitialized database.
diff --git a/influxdb_iox_client/src/client/operations.rs b/influxdb_iox_client/src/client/operations.rs
index dcc0fb6fb0..781f2ec557 100644
--- a/influxdb_iox_client/src/client/operations.rs
+++ b/influxdb_iox_client/src/client/operations.rs
@@ -1,11 +1,9 @@
 use thiserror::Error;
 
-use ::generated_types::{
-    google::FieldViolation, influxdata::iox::management::v1 as management, protobuf_type_url_eq,
-};
-
 use self::generated_types::{operations_client::OperationsClient, *};
 use crate::connection::Connection;
 
+use std::convert::TryInto;
+
 /// Re-export generated_types
 pub mod generated_types {
     pub use generated_types::google::longrunning::*;
@@ -16,7 +14,7 @@ pub enum Error {
     /// Client received an invalid response
     #[error("Invalid server response: {}", .0)]
-    InvalidResponse(#[from] FieldViolation),
+    InvalidResponse(#[from] ::generated_types::google::FieldViolation),
 
     /// Operation was not found
     #[error("Operation not found: {}", .0)]
@@ -66,7 +64,7 @@ impl Client {
     }
 
     /// Get information of all client operation
-    pub async fn list_operations(&mut self) -> Result<Vec<ClientOperation>> {
+    pub async fn list_operations(&mut self) -> Result<Vec<IoxOperation>> {
         Ok(self
             .inner
             .list_operations(ListOperationsRequest::default())
@@ -75,12 +73,12 @@
             .into_inner()
             .operations
             .into_iter()
-            .map(|o| ClientOperation::try_new(o).unwrap())
-            .collect())
+            .map(TryInto::try_into)
+            .collect::<Result<_, _>>()?)
     }
 
     /// Get information about a specific operation
-    pub async fn get_operation(&mut self, id: usize) -> Result<Operation> {
+    pub async fn get_operation(&mut self, id: usize) -> Result<IoxOperation> {
         Ok(self
             .inner
             .get_operation(GetOperationRequest {
                 name: id.to_string(),
             })
             .await
             .map_err(|e| match e.code() {
                 tonic::Code::NotFound => Error::NotFound(id),
                 _ => Error::ServerError(e),
             })?
-            .into_inner())
+            .into_inner()
+            .try_into()?)
     }
 
     /// Cancel a given operation
@@ -115,7 +114,7 @@
         &mut self,
         id: usize,
         timeout: Option<std::time::Duration>,
-    ) -> Result<Operation> {
+    ) -> Result<IoxOperation> {
         Ok(self
             .inner
             .wait_operation(WaitOperationRequest {
@@ -127,50 +126,7 @@
                 tonic::Code::NotFound => Error::NotFound(id),
                 _ => Error::ServerError(e),
             })?
-            .into_inner())
-    }
-
-    /// Return the Client Operation
-    pub async fn client_operation(&mut self, id: usize) -> Result<ClientOperation> {
-        let operation = self.get_operation(id).await?;
-        ClientOperation::try_new(operation)
-    }
-}
-
-/// IOx's Client Operation
-#[derive(Debug, Clone)]
-pub struct ClientOperation {
-    inner: generated_types::Operation,
-}
-
-impl ClientOperation {
-    /// Create a new Cient Operation
-    pub fn try_new(operation: generated_types::Operation) -> Result<Self> {
-        if operation.metadata.is_some() {
-            let metadata = operation.metadata.clone().unwrap();
-            if !protobuf_type_url_eq(&metadata.type_url, management::OPERATION_METADATA) {
-                return Err(Error::WrongOperationMetaData);
-            }
-        } else {
-            return Err(Error::NotFound(0));
-        }
-
-        Ok(Self { inner: operation })
-    }
-
-    /// Return Metadata for this client operation
-    pub fn metadata(&self) -> management::OperationMetadata {
-        prost::Message::decode(self.inner.metadata.clone().unwrap().value)
-            .expect("failed to decode metadata")
-    }
-
-    /// Return name of this operation
-    pub fn name(&self) -> &str {
-        &self.inner.name
-    }
-
-    /// Return the inner's Operation
-    pub fn operation(self) -> Operation {
-        self.inner
+            .into_inner()
+            .try_into()?)
} } diff --git a/iox_data_generator/src/lib.rs b/iox_data_generator/src/lib.rs index b57edf3103..b14eeb0ec1 100644 --- a/iox_data_generator/src/lib.rs +++ b/iox_data_generator/src/lib.rs @@ -77,6 +77,15 @@ pub enum Error { /// Underlying `agent` module error that caused this problem source: agent::Error, }, + + /// Error that may happen when constructing an agent's writer + #[snafu(display("Could not create writer for agent `{}`, caused by:\n{}", name, source))] + CouldNotCreateAgentWriter { + /// The name of the relevant agent + name: String, + /// Underlying `write` module error that caused this problem + source: write::Error, + }, } type Result = std::result::Result; @@ -135,7 +144,9 @@ pub async fn generate( ) .context(CouldNotCreateAgent { name: &agent_name })?; - let agent_points_writer = points_writer_builder.build_for_agent(&agent_name); + let agent_points_writer = points_writer_builder + .build_for_agent(&agent_name) + .context(CouldNotCreateAgentWriter { name: &agent_name })?; handles.push(tokio::task::spawn(async move { agent.generate_all(agent_points_writer, batch_size).await diff --git a/iox_data_generator/src/write.rs b/iox_data_generator/src/write.rs index fb0966f8d0..ae68d93940 100644 --- a/iox_data_generator/src/write.rs +++ b/iox_data_generator/src/write.rs @@ -10,14 +10,23 @@ use std::{ }; use std::{ fs, - fs::OpenOptions, + fs::{File, OpenOptions}, + io::BufWriter, path::{Path, PathBuf}, }; -use tracing::info; /// Errors that may happen while writing points. #[derive(Snafu, Debug)] pub enum Error { + /// Error that may happen when writing line protocol to a file + #[snafu(display("Could open line protocol file {}: {}", filename.display(), source))] + CantOpenLineProtocolFile { + /// The location of the file we tried to open + filename: PathBuf, + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + /// Error that may happen when writing line protocol to a no-op sink #[snafu(display("Could not generate line protocol: {}", source))] CantWriteToNoOp { @@ -174,7 +183,7 @@ impl PointsWriterBuilder { /// Create a writer out of this writer's configuration for a particular /// agent that runs in a separate thread/task. 
- pub fn build_for_agent(&mut self, agent_name: &str) -> PointsWriter { + pub fn build_for_agent(&mut self, agent_name: &str) -> Result { let inner_writer = match &mut self.config { PointsWriterConfig::Api { client, @@ -189,7 +198,16 @@ impl PointsWriterBuilder { let mut filename = dir_path.clone(); filename.push(agent_name); filename.set_extension("txt"); - InnerPointsWriter::File(filename) + + let file = OpenOptions::new() + .append(true) + .create(true) + .open(&filename) + .context(CantOpenLineProtocolFile { filename })?; + + let file = BufWriter::new(file); + + InnerPointsWriter::File { file } } PointsWriterConfig::NoOp { perform_write } => InnerPointsWriter::NoOp { perform_write: *perform_write, @@ -204,7 +222,7 @@ impl PointsWriterBuilder { PointsWriterConfig::Stdout => InnerPointsWriter::Stdout, }; - PointsWriter { inner_writer } + Ok(PointsWriter { inner_writer }) } } @@ -228,7 +246,9 @@ enum InnerPointsWriter { org: String, bucket: String, }, - File(PathBuf), + File { + file: BufWriter, + }, NoOp { perform_write: bool, }, @@ -250,22 +270,12 @@ impl InnerPointsWriter { .await .context(CantWriteToApi)?; } - Self::File(filename) => { - info!("Opening file {:?}", filename); - let num_points = points.len(); - let file = OpenOptions::new() - .append(true) - .create(true) - .open(&filename) - .context(CantWriteToLineProtocolFile)?; - - let mut file = std::io::BufWriter::new(file); + Self::File { file } => { for point in points { point - .write_data_point_to(&mut file) + .write_data_point_to(&mut *file) .context(CantWriteToLineProtocolFile)?; } - info!("Wrote {} points to {:?}", num_points, filename); } Self::NoOp { perform_write } => { if *perform_write { diff --git a/parquet_file/src/catalog/cleanup.rs b/parquet_file/src/catalog/cleanup.rs index eb6d9d303e..85bb8f2e58 100644 --- a/parquet_file/src/catalog/cleanup.rs +++ b/parquet_file/src/catalog/cleanup.rs @@ -1,7 +1,6 @@ //! Methods to cleanup the object store. 
use std::{collections::HashSet, sync::Arc}; -use crate::catalog::api::{CatalogParquetInfo, CatalogState, PreservedCatalog}; use futures::TryStreamExt; use iox_object_store::{IoxObjectStore, ParquetFilePath}; use object_store::{ObjectStore, ObjectStoreApi}; @@ -9,6 +8,11 @@ use observability_deps::tracing::info; use parking_lot::Mutex; use snafu::{ResultExt, Snafu}; +use crate::catalog::{ + core::PreservedCatalog, + interface::{CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError}, +}; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Error from read operation while cleaning object store: {}", source))] @@ -22,7 +26,7 @@ pub enum Error { }, #[snafu(display("Error from catalog loading while cleaning object store: {}", source))] - CatalogLoadError { source: crate::catalog::api::Error }, + CatalogLoadError { source: crate::catalog::core::Error }, } pub type Result = std::result::Result; @@ -124,12 +128,12 @@ impl CatalogState for TracerCatalogState { &mut self, _iox_object_store: Arc, info: CatalogParquetInfo, - ) -> crate::catalog::api::Result<()> { + ) -> Result<(), CatalogStateAddError> { self.files.lock().insert(info.path); Ok(()) } - fn remove(&mut self, _path: &ParquetFilePath) -> crate::catalog::api::Result<()> { + fn remove(&mut self, _path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> { // Do NOT remove the file since we still need it for time travel Ok(()) } diff --git a/parquet_file/src/catalog/api.rs b/parquet_file/src/catalog/core.rs similarity index 95% rename from parquet_file/src/catalog/api.rs rename to parquet_file/src/catalog/core.rs index 507c974a7f..5550d6fdf6 100644 --- a/parquet_file/src/catalog/api.rs +++ b/parquet_file/src/catalog/core.rs @@ -1,10 +1,13 @@ //! Catalog preservation and transaction handling. 
use crate::{ - catalog::internals::{ - proto_io::{load_transaction_proto, store_transaction_proto}, - proto_parse, - types::{FileType, TransactionKey}, + catalog::{ + interface::{CatalogParquetInfo, CatalogState, CheckpointData}, + internals::{ + proto_io::{load_transaction_proto, store_transaction_proto}, + proto_parse, + types::{FileType, TransactionKey}, + }, }, metadata::IoxParquetMetaData, }; @@ -113,55 +116,9 @@ pub enum Error { #[snafu(display("Upgrade path not implemented/supported: {}", format))] UnsupportedUpgrade { format: String }, - #[snafu(display("Parquet already exists in catalog: {:?}", path))] - ParquetFileAlreadyExists { path: ParquetFilePath }, - - #[snafu(display("Parquet does not exist in catalog: {:?}", path))] - ParquetFileDoesNotExist { path: ParquetFilePath }, - #[snafu(display("Cannot decode parquet metadata: {}", source))] MetadataDecodingFailed { source: crate::metadata::Error }, - #[snafu( - display("Cannot extract metadata from {:?}: {}", path, source), - visibility(pub) - )] - MetadataExtractFailed { - source: crate::metadata::Error, - path: ParquetFilePath, - }, - - #[snafu( - display("Schema for {:?} does not work with existing schema: {}", path, source), - visibility(pub) - )] - SchemaError { - source: Box, - path: ParquetFilePath, - }, - - #[snafu( - display( - "Internal error: Using checkpoints from {:?} leads to broken replay plan: {}, catalog likely broken", - path, - source - ), - visibility(pub) - )] - ReplayPlanError { - source: Box, - path: ParquetFilePath, - }, - - #[snafu( - display("Cannot create parquet chunk from {:?}: {}", path, source), - visibility(pub) - )] - ChunkCreationFailed { - source: crate::chunk::Error, - path: ParquetFilePath, - }, - #[snafu(display("Catalog already exists"))] AlreadyExists {}, @@ -177,44 +134,20 @@ pub enum Error { #[snafu(display("Cannot commit transaction: {}", source))] CommitError { source: Box }, + + #[snafu(display("Cannot add parquet file during load: {}", source))] + AddError { + source: crate::catalog::interface::CatalogStateAddError, + }, + + #[snafu(display("Cannot remove parquet file during load: {}", source))] + RemoveError { + source: crate::catalog::interface::CatalogStateRemoveError, + }, } pub type Result = std::result::Result; -/// Struct containing all information that a catalog received for a new parquet file. -#[derive(Debug, Clone)] -pub struct CatalogParquetInfo { - /// Path within this database. - pub path: ParquetFilePath, - - /// Size of the parquet file, in bytes - pub file_size_bytes: usize, - - /// Associated parquet metadata. - pub metadata: Arc, -} - -/// Abstraction over how the in-memory state of the catalog works. -pub trait CatalogState { - /// Input to create a new empty instance. - /// - /// See [`new_empty`](Self::new_empty) for details. - type EmptyInput: Send; - - /// Create empty state w/o any known files. - fn new_empty(db_name: &str, data: Self::EmptyInput) -> Self; - - /// Add parquet file to state. - fn add( - &mut self, - iox_object_store: Arc, - info: CatalogParquetInfo, - ) -> Result<()>; - - /// Remove parquet file from state. - fn remove(&mut self, path: &ParquetFilePath) -> Result<()>; -} - /// In-memory view of the preserved catalog. 
pub struct PreservedCatalog { // We need an RWLock AND a semaphore, so that readers are NOT blocked during an open @@ -605,20 +538,22 @@ impl OpenTransaction { let metadata = Arc::new(metadata); - state.add( - Arc::clone(iox_object_store), - CatalogParquetInfo { - path, - file_size_bytes, - metadata, - }, - )?; + state + .add( + Arc::clone(iox_object_store), + CatalogParquetInfo { + path, + file_size_bytes, + metadata, + }, + ) + .context(AddError)?; } proto::transaction::action::Action::RemoveParquet(a) => { let path = proto_parse::parse_dirs_and_filename(a.path.as_ref().context(PathRequired)?) .context(ProtobufParseError)?; - state.remove(&path)?; + state.remove(&path).context(RemoveError)?; } }; Ok(()) @@ -737,20 +672,6 @@ impl OpenTransaction { } } -/// Structure that holds all information required to create a checkpoint. -/// -/// Note that while checkpoint are addressed using the same schema as we use for transaction -/// (revision counter, UUID), they contain the changes at the end (aka including) the transaction -/// they refer. -#[derive(Debug)] -pub struct CheckpointData { - /// List of all Parquet files that are currently (i.e. by the current version) tracked by the - /// catalog. - /// - /// If a file was once added but later removed it MUST NOT appear in the result. - pub files: HashMap, -} - /// Handle for an open uncommitted transaction. /// /// Dropping this object w/o calling [`commit`](Self::commit) will issue a warning. diff --git a/parquet_file/src/catalog/dump.rs b/parquet_file/src/catalog/dump.rs index 5fb7668e5c..55d83ffb20 100644 --- a/parquet_file/src/catalog/dump.rs +++ b/parquet_file/src/catalog/dump.rs @@ -223,8 +223,7 @@ mod tests { use crate::{ catalog::{ - api::{CatalogParquetInfo, PreservedCatalog}, - test_helpers::TestCatalogState, + core::PreservedCatalog, interface::CatalogParquetInfo, test_helpers::TestCatalogState, }, test_utils::{chunk_addr, make_iox_object_store, make_metadata, TestSize}, }; diff --git a/parquet_file/src/catalog/interface.rs b/parquet_file/src/catalog/interface.rs new file mode 100644 index 0000000000..d0b609d814 --- /dev/null +++ b/parquet_file/src/catalog/interface.rs @@ -0,0 +1,99 @@ +//! Abstract interfaces to make different users work with the perserved catalog. +use std::{collections::HashMap, sync::Arc}; + +use iox_object_store::{IoxObjectStore, ParquetFilePath}; +use snafu::Snafu; + +use crate::metadata::IoxParquetMetaData; + +/// Struct containing all information that a catalog received for a new parquet file. +#[derive(Debug, Clone)] +pub struct CatalogParquetInfo { + /// Path within this database. + pub path: ParquetFilePath, + + /// Size of the parquet file, in bytes + pub file_size_bytes: usize, + + /// Associated parquet metadata. 
+ pub metadata: Arc, +} + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum CatalogStateAddError { + #[snafu(display("Cannot extract metadata from {:?}: {}", path, source))] + MetadataExtractFailed { + source: crate::metadata::Error, + path: ParquetFilePath, + }, + + #[snafu(display("Schema for {:?} does not work with existing schema: {}", path, source))] + SchemaError { + source: Box, + path: ParquetFilePath, + }, + + #[snafu( + display( + "Internal error: Using checkpoints from {:?} leads to broken replay plan: {}, catalog likely broken", + path, + source + ), + )] + ReplayPlanError { + source: Box, + path: ParquetFilePath, + }, + + #[snafu(display("Cannot create parquet chunk from {:?}: {}", path, source))] + ChunkCreationFailed { + source: crate::chunk::Error, + path: ParquetFilePath, + }, + + #[snafu(display("Parquet already exists in catalog: {:?}", path))] + ParquetFileAlreadyExists { path: ParquetFilePath }, +} + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum CatalogStateRemoveError { + #[snafu(display("Parquet does not exist in catalog: {:?}", path))] + ParquetFileDoesNotExist { path: ParquetFilePath }, +} + +/// Abstraction over how the in-memory state of the catalog works. +pub trait CatalogState { + /// Input to create a new empty instance. + /// + /// See [`new_empty`](Self::new_empty) for details. + type EmptyInput: Send; + + /// Create empty state w/o any known files. + fn new_empty(db_name: &str, data: Self::EmptyInput) -> Self; + + /// Add parquet file to state. + fn add( + &mut self, + iox_object_store: Arc, + info: CatalogParquetInfo, + ) -> Result<(), CatalogStateAddError>; + + /// Remove parquet file from state. + fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError>; +} + +/// Structure that holds all information required to create a checkpoint. +/// +/// Note that while checkpoint are addressed using the same schema as we use for transaction +/// (revision counter, UUID), they contain the changes at the end (aka including) the transaction +/// they refer. +#[derive(Debug)] +pub struct CheckpointData { + /// List of all Parquet files that are currently (i.e. by the current version) tracked by the + /// catalog. + /// + /// If a file was once added but later removed it MUST NOT appear in the result. + pub files: HashMap, +} diff --git a/parquet_file/src/catalog/mod.rs b/parquet_file/src/catalog/mod.rs index 70132e8c7f..ebdb5ae680 100644 --- a/parquet_file/src/catalog/mod.rs +++ b/parquet_file/src/catalog/mod.rs @@ -1,6 +1,7 @@ -pub mod api; pub mod cleanup; +pub mod core; pub mod dump; +pub mod interface; mod internals; pub mod prune; pub mod rebuild; diff --git a/parquet_file/src/catalog/prune.rs b/parquet_file/src/catalog/prune.rs index 442747712b..4104ab9ae4 100644 --- a/parquet_file/src/catalog/prune.rs +++ b/parquet_file/src/catalog/prune.rs @@ -8,7 +8,7 @@ use object_store::{ObjectStore, ObjectStoreApi}; use snafu::{ResultExt, Snafu}; use crate::catalog::{ - api::{ProtoIOError, ProtoParseError}, + core::{ProtoIOError, ProtoParseError}, internals::{proto_io::load_transaction_proto, proto_parse::parse_timestamp}, }; @@ -33,7 +33,7 @@ pub enum Error { pub type Result = std::result::Result; -/// Prune history of [`PreservedCatalog`](crate::catalog::api::PreservedCatalog). +/// Prune history of [`PreservedCatalog`](crate::catalog::core::PreservedCatalog). /// /// This deletes all transactions and checkpoints that were started prior to `before`. 
Note that this only deletes data /// that is safe to delete when time travel to `before` is allowed. For example image the following transactions: @@ -133,8 +133,7 @@ fn is_checkpoint_or_zero(path: &TransactionFilePath) -> bool { mod tests { use crate::{ catalog::{ - api::{CheckpointData, PreservedCatalog}, - test_helpers::TestCatalogState, + core::PreservedCatalog, interface::CheckpointData, test_helpers::TestCatalogState, }, test_utils::make_iox_object_store, }; diff --git a/parquet_file/src/catalog/rebuild.rs b/parquet_file/src/catalog/rebuild.rs index 663741d0df..0c2fd528bd 100644 --- a/parquet_file/src/catalog/rebuild.rs +++ b/parquet_file/src/catalog/rebuild.rs @@ -7,13 +7,17 @@ use observability_deps::tracing::error; use snafu::{ResultExt, Snafu}; use crate::{ - catalog::api::{CatalogParquetInfo, CatalogState, PreservedCatalog}, + catalog::{ + core::PreservedCatalog, + interface::{CatalogParquetInfo, CatalogState}, + }, metadata::IoxParquetMetaData, }; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Cannot create new empty catalog: {}", source))] - NewEmptyFailure { source: crate::catalog::api::Error }, + NewEmptyFailure { source: crate::catalog::core::Error }, #[snafu(display("Cannot read store: {}", source))] ReadFailure { source: object_store::Error }, @@ -25,13 +29,15 @@ pub enum Error { }, #[snafu(display("Cannot add file to transaction: {}", source))] - FileRecordFailure { source: crate::catalog::api::Error }, + FileRecordFailure { + source: crate::catalog::interface::CatalogStateAddError, + }, #[snafu(display("Cannot commit transaction: {}", source))] - CommitFailure { source: crate::catalog::api::Error }, + CommitFailure { source: crate::catalog::core::Error }, #[snafu(display("Cannot create checkpoint: {}", source))] - CheckpointFailure { source: crate::catalog::api::Error }, + CheckpointFailure { source: crate::catalog::core::Error }, } pub type Result = std::result::Result; @@ -164,7 +170,7 @@ async fn read_parquet( mod tests { use super::*; use crate::{ - catalog::{api::PreservedCatalog, test_helpers::TestCatalogState}, + catalog::{core::PreservedCatalog, test_helpers::TestCatalogState}, metadata::IoxMetadata, storage::{MemWriter, Storage}, test_utils::{ diff --git a/parquet_file/src/catalog/test_helpers.rs b/parquet_file/src/catalog/test_helpers.rs index 804b51ad02..ba2f4b5c46 100644 --- a/parquet_file/src/catalog/test_helpers.rs +++ b/parquet_file/src/catalog/test_helpers.rs @@ -12,7 +12,11 @@ use snafu::ResultExt; use crate::{ catalog::{ - api::{CatalogParquetInfo, CatalogState, CheckpointData, PreservedCatalog}, + core::PreservedCatalog, + interface::{ + CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError, + CheckpointData, + }, internals::{ proto_io::{load_transaction_proto, store_transaction_proto}, types::TransactionKey, @@ -61,8 +65,8 @@ impl TestCatalogState { } /// Inserts a file into this catalog state - pub fn insert(&mut self, info: CatalogParquetInfo) -> crate::catalog::api::Result<()> { - use crate::catalog::api::{Error, MetadataExtractFailed}; + pub fn insert(&mut self, info: CatalogParquetInfo) -> Result<(), CatalogStateAddError> { + use crate::catalog::interface::MetadataExtractFailed; let iox_md = info .metadata @@ -80,7 +84,7 @@ impl TestCatalogState { match partition.chunks.entry(iox_md.chunk_id) { Occupied(o) => { - return Err(Error::ParquetFileAlreadyExists { + return Err(CatalogStateAddError::ParquetFileAlreadyExists { path: o.get().path.clone(), }); } @@ -104,13 +108,11 @@ impl CatalogState for 
TestCatalogState { &mut self, _iox_object_store: Arc, info: CatalogParquetInfo, - ) -> crate::catalog::api::Result<()> { + ) -> Result<(), CatalogStateAddError> { self.insert(info) } - fn remove(&mut self, path: &ParquetFilePath) -> crate::catalog::api::Result<()> { - use crate::catalog::api::Error; - + fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> { let partitions = self .tables .values_mut() @@ -136,7 +138,7 @@ impl CatalogState for TestCatalogState { } match removed { - 0 => Err(Error::ParquetFileDoesNotExist { path: path.clone() }), + 0 => Err(CatalogStateRemoveError::ParquetFileDoesNotExist { path: path.clone() }), _ => Ok(()), } } @@ -173,8 +175,6 @@ where S: CatalogState + Debug + Send + Sync, F: Fn(&S) -> CheckpointData + Send, { - use crate::catalog::api::Error; - // empty state let iox_object_store = make_iox_object_store().await; let (_catalog, mut state) = @@ -317,11 +317,17 @@ where }, ) .unwrap_err(); - assert!(matches!(err, Error::ParquetFileAlreadyExists { .. })); + assert!(matches!( + err, + CatalogStateAddError::ParquetFileAlreadyExists { .. } + )); // does not exist as has a different UUID let err = state.remove(&path).unwrap_err(); - assert!(matches!(err, Error::ParquetFileDoesNotExist { .. })); + assert!(matches!( + err, + CatalogStateRemoveError::ParquetFileDoesNotExist { .. } + )); } assert_checkpoint(&state, &f, &expected); @@ -340,7 +346,10 @@ where }, ) .unwrap_err(); - assert!(matches!(err, Error::ParquetFileAlreadyExists { .. })); + assert!(matches!( + err, + CatalogStateAddError::ParquetFileAlreadyExists { .. } + )); // this transaction will still work let (path, metadata) = @@ -369,12 +378,18 @@ where }, ) .unwrap_err(); - assert!(matches!(err, Error::ParquetFileAlreadyExists { .. })); + assert!(matches!( + err, + CatalogStateAddError::ParquetFileAlreadyExists { .. } + )); // does not exist - as different UUID let path = ParquetFilePath::new(&chunk_addr(7)); let err = state.remove(&path).unwrap_err(); - assert!(matches!(err, Error::ParquetFileDoesNotExist { .. })); + assert!(matches!( + err, + CatalogStateRemoveError::ParquetFileDoesNotExist { .. } + )); // this still works let (path, _) = expected.remove(&7).unwrap(); @@ -382,7 +397,10 @@ where // recently removed let err = state.remove(&path).unwrap_err(); - assert!(matches!(err, Error::ParquetFileDoesNotExist { .. })); + assert!(matches!( + err, + CatalogStateRemoveError::ParquetFileDoesNotExist { .. } + )); } assert_checkpoint(&state, &f, &expected); } diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index f530cfcfc8..450958bd55 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -120,7 +120,7 @@ use thrift::protocol::{TCompactInputProtocol, TCompactOutputProtocol, TOutputPro /// For breaking changes, this will change. /// /// **Important: When changing this structure, consider bumping the -/// [catalog transaction version](crate::catalog::api::TRANSACTION_VERSION)!** +/// [catalog transaction version](crate::catalog::core::TRANSACTION_VERSION)!** pub const METADATA_VERSION: u32 = 6; /// File-level metadata key to store the IOx-specific data. 
diff --git a/pbjson/Cargo.toml b/pbjson/Cargo.toml
new file mode 100644
index 0000000000..04ce2185f3
--- /dev/null
+++ b/pbjson/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "pbjson"
+version = "0.1.0"
+authors = ["Raphael Taylor-Davies "]
+edition = "2018"
+description = "Utilities for pbjson conversion"
+
+[dependencies]
+
+serde = { version = "1.0", features = ["derive"] }
+base64 = "0.13"
+
+[dev-dependencies]
+bytes = "1.0"
diff --git a/pbjson/src/lib.rs b/pbjson/src/lib.rs
new file mode 100644
index 0000000000..cb9bdd0b3b
--- /dev/null
+++ b/pbjson/src/lib.rs
@@ -0,0 +1,82 @@
+#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)]
+#![warn(
+    missing_debug_implementations,
+    clippy::explicit_iter_loop,
+    clippy::use_self,
+    clippy::clone_on_ref_ptr,
+    clippy::future_not_send
+)]
+
+#[doc(hidden)]
+pub mod private {
+    /// Re-export base64
+    pub use base64;
+
+    use serde::Deserialize;
+    use std::str::FromStr;
+
+    /// Used to parse a number from either a string or its raw representation
+    #[derive(Debug, Copy, Clone, PartialOrd, PartialEq, Hash, Ord, Eq)]
+    pub struct NumberDeserialize<T>(pub T);
+
+    #[derive(Deserialize)]
+    #[serde(untagged)]
+    enum Content<'a, T> {
+        Str(&'a str),
+        Number(T),
+    }
+
+    impl<'de, T> serde::Deserialize<'de> for NumberDeserialize<T>
+    where
+        T: FromStr + serde::Deserialize<'de>,
+        <T as FromStr>::Err: std::error::Error,
+    {
+        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+        where
+            D: serde::Deserializer<'de>,
+        {
+            let content = Content::deserialize(deserializer)?;
+            Ok(Self(match content {
+                Content::Str(v) => v.parse().map_err(serde::de::Error::custom)?,
+                Content::Number(v) => v,
+            }))
+        }
+    }
+
+    #[derive(Debug, Copy, Clone, PartialOrd, PartialEq, Hash, Ord, Eq)]
+    pub struct BytesDeserialize<T>(pub T);
+
+    impl<'de, T> Deserialize<'de> for BytesDeserialize<T>
+    where
+        T: From<Vec<u8>>,
+    {
+        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+        where
+            D: serde::Deserializer<'de>,
+        {
+            let s: &str = Deserialize::deserialize(deserializer)?;
+            let decoded = base64::decode(s).map_err(serde::de::Error::custom)?;
+            Ok(Self(decoded.into()))
+        }
+    }
+
+    #[cfg(test)]
+    mod tests {
+        use super::*;
+        use bytes::Bytes;
+        use serde::de::value::{BorrowedStrDeserializer, Error};
+
+        #[test]
+        fn test_bytes() {
+            let raw = vec![2, 5, 62, 2, 5, 7, 8, 43, 5, 8, 4, 23, 5, 7, 7, 3, 2, 5, 196];
+            let encoded = base64::encode(&raw);
+
+            let deserializer = BorrowedStrDeserializer::<'_, Error>::new(&encoded);
+            let a: Bytes = BytesDeserialize::deserialize(deserializer).unwrap().0;
+            let b: Vec<u8> = BytesDeserialize::deserialize(deserializer).unwrap().0;
+
+            assert_eq!(raw.as_slice(), &a);
+            assert_eq!(raw.as_slice(), &b);
+        }
+    }
+}
diff --git a/pbjson_build/Cargo.toml b/pbjson_build/Cargo.toml
new file mode 100644
index 0000000000..41983411dc
--- /dev/null
+++ b/pbjson_build/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "pbjson_build"
+version = "0.1.0"
+authors = ["Raphael Taylor-Davies "]
+edition = "2018"
+description = "Generates Serialize and Deserialize implementations for prost message types"
+
+[dependencies]
+
+heck = "0.3"
+prost = "0.8"
+prost-types = "0.8"
+itertools = "0.10"
+
+[dev-dependencies]
+tempfile = "3.1"
+pbjson_test = { path = "../pbjson_test" }
\ No newline at end of file
diff --git a/pbjson_build/src/descriptor.rs b/pbjson_build/src/descriptor.rs
new file mode 100644
index 0000000000..16501ea93a
--- /dev/null
+++ b/pbjson_build/src/descriptor.rs
@@ -0,0 +1,260 @@
+//! This module contains code to parse and extract the protobuf descriptor
+//!
format for use by the rest of the codebase + +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; +use std::io::{Error, ErrorKind, Result}; + +use itertools::{EitherOrBoth, Itertools}; +use prost_types::{ + DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto, + FileDescriptorSet, MessageOptions, OneofDescriptorProto, +}; + +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct Package(String); + +impl Display for Package { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl Package { + pub fn new(s: impl Into) -> Self { + let s = s.into(); + assert!( + !s.starts_with('.'), + "package cannot start with \'.\', got \"{}\"", + s + ); + Self(s) + } +} + +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct TypeName(String); + +impl Display for TypeName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl TypeName { + pub fn new(s: impl Into) -> Self { + let s = s.into(); + assert!( + !s.contains('.'), + "type name cannot contain \'.\', got \"{}\"", + s + ); + Self(s) + } + + pub fn to_snake_case(&self) -> String { + use heck::SnakeCase; + self.0.to_snake_case() + } + + pub fn to_camel_case(&self) -> String { + use heck::CamelCase; + self.0.to_camel_case() + } + + pub fn to_shouty_snake_case(&self) -> String { + use heck::ShoutySnakeCase; + self.0.to_shouty_snake_case() + } +} + +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct TypePath { + package: Package, + path: Vec, +} + +impl Display for TypePath { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.package.fmt(f)?; + for element in &self.path { + write!(f, ".{}", element)?; + } + Ok(()) + } +} + +impl TypePath { + pub fn new(package: Package) -> Self { + Self { + package, + path: Default::default(), + } + } + + pub fn package(&self) -> &Package { + &self.package + } + + pub fn path(&self) -> &[TypeName] { + self.path.as_slice() + } + + pub fn child(&self, name: TypeName) -> Self { + let path = self + .path + .iter() + .cloned() + .chain(std::iter::once(name)) + .collect(); + Self { + package: self.package.clone(), + path, + } + } + + pub fn matches_prefix(&self, prefix: &str) -> bool { + let prefix = match prefix.strip_prefix('.') { + Some(prefix) => prefix, + None => return false, + }; + + if prefix.len() <= self.package.0.len() { + return self.package.0.starts_with(prefix); + } + + match prefix.strip_prefix(&self.package.0) { + Some(prefix) => { + let split = prefix.split('.').skip(1); + for zipped in self.path.iter().zip_longest(split) { + match zipped { + EitherOrBoth::Both(a, b) if a.0.as_str() == b => continue, + EitherOrBoth::Left(_) => return true, + _ => return false, + } + } + true + } + None => false, + } + } +} + +#[derive(Debug, Clone, Default)] +pub struct DescriptorSet { + descriptors: BTreeMap, +} + +impl DescriptorSet { + pub fn new() -> Self { + Self::default() + } + + pub fn register_encoded(&mut self, encoded: &[u8]) -> Result<()> { + let descriptors: FileDescriptorSet = + prost::Message::decode(encoded).map_err(|e| Error::new(ErrorKind::InvalidData, e))?; + + for file in descriptors.file { + let syntax = match file.syntax.as_deref() { + None | Some("proto2") => Syntax::Proto2, + Some("proto3") => Syntax::Proto3, + Some(s) => panic!("unknown syntax: {}", s), + }; + + let package = Package::new(file.package.expect("expected package")); + let path = 
TypePath::new(package); + + for descriptor in file.message_type { + self.register_message(&path, descriptor, syntax) + } + + for descriptor in file.enum_type { + self.register_enum(&path, descriptor) + } + } + + Ok(()) + } + + pub fn iter(&self) -> impl Iterator { + self.descriptors.iter() + } + + fn register_message(&mut self, path: &TypePath, descriptor: DescriptorProto, syntax: Syntax) { + let name = TypeName::new(descriptor.name.expect("expected name")); + let child_path = path.child(name); + + for child_descriptor in descriptor.enum_type { + self.register_enum(&child_path, child_descriptor) + } + + for child_descriptor in descriptor.nested_type { + self.register_message(&child_path, child_descriptor, syntax) + } + + self.register_descriptor( + child_path.clone(), + Descriptor::Message(MessageDescriptor { + path: child_path, + options: descriptor.options, + one_of: descriptor.oneof_decl, + fields: descriptor.field, + syntax, + }), + ); + } + + fn register_enum(&mut self, path: &TypePath, descriptor: EnumDescriptorProto) { + let name = TypeName::new(descriptor.name.expect("expected name")); + self.register_descriptor( + path.child(name), + Descriptor::Enum(EnumDescriptor { + values: descriptor.value, + }), + ); + } + + fn register_descriptor(&mut self, path: TypePath, descriptor: Descriptor) { + match self.descriptors.entry(path) { + Entry::Occupied(o) => panic!("descriptor already registered for {}", o.key()), + Entry::Vacant(v) => v.insert(descriptor), + }; + } +} + +#[derive(Debug, Clone, Copy)] +pub enum Syntax { + Proto2, + Proto3, +} + +#[derive(Debug, Clone)] +pub enum Descriptor { + Enum(EnumDescriptor), + Message(MessageDescriptor), +} + +#[derive(Debug, Clone)] +pub struct EnumDescriptor { + pub values: Vec, +} + +#[derive(Debug, Clone)] +pub struct MessageDescriptor { + pub path: TypePath, + pub options: Option, + pub one_of: Vec, + pub fields: Vec, + pub syntax: Syntax, +} + +impl MessageDescriptor { + /// Whether this is an auto-generated type for the map field + pub fn is_map(&self) -> bool { + self.options + .as_ref() + .and_then(|options| options.map_entry) + .unwrap_or(false) + } +} diff --git a/pbjson_build/src/escape.rs b/pbjson_build/src/escape.rs new file mode 100644 index 0000000000..554ac85eec --- /dev/null +++ b/pbjson_build/src/escape.rs @@ -0,0 +1,26 @@ +///! Contains code to escape strings to avoid collisions with reserved Rust keywords + +pub fn escape_ident(mut ident: String) -> String { + // Copied from prost-build::ident + // + // Use a raw identifier if the identifier matches a Rust keyword: + // https://doc.rust-lang.org/reference/keywords.html. + match ident.as_str() { + // 2015 strict keywords. + | "as" | "break" | "const" | "continue" | "else" | "enum" | "false" + | "fn" | "for" | "if" | "impl" | "in" | "let" | "loop" | "match" | "mod" | "move" | "mut" + | "pub" | "ref" | "return" | "static" | "struct" | "trait" | "true" + | "type" | "unsafe" | "use" | "where" | "while" + // 2018 strict keywords. + | "dyn" + // 2015 reserved keywords. + | "abstract" | "become" | "box" | "do" | "final" | "macro" | "override" | "priv" | "typeof" + | "unsized" | "virtual" | "yield" + // 2018 reserved keywords. + | "async" | "await" | "try" => ident.insert_str(0, "r#"), + // the following keywords are not supported as raw identifiers and are therefore suffixed with an underscore. 
+ "self" | "super" | "extern" | "crate" => ident += "_", + _ => (), + }; + ident +} diff --git a/pbjson_build/src/generator.rs b/pbjson_build/src/generator.rs new file mode 100644 index 0000000000..f9d205998d --- /dev/null +++ b/pbjson_build/src/generator.rs @@ -0,0 +1,129 @@ +//! This module contains the actual code generation logic + +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; +use std::io::{Result, Write}; + +use crate::descriptor::TypePath; + +mod enumeration; +mod message; + +pub use enumeration::generate_enum; +pub use message::generate_message; + +#[derive(Debug, Clone, Copy)] +struct Indent(usize); + +impl Display for Indent { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + for _ in 0..self.0 { + write!(f, " ")?; + } + Ok(()) + } +} + +#[derive(Debug)] +pub struct Config { + pub extern_types: BTreeMap, +} + +impl Config { + fn rust_type(&self, path: &TypePath) -> String { + if let Some(t) = self.extern_types.get(path) { + return t.clone(); + } + + let mut ret = String::new(); + let path = path.path(); + assert!(!path.is_empty(), "path cannot be empty"); + + for i in &path[..(path.len() - 1)] { + ret.push_str(i.to_snake_case().as_str()); + ret.push_str("::"); + } + ret.push_str(path.last().unwrap().to_camel_case().as_str()); + ret + } + + fn rust_variant(&self, enumeration: &TypePath, variant: &str) -> String { + use heck::CamelCase; + assert!( + variant + .chars() + .all(|c| matches!(c, '0'..='9' | 'A'..='Z' | '_')), + "illegal variant - {}", + variant + ); + + // TODO: Config to disable stripping prefix + + let enumeration_name = enumeration.path().last().unwrap().to_shouty_snake_case(); + let variant = match variant.strip_prefix(&enumeration_name) { + Some("") => variant, + Some(stripped) => stripped, + None => variant, + }; + variant.to_camel_case() + } +} + +fn write_fields_array<'a, W: Write, I: Iterator>( + writer: &mut W, + indent: usize, + variants: I, +) -> Result<()> { + writeln!(writer, "{}const FIELDS: &[&str] = &[", Indent(indent))?; + for name in variants { + writeln!(writer, "{}\"{}\",", Indent(indent + 1), name)?; + } + writeln!(writer, "{}];", Indent(indent))?; + writeln!(writer) +} + +fn write_serialize_start(indent: usize, rust_type: &str, writer: &mut W) -> Result<()> { + writeln!( + writer, + r#"{indent}impl serde::Serialize for {rust_type} {{ +{indent} #[allow(deprecated)] +{indent} fn serialize(&self, serializer: S) -> Result +{indent} where +{indent} S: serde::Serializer, +{indent} {{"#, + indent = Indent(indent), + rust_type = rust_type + ) +} + +fn write_serialize_end(indent: usize, writer: &mut W) -> Result<()> { + writeln!( + writer, + r#"{indent} }} +{indent}}}"#, + indent = Indent(indent), + ) +} + +fn write_deserialize_start(indent: usize, rust_type: &str, writer: &mut W) -> Result<()> { + writeln!( + writer, + r#"{indent}impl<'de> serde::Deserialize<'de> for {rust_type} {{ +{indent} #[allow(deprecated)] +{indent} fn deserialize(deserializer: D) -> Result +{indent} where +{indent} D: serde::Deserializer<'de>, +{indent} {{"#, + indent = Indent(indent), + rust_type = rust_type + ) +} + +fn write_deserialize_end(indent: usize, writer: &mut W) -> Result<()> { + writeln!( + writer, + r#"{indent} }} +{indent}}}"#, + indent = Indent(indent), + ) +} diff --git a/pbjson_build/src/generator/enumeration.rs b/pbjson_build/src/generator/enumeration.rs new file mode 100644 index 0000000000..d5e2736bd1 --- /dev/null +++ b/pbjson_build/src/generator/enumeration.rs @@ -0,0 +1,138 @@ +//! 
This module contains the code to generate Serialize and Deserialize +//! implementations for enumeration type +//! +//! An enumeration should be decode-able from the full string variant name +//! or its integer tag number, and should encode to the string representation + +use super::{ + write_deserialize_end, write_deserialize_start, write_serialize_end, write_serialize_start, + Config, Indent, +}; +use crate::descriptor::{EnumDescriptor, TypePath}; +use crate::generator::write_fields_array; +use std::io::{Result, Write}; + +pub fn generate_enum( + config: &Config, + path: &TypePath, + descriptor: &EnumDescriptor, + writer: &mut W, +) -> Result<()> { + let rust_type = config.rust_type(path); + + let variants: Vec<_> = descriptor + .values + .iter() + .map(|variant| { + let variant_name = variant.name.clone().unwrap(); + let rust_variant = config.rust_variant(path, &variant_name); + (variant_name, rust_variant) + }) + .collect(); + + // Generate Serialize + write_serialize_start(0, &rust_type, writer)?; + writeln!(writer, "{}let variant = match self {{", Indent(2))?; + for (variant_name, rust_variant) in &variants { + writeln!( + writer, + "{}Self::{} => \"{}\",", + Indent(3), + rust_variant, + variant_name + )?; + } + writeln!(writer, "{}}};", Indent(2))?; + + writeln!(writer, "{}serializer.serialize_str(variant)", Indent(2))?; + write_serialize_end(0, writer)?; + + // Generate Deserialize + write_deserialize_start(0, &rust_type, writer)?; + write_fields_array(writer, 2, variants.iter().map(|(name, _)| name.as_str()))?; + write_visitor(writer, 2, &rust_type, &variants)?; + + // Use deserialize_any to allow users to provide integers or strings + writeln!( + writer, + "{}deserializer.deserialize_any(GeneratedVisitor)", + Indent(2) + )?; + + write_deserialize_end(0, writer)?; + Ok(()) +} + +fn write_visitor( + writer: &mut W, + indent: usize, + rust_type: &str, + variants: &[(String, String)], +) -> Result<()> { + // Protobuf supports deserialization of enumerations both from string and integer values + writeln!( + writer, + r#"{indent}struct GeneratedVisitor; + +{indent}impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{ +{indent} type Value = {rust_type}; + +{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{ +{indent} write!(formatter, "expected one of: {{:?}}", &FIELDS) +{indent} }} + +{indent} fn visit_i64(self, v: i64) -> Result +{indent} where +{indent} E: serde::de::Error, +{indent} {{ +{indent} use std::convert::TryFrom; +{indent} i32::try_from(v) +{indent} .ok() +{indent} .and_then({rust_type}::from_i32) +{indent} .ok_or_else(|| {{ +{indent} serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self) +{indent} }}) +{indent} }} + +{indent} fn visit_u64(self, v: u64) -> Result +{indent} where +{indent} E: serde::de::Error, +{indent} {{ +{indent} use std::convert::TryFrom; +{indent} i32::try_from(v) +{indent} .ok() +{indent} .and_then({rust_type}::from_i32) +{indent} .ok_or_else(|| {{ +{indent} serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self) +{indent} }}) +{indent} }} + +{indent} fn visit_str(self, value: &str) -> Result +{indent} where +{indent} E: serde::de::Error, +{indent} {{"#, + indent = Indent(indent), + rust_type = rust_type, + )?; + + writeln!(writer, "{}match value {{", Indent(indent + 2))?; + for (variant_name, rust_variant) in variants { + writeln!( + writer, + "{}\"{}\" => Ok({}::{}),", + Indent(indent + 3), + variant_name, + rust_type, + rust_variant + )?; + } + + writeln!( + writer, + 
"{indent}_ => Err(serde::de::Error::unknown_variant(value, FIELDS)),", + indent = Indent(indent + 3) + )?; + writeln!(writer, "{}}}", Indent(indent + 2))?; + writeln!(writer, "{}}}", Indent(indent + 1))?; + writeln!(writer, "{}}}", Indent(indent)) +} diff --git a/pbjson_build/src/generator/message.rs b/pbjson_build/src/generator/message.rs new file mode 100644 index 0000000000..9625554af3 --- /dev/null +++ b/pbjson_build/src/generator/message.rs @@ -0,0 +1,809 @@ +//! This module contains the code to generate Serialize and Deserialize +//! implementations for message types +//! +//! The implementation follows the proto3 [JSON mapping][1] with the default options +//! +//! Importantly: +//! - numeric types can be decoded from either a string or number +//! - 32-bit integers and floats are encoded as numbers +//! - 64-bit integers are encoded as strings +//! - repeated fields are encoded as arrays +//! - bytes are base64 encoded (NOT CURRENTLY SUPPORTED) +//! - messages and maps are encoded as objects +//! - fields are lowerCamelCase except where overridden by the proto definition +//! - default values are not emitted on encode +//! - unrecognised fields error on decode +//! +//! Note: This will not generate code to correctly serialize/deserialize well-known-types +//! such as google.protobuf.Any, google.protobuf.Duration, etc... conversions for these +//! special-cased messages will need to be manually implemented. Once done so, however, +//! any messages containing these types will serialize/deserialize correctly +//! +//! [1]: https://developers.google.com/protocol-buffers/docs/proto3#json + +use std::io::{Result, Write}; + +use crate::message::{Field, FieldModifier, FieldType, Message, OneOf, ScalarType}; + +use super::{ + write_deserialize_end, write_deserialize_start, write_serialize_end, write_serialize_start, + Config, Indent, +}; +use crate::descriptor::TypePath; +use crate::generator::write_fields_array; + +pub fn generate_message( + config: &Config, + message: &Message, + writer: &mut W, +) -> Result<()> { + let rust_type = config.rust_type(&message.path); + + // Generate Serialize + write_serialize_start(0, &rust_type, writer)?; + write_message_serialize(config, 2, message, writer)?; + write_serialize_end(0, writer)?; + + // Generate Deserialize + write_deserialize_start(0, &rust_type, writer)?; + write_deserialize_message(config, 2, message, &rust_type, writer)?; + write_deserialize_end(0, writer)?; + Ok(()) +} + +fn write_field_empty_predicate(member: &Field, writer: &mut W) -> Result<()> { + match (&member.field_type, &member.field_modifier) { + (_, FieldModifier::Required) => unreachable!(), + (_, FieldModifier::Repeated) + | (FieldType::Map(_, _), _) + | (FieldType::Scalar(ScalarType::String), FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::Bytes), FieldModifier::UseDefault) => { + write!(writer, "!self.{}.is_empty()", member.rust_field_name()) + } + (_, FieldModifier::Optional) | (FieldType::Message(_), _) => { + write!(writer, "self.{}.is_some()", member.rust_field_name()) + } + (FieldType::Scalar(ScalarType::F64), FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::F32), FieldModifier::UseDefault) => { + write!(writer, "self.{} != 0.", member.rust_field_name()) + } + (FieldType::Scalar(ScalarType::Bool), FieldModifier::UseDefault) => { + write!(writer, "self.{}", member.rust_field_name()) + } + (FieldType::Enum(_), FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::I64), FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::I32), 
FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::U32), FieldModifier::UseDefault) + | (FieldType::Scalar(ScalarType::U64), FieldModifier::UseDefault) => { + write!(writer, "self.{} != 0", member.rust_field_name()) + } + } +} + +fn write_message_serialize( + config: &Config, + indent: usize, + message: &Message, + writer: &mut W, +) -> Result<()> { + write_struct_serialize_start(indent, message, writer)?; + + for field in &message.fields { + write_serialize_field(config, indent, field, writer)?; + } + + for one_of in &message.one_ofs { + write_serialize_one_of(indent, config, one_of, writer)?; + } + + write_struct_serialize_end(indent, writer) +} + +fn write_struct_serialize_start( + indent: usize, + message: &Message, + writer: &mut W, +) -> Result<()> { + writeln!(writer, "{}use serde::ser::SerializeStruct;", Indent(indent))?; + + let required_len = message + .fields + .iter() + .filter(|member| member.field_modifier.is_required()) + .count(); + + if required_len != message.fields.len() || !message.one_ofs.is_empty() { + writeln!(writer, "{}let mut len = {};", Indent(indent), required_len)?; + } else { + writeln!(writer, "{}let len = {};", Indent(indent), required_len)?; + } + + for field in &message.fields { + if field.field_modifier.is_required() { + continue; + } + write!(writer, "{}if ", Indent(indent))?; + write_field_empty_predicate(field, writer)?; + writeln!(writer, " {{")?; + writeln!(writer, "{}len += 1;", Indent(indent + 1))?; + writeln!(writer, "{}}}", Indent(indent))?; + } + + for one_of in &message.one_ofs { + writeln!( + writer, + "{}if self.{}.is_some() {{", + Indent(indent), + one_of.rust_field_name() + )?; + writeln!(writer, "{}len += 1;", Indent(indent + 1))?; + writeln!(writer, "{}}}", Indent(indent))?; + } + + if !message.fields.is_empty() || !message.one_ofs.is_empty() { + writeln!( + writer, + "{}let mut struct_ser = serializer.serialize_struct(\"{}\", len)?;", + Indent(indent), + message.path + )?; + } else { + writeln!( + writer, + "{}let struct_ser = serializer.serialize_struct(\"{}\", len)?;", + Indent(indent), + message.path + )?; + } + Ok(()) +} + +fn write_struct_serialize_end(indent: usize, writer: &mut W) -> Result<()> { + writeln!(writer, "{}struct_ser.end()", Indent(indent)) +} + +fn write_decode_variant( + config: &Config, + indent: usize, + value: &str, + path: &TypePath, + writer: &mut W, +) -> Result<()> { + writeln!(writer, "{}::from_i32({})", config.rust_type(path), value)?; + write!( + writer, + "{}.ok_or_else(|| serde::ser::Error::custom(format!(\"Invalid variant {{}}\", {})))", + Indent(indent), + value + ) +} + +/// Depending on the type of the field different ways of accessing field's value +/// are needed - this allows decoupling the type serialization logic from the logic +/// that manipulates its container e.g. 
Vec, Option, HashMap +struct Variable<'a> { + /// A reference to the field's value + as_ref: &'a str, + /// The field's value + as_unref: &'a str, + /// The field without any leading "&" or "*" + raw: &'a str, +} + +fn write_serialize_variable( + config: &Config, + indent: usize, + field: &Field, + variable: Variable<'_>, + writer: &mut W, +) -> Result<()> { + match &field.field_type { + FieldType::Scalar(scalar) => write_serialize_scalar_variable( + indent, + *scalar, + field.field_modifier, + variable, + field.json_name(), + writer, + ), + FieldType::Enum(path) => { + write!(writer, "{}let v = ", Indent(indent))?; + match field.field_modifier { + FieldModifier::Repeated => { + writeln!(writer, "{}.iter().cloned().map(|v| {{", variable.raw)?; + write!(writer, "{}", Indent(indent + 1))?; + write_decode_variant(config, indent + 2, "v", path, writer)?; + writeln!(writer)?; + write!( + writer, + "{}}}).collect::, _>>()", + Indent(indent + 1) + ) + } + _ => write_decode_variant(config, indent + 1, variable.as_unref, path, writer), + }?; + + writeln!(writer, "?;")?; + writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", &v)?;", + Indent(indent), + field.json_name() + ) + } + FieldType::Map(_, value_type) + if matches!( + value_type.as_ref(), + FieldType::Scalar(ScalarType::I64) + | FieldType::Scalar(ScalarType::U64) + | FieldType::Enum(_) + ) => + { + writeln!( + writer, + "{}let v: std::collections::HashMap<_, _> = {}.iter()", + Indent(indent), + variable.raw + )?; + + match value_type.as_ref() { + FieldType::Scalar(ScalarType::I64) | FieldType::Scalar(ScalarType::U64) => { + writeln!( + writer, + "{}.map(|(k, v)| (k, v.to_string())).collect();", + Indent(indent + 1) + )?; + } + FieldType::Enum(path) => { + writeln!(writer, "{}.map(|(k, v)| {{", Indent(indent + 1))?; + write!(writer, "{}let v = ", Indent(indent + 2))?; + write_decode_variant(config, indent + 3, "*v", path, writer)?; + writeln!(writer, "?;")?; + writeln!(writer, "{}Ok((k, v))", Indent(indent + 2))?; + writeln!( + writer, + "{}}}).collect::>()?;", + Indent(indent + 1) + )?; + } + _ => unreachable!(), + } + writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", &v)?;", + Indent(indent), + field.json_name() + ) + } + _ => { + writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", {})?;", + Indent(indent), + field.json_name(), + variable.as_ref + ) + } + } +} + +fn write_serialize_scalar_variable( + indent: usize, + scalar: ScalarType, + field_modifier: FieldModifier, + variable: Variable<'_>, + json_name: String, + writer: &mut W, +) -> Result<()> { + let conversion = match scalar { + ScalarType::I64 | ScalarType::U64 => "ToString::to_string", + ScalarType::Bytes => "pbjson::private::base64::encode", + _ => { + return writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", {})?;", + Indent(indent), + json_name, + variable.as_ref + ) + } + }; + + match field_modifier { + FieldModifier::Repeated => { + writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", &{}.iter().map({}).collect::>())?;", + Indent(indent), + json_name, + variable.raw, + conversion + ) + } + _ => { + writeln!( + writer, + "{}struct_ser.serialize_field(\"{}\", {}(&{}).as_str())?;", + Indent(indent), + json_name, + conversion, + variable.raw, + ) + } + } +} + +fn write_serialize_field( + config: &Config, + indent: usize, + field: &Field, + writer: &mut W, +) -> Result<()> { + let as_ref = format!("&self.{}", field.rust_field_name()); + let variable = Variable { + as_ref: as_ref.as_str(), + as_unref: &as_ref.as_str()[1..], + raw: 
&as_ref.as_str()[1..], + }; + + match &field.field_modifier { + FieldModifier::Required => { + write_serialize_variable(config, indent, field, variable, writer)?; + } + FieldModifier::Optional => { + writeln!( + writer, + "{}if let Some(v) = {}.as_ref() {{", + Indent(indent), + variable.as_unref + )?; + let variable = Variable { + as_ref: "v", + as_unref: "*v", + raw: "v", + }; + write_serialize_variable(config, indent + 1, field, variable, writer)?; + writeln!(writer, "{}}}", Indent(indent))?; + } + FieldModifier::Repeated | FieldModifier::UseDefault => { + write!(writer, "{}if ", Indent(indent))?; + write_field_empty_predicate(field, writer)?; + writeln!(writer, " {{")?; + write_serialize_variable(config, indent + 1, field, variable, writer)?; + writeln!(writer, "{}}}", Indent(indent))?; + } + } + Ok(()) +} + +fn write_serialize_one_of( + indent: usize, + config: &Config, + one_of: &OneOf, + writer: &mut W, +) -> Result<()> { + writeln!( + writer, + "{}if let Some(v) = self.{}.as_ref() {{", + Indent(indent), + one_of.rust_field_name() + )?; + + writeln!(writer, "{}match v {{", Indent(indent + 1))?; + for field in &one_of.fields { + writeln!( + writer, + "{}{}::{}(v) => {{", + Indent(indent + 2), + config.rust_type(&one_of.path), + field.rust_type_name(), + )?; + let variable = Variable { + as_ref: "v", + as_unref: "*v", + raw: "v", + }; + write_serialize_variable(config, indent + 3, field, variable, writer)?; + writeln!(writer, "{}}}", Indent(indent + 2))?; + } + + writeln!(writer, "{}}}", Indent(indent + 1),)?; + writeln!(writer, "{}}}", Indent(indent)) +} + +fn write_deserialize_message( + config: &Config, + indent: usize, + message: &Message, + rust_type: &str, + writer: &mut W, +) -> Result<()> { + write_deserialize_field_name(2, message, writer)?; + + writeln!(writer, "{}struct GeneratedVisitor;", Indent(indent))?; + + writeln!( + writer, + r#"{indent}impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{ +{indent} type Value = {rust_type}; + +{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{ +{indent} formatter.write_str("struct {name}") +{indent} }} + +{indent} fn visit_map(self, mut map: V) -> Result<{rust_type}, V::Error> +{indent} where +{indent} V: serde::de::MapAccess<'de>, +{indent} {{"#, + indent = Indent(indent), + name = message.path, + rust_type = rust_type, + )?; + + for field in &message.fields { + writeln!( + writer, + "{}let mut {} = None;", + Indent(indent + 2), + field.rust_field_name(), + )?; + } + + for one_of in &message.one_ofs { + writeln!( + writer, + "{}let mut {} = None;", + Indent(indent + 2), + one_of.rust_field_name(), + )?; + } + + if !message.fields.is_empty() || !message.one_ofs.is_empty() { + writeln!( + writer, + "{}while let Some(k) = map.next_key()? 
{{", + Indent(indent + 2) + )?; + + writeln!(writer, "{}match k {{", Indent(indent + 3))?; + + for field in &message.fields { + write_deserialize_field(config, indent + 4, field, None, writer)?; + } + + for one_of in &message.one_ofs { + for field in &one_of.fields { + write_deserialize_field(config, indent + 4, field, Some(one_of), writer)?; + } + } + + writeln!(writer, "{}}}", Indent(indent + 3))?; + writeln!(writer, "{}}}", Indent(indent + 2))?; + } else { + writeln!( + writer, + "{}while map.next_key::()?.is_some() {{}}", + Indent(indent + 2) + )?; + } + + writeln!(writer, "{}Ok({} {{", Indent(indent + 2), rust_type)?; + for field in &message.fields { + match field.field_modifier { + FieldModifier::Required => { + writeln!( + writer, + "{indent}{field}: {field}.ok_or_else(|| serde::de::Error::missing_field(\"{json_name}\"))?,", + indent=Indent(indent + 3), + field= field.rust_field_name(), + json_name= field.json_name() + )?; + } + FieldModifier::UseDefault | FieldModifier::Repeated => { + // Note: this currently does not hydrate optional proto2 fields with defaults + writeln!( + writer, + "{indent}{field}: {field}.unwrap_or_default(),", + indent = Indent(indent + 3), + field = field.rust_field_name() + )?; + } + _ => { + writeln!( + writer, + "{indent}{field},", + indent = Indent(indent + 3), + field = field.rust_field_name() + )?; + } + } + } + for one_of in &message.one_ofs { + writeln!( + writer, + "{indent}{field},", + indent = Indent(indent + 3), + field = one_of.rust_field_name(), + )?; + } + + writeln!(writer, "{}}})", Indent(indent + 2))?; + writeln!(writer, "{}}}", Indent(indent + 1))?; + writeln!(writer, "{}}}", Indent(indent))?; + writeln!( + writer, + "{}deserializer.deserialize_struct(\"{}\", FIELDS, GeneratedVisitor)", + Indent(indent), + message.path + ) +} + +fn write_deserialize_field_name( + indent: usize, + message: &Message, + writer: &mut W, +) -> Result<()> { + let fields: Vec<_> = message + .all_fields() + .map(|field| (field.json_name(), field.rust_type_name())) + .collect(); + + write_fields_array(writer, indent, fields.iter().map(|(name, _)| name.as_str()))?; + write_fields_enum(writer, indent, fields.iter().map(|(_, name)| name.as_str()))?; + + writeln!( + writer, + r#"{indent}impl<'de> serde::Deserialize<'de> for GeneratedField {{ +{indent} fn deserialize(deserializer: D) -> Result +{indent} where +{indent} D: serde::Deserializer<'de>, +{indent} {{ +{indent} struct GeneratedVisitor; + +{indent} impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{ +{indent} type Value = GeneratedField; + +{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{ +{indent} write!(formatter, "expected one of: {{:?}}", &FIELDS) +{indent} }} + +{indent} fn visit_str(self, value: &str) -> Result +{indent} where +{indent} E: serde::de::Error, +{indent} {{"#, + indent = Indent(indent) + )?; + + if !fields.is_empty() { + writeln!(writer, "{}match value {{", Indent(indent + 4))?; + for (json_name, type_name) in &fields { + writeln!( + writer, + "{}\"{}\" => Ok(GeneratedField::{}),", + Indent(indent + 5), + json_name, + type_name + )?; + } + writeln!( + writer, + "{}_ => Err(serde::de::Error::unknown_field(value, FIELDS)),", + Indent(indent + 5) + )?; + writeln!(writer, "{}}}", Indent(indent + 4))?; + } else { + writeln!( + writer, + "{}Err(serde::de::Error::unknown_field(value, FIELDS))", + Indent(indent + 4) + )?; + } + + writeln!( + writer, + r#"{indent} }} +{indent} }} +{indent} deserializer.deserialize_identifier(GeneratedVisitor) +{indent} 
}} +{indent}}}"#, + indent = Indent(indent) + ) +} + +fn write_fields_enum<'a, W: Write, I: Iterator>( + writer: &mut W, + indent: usize, + fields: I, +) -> Result<()> { + writeln!( + writer, + "{}#[allow(clippy::enum_variant_names)]", + Indent(indent) + )?; + writeln!(writer, "{}enum GeneratedField {{", Indent(indent))?; + for type_name in fields { + writeln!(writer, "{}{},", Indent(indent + 1), type_name)?; + } + writeln!(writer, "{}}}", Indent(indent)) +} + +fn write_deserialize_field( + config: &Config, + indent: usize, + field: &Field, + one_of: Option<&OneOf>, + writer: &mut W, +) -> Result<()> { + let field_name = match one_of { + Some(one_of) => one_of.rust_field_name(), + None => field.rust_field_name(), + }; + + let json_name = field.json_name(); + writeln!( + writer, + "{}GeneratedField::{} => {{", + Indent(indent), + field.rust_type_name() + )?; + writeln!( + writer, + "{}if {}.is_some() {{", + Indent(indent + 1), + field_name + )?; + + // Note: this will report duplicate field if multiple value are specified for a one of + writeln!( + writer, + "{}return Err(serde::de::Error::duplicate_field(\"{}\"));", + Indent(indent + 2), + json_name + )?; + writeln!(writer, "{}}}", Indent(indent + 1))?; + write!(writer, "{}{} = Some(", Indent(indent + 1), field_name)?; + + if let Some(one_of) = one_of { + write!( + writer, + "{}::{}(", + config.rust_type(&one_of.path), + field.rust_type_name() + )?; + } + + match &field.field_type { + FieldType::Scalar(scalar) => { + write_encode_scalar_field(indent + 1, *scalar, field.field_modifier, writer)?; + } + FieldType::Enum(path) => match field.field_modifier { + FieldModifier::Repeated => { + write!( + writer, + "map.next_value::>()?.into_iter().map(|x| x as i32).collect()", + config.rust_type(path) + )?; + } + _ => { + write!( + writer, + "map.next_value::<{}>()? 
as i32", + config.rust_type(path) + )?; + } + }, + FieldType::Map(key, value) => { + writeln!(writer)?; + write!( + writer, + "{}map.next_value:: { + // https://github.com/tokio-rs/prost/issues/531 + panic!("bytes are not currently supported as map keys") + } + _ if key.is_numeric() => { + write!( + writer, + "::pbjson::private::NumberDeserialize<{}>", + key.rust_type() + )?; + "k.0" + } + _ => { + write!(writer, "_")?; + "k" + } + }; + write!(writer, ", ")?; + let map_v = match value.as_ref() { + FieldType::Scalar(scalar) if scalar.is_numeric() => { + write!( + writer, + "::pbjson::private::NumberDeserialize<{}>", + scalar.rust_type() + )?; + "v.0" + } + FieldType::Scalar(ScalarType::Bytes) => { + // https://github.com/tokio-rs/prost/issues/531 + panic!("bytes are not currently supported as map values") + } + FieldType::Enum(path) => { + write!(writer, "{}", config.rust_type(path))?; + "v as i32" + } + FieldType::Map(_, _) => panic!("protobuf disallows nested maps"), + _ => { + write!(writer, "_")?; + "v" + } + }; + + writeln!(writer, ">>()?")?; + if map_k != "k" || map_v != "v" { + writeln!( + writer, + "{}.into_iter().map(|(k,v)| ({}, {})).collect()", + Indent(indent + 3), + map_k, + map_v, + )?; + } + write!(writer, "{}", Indent(indent + 1))?; + } + _ => { + write!(writer, "map.next_value()?",)?; + } + }; + + if one_of.is_some() { + write!(writer, ")")?; + } + + writeln!(writer, ");")?; + writeln!(writer, "{}}}", Indent(indent)) +} + +fn write_encode_scalar_field( + indent: usize, + scalar: ScalarType, + field_modifier: FieldModifier, + writer: &mut W, +) -> Result<()> { + let deserializer = match scalar { + ScalarType::Bytes => "BytesDeserialize", + _ if scalar.is_numeric() => "NumberDeserialize", + _ => return write!(writer, "map.next_value()?",), + }; + + writeln!(writer)?; + + match field_modifier { + FieldModifier::Repeated => { + writeln!( + writer, + "{}map.next_value::>>()?", + Indent(indent + 1), + deserializer + )?; + writeln!( + writer, + "{}.into_iter().map(|x| x.0).collect()", + Indent(indent + 2) + )?; + } + _ => { + writeln!( + writer, + "{}map.next_value::<::pbjson::private::{}<_>>()?.0", + Indent(indent + 1), + deserializer + )?; + } + } + write!(writer, "{}", Indent(indent)) +} diff --git a/pbjson_build/src/lib.rs b/pbjson_build/src/lib.rs new file mode 100644 index 0000000000..b985473ac4 --- /dev/null +++ b/pbjson_build/src/lib.rs @@ -0,0 +1,113 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::future_not_send +)] + +use crate::descriptor::{Descriptor, DescriptorSet, Package}; +use crate::generator::{generate_enum, generate_message, Config}; +use crate::message::resolve_message; +use std::io::{BufWriter, Error, ErrorKind, Result, Write}; +use std::path::PathBuf; + +mod descriptor; +mod escape; +mod generator; +mod message; + +#[derive(Debug, Default)] +pub struct Builder { + descriptors: descriptor::DescriptorSet, + out_dir: Option, +} + +impl Builder { + /// Create a new `Builder` + pub fn new() -> Self { + Self { + descriptors: DescriptorSet::new(), + out_dir: None, + } + } + + /// Register an encoded `FileDescriptorSet` with this `Builder` + pub fn register_descriptors(&mut self, descriptors: &[u8]) -> Result<&mut Self> { + self.descriptors.register_encoded(descriptors)?; + Ok(self) + } + + /// Generates code for all registered types where `prefixes` contains a prefix of + /// the fully-qualified path 
of the type + pub fn build>(&mut self, prefixes: &[S]) -> Result<()> { + let mut output: PathBuf = self.out_dir.clone().map(Ok).unwrap_or_else(|| { + std::env::var_os("OUT_DIR") + .ok_or_else(|| { + Error::new(ErrorKind::Other, "OUT_DIR environment variable is not set") + }) + .map(Into::into) + })?; + output.push("FILENAME"); + + let write_factory = move |package: &Package| { + output.set_file_name(format!("{}.serde.rs", package)); + + let file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(&output)?; + + Ok(BufWriter::new(file)) + }; + + let writers = generate(&self.descriptors, prefixes, write_factory)?; + for (_, mut writer) in writers { + writer.flush()?; + } + + Ok(()) + } +} + +fn generate, W: Write, F: FnMut(&Package) -> Result>( + descriptors: &DescriptorSet, + prefixes: &[S], + mut write_factory: F, +) -> Result> { + let config = Config { + extern_types: Default::default(), + }; + + let iter = descriptors.iter().filter(move |(t, _)| { + prefixes + .iter() + .any(|prefix| t.matches_prefix(prefix.as_ref())) + }); + + // Exploit the fact descriptors is ordered to group together types from the same package + let mut ret: Vec<(Package, W)> = Vec::new(); + for (type_path, descriptor) in iter { + let writer = match ret.last_mut() { + Some((package, writer)) if package == type_path.package() => writer, + _ => { + let package = type_path.package(); + ret.push((package.clone(), write_factory(package)?)); + &mut ret.last_mut().unwrap().1 + } + }; + + match descriptor { + Descriptor::Enum(descriptor) => generate_enum(&config, type_path, descriptor, writer)?, + Descriptor::Message(descriptor) => { + if let Some(message) = resolve_message(descriptors, descriptor) { + generate_message(&config, &message, writer)? + } + } + } + } + + Ok(ret) +} diff --git a/pbjson_build/src/message.rs b/pbjson_build/src/message.rs new file mode 100644 index 0000000000..5927a1f520 --- /dev/null +++ b/pbjson_build/src/message.rs @@ -0,0 +1,275 @@ +//! The raw descriptor format is not very easy to work with, a fact not aided +//! by prost making almost all members of proto2 syntax message optional +//! +//! This module therefore extracts a slightly less obtuse representation of a +//! 
message that can be used by the code generation logic + +use prost_types::{ + field_descriptor_proto::{Label, Type}, + FieldDescriptorProto, +}; + +use crate::descriptor::{Descriptor, DescriptorSet, MessageDescriptor, Syntax, TypeName, TypePath}; +use crate::escape::escape_ident; + +#[derive(Debug, Clone, Copy)] +pub enum ScalarType { + F64, + F32, + I32, + I64, + U32, + U64, + Bool, + String, + Bytes, +} + +impl ScalarType { + pub fn rust_type(&self) -> &'static str { + match self { + ScalarType::F64 => "f64", + ScalarType::F32 => "f32", + ScalarType::I32 => "i32", + ScalarType::I64 => "i64", + ScalarType::U32 => "u32", + ScalarType::U64 => "u64", + ScalarType::Bool => "bool", + ScalarType::String => "String", + ScalarType::Bytes => "Vec", + } + } + + pub fn is_numeric(&self) -> bool { + matches!( + self, + ScalarType::F64 + | ScalarType::F32 + | ScalarType::I32 + | ScalarType::I64 + | ScalarType::U32 + | ScalarType::U64 + ) + } +} + +#[derive(Debug, Clone)] +pub enum FieldType { + Scalar(ScalarType), + Enum(TypePath), + Message(TypePath), + Map(ScalarType, Box), +} + +#[derive(Debug, Clone, Copy)] +pub enum FieldModifier { + Required, + Optional, + UseDefault, + Repeated, +} + +impl FieldModifier { + pub fn is_required(&self) -> bool { + matches!(self, Self::Required) + } +} + +#[derive(Debug, Clone)] +pub struct Field { + pub name: String, + pub json_name: Option, + pub field_modifier: FieldModifier, + pub field_type: FieldType, +} + +impl Field { + pub fn rust_type_name(&self) -> String { + use heck::CamelCase; + self.name.to_camel_case() + } + + pub fn rust_field_name(&self) -> String { + use heck::SnakeCase; + escape_ident(self.name.to_snake_case()) + } + + pub fn json_name(&self) -> String { + use heck::MixedCase; + self.json_name + .clone() + .unwrap_or_else(|| self.name.to_mixed_case()) + } +} + +#[derive(Debug, Clone)] +pub struct OneOf { + pub name: String, + pub path: TypePath, + pub fields: Vec, +} + +impl OneOf { + pub fn rust_field_name(&self) -> String { + use heck::SnakeCase; + escape_ident(self.name.to_snake_case()) + } +} + +#[derive(Debug, Clone)] +pub struct Message { + pub path: TypePath, + pub fields: Vec, + pub one_ofs: Vec, +} + +impl Message { + pub fn all_fields(&self) -> impl Iterator + '_ { + self.fields + .iter() + .chain(self.one_ofs.iter().flat_map(|one_of| one_of.fields.iter())) + } +} + +/// Resolve the provided message descriptor into a slightly less obtuse representation +/// +/// Returns None if the provided provided message is auto-generated +pub fn resolve_message( + descriptors: &DescriptorSet, + message: &MessageDescriptor, +) -> Option { + if message.is_map() { + return None; + } + + let mut fields = Vec::new(); + let mut one_of_fields = vec![Vec::new(); message.one_of.len()]; + + for field in &message.fields { + let field_type = field_type(descriptors, field); + let field_modifier = field_modifier(message, field, &field_type); + + let resolved = Field { + name: field.name.clone().expect("expected field to have name"), + json_name: field.json_name.clone(), + field_type, + field_modifier, + }; + + // Treat synthetic one-of as normal + let proto3_optional = field.proto3_optional.unwrap_or(false); + match (field.oneof_index, proto3_optional) { + (Some(idx), false) => one_of_fields[idx as usize].push(resolved), + _ => fields.push(resolved), + } + } + + let mut one_ofs = Vec::new(); + + for (fields, descriptor) in one_of_fields.into_iter().zip(&message.one_of) { + // Might be empty in the event of a synthetic one-of + if !fields.is_empty() { + let name 
= descriptor.name.clone().expect("oneof with no name"); + let path = message.path.child(TypeName::new(&name)); + + one_ofs.push(OneOf { name, path, fields }) + } + } + + Some(Message { + path: message.path.clone(), + fields, + one_ofs, + }) +} + +fn field_modifier( + message: &MessageDescriptor, + field: &FieldDescriptorProto, + field_type: &FieldType, +) -> FieldModifier { + let label = Label::from_i32(field.label.expect("expected label")).expect("valid label"); + if field.proto3_optional.unwrap_or(false) { + assert_eq!(label, Label::Optional); + return FieldModifier::Optional; + } + + if field.oneof_index.is_some() { + assert_eq!(label, Label::Optional); + return FieldModifier::Optional; + } + + if matches!(field_type, FieldType::Map(_, _)) { + assert_eq!(label, Label::Repeated); + return FieldModifier::Repeated; + } + + match label { + Label::Optional => match message.syntax { + Syntax::Proto2 => FieldModifier::Optional, + Syntax::Proto3 => match field_type { + FieldType::Message(_) => FieldModifier::Optional, + _ => FieldModifier::UseDefault, + }, + }, + Label::Required => FieldModifier::Required, + Label::Repeated => FieldModifier::Repeated, + } +} + +fn field_type(descriptors: &DescriptorSet, field: &FieldDescriptorProto) -> FieldType { + match field.type_name.as_ref() { + Some(type_name) => resolve_type(descriptors, type_name.as_str()), + None => { + let scalar = + match Type::from_i32(field.r#type.expect("expected type")).expect("valid type") { + Type::Double => ScalarType::F64, + Type::Float => ScalarType::F32, + Type::Int64 | Type::Sfixed64 | Type::Sint64 => ScalarType::I64, + Type::Int32 | Type::Sfixed32 | Type::Sint32 => ScalarType::I32, + Type::Uint64 | Type::Fixed64 => ScalarType::U64, + Type::Uint32 | Type::Fixed32 => ScalarType::U32, + Type::Bool => ScalarType::Bool, + Type::String => ScalarType::String, + Type::Bytes => ScalarType::Bytes, + Type::Message | Type::Enum | Type::Group => panic!("no type name specified"), + }; + FieldType::Scalar(scalar) + } + } +} + +fn resolve_type(descriptors: &DescriptorSet, type_name: &str) -> FieldType { + assert!( + type_name.starts_with('.'), + "pbjson does not currently support resolving relative types" + ); + let maybe_descriptor = descriptors + .iter() + .find(|(path, _)| path.matches_prefix(type_name)); + + match maybe_descriptor { + Some((path, Descriptor::Enum(_))) => FieldType::Enum(path.clone()), + Some((path, Descriptor::Message(descriptor))) => match descriptor.is_map() { + true => { + assert_eq!(descriptor.fields.len(), 2, "expected map to have 2 fields"); + let key = &descriptor.fields[0]; + let value = &descriptor.fields[1]; + + assert_eq!("key", key.name()); + assert_eq!("value", value.name()); + + let key_type = match field_type(descriptors, key) { + FieldType::Scalar(scalar) => scalar, + _ => panic!("non scalar map key"), + }; + let value_type = field_type(descriptors, value); + FieldType::Map(key_type, Box::new(value_type)) + } + // Note: This may actually be a group but it is non-trivial to detect this, + // they're deprecated, and pbjson doesn't need to be able to distinguish + false => FieldType::Message(path.clone()), + }, + None => panic!("failed to resolve type: {}", type_name), + } +} diff --git a/pbjson_test/Cargo.toml b/pbjson_test/Cargo.toml new file mode 100644 index 0000000000..86c0566005 --- /dev/null +++ b/pbjson_test/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "pbjson_test" +version = "0.1.0" +authors = ["Raphael Taylor-Davies "] +edition = "2018" +description = "Test resources for pbjson converion" + 
+[dependencies] +prost = "0.8" +pbjson = { path = "../pbjson" } +serde = { version = "1.0", features = ["derive"] } + +[dev-dependencies] +serde_json = "1.0" + +[build-dependencies] +prost-build = "0.8" +pbjson_build = { path = "../pbjson_build" } diff --git a/pbjson_test/build.rs b/pbjson_test/build.rs new file mode 100644 index 0000000000..a9b294440a --- /dev/null +++ b/pbjson_test/build.rs @@ -0,0 +1,33 @@ +//! Compiles Protocol Buffers definitions into native Rust types + +use std::env; +use std::path::PathBuf; + +type Error = Box; +type Result = std::result::Result; + +fn main() -> Result<()> { + let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos"); + + let proto_files = vec![root.join("syntax3.proto")]; + + // Tell cargo to recompile if any of these proto files are changed + for proto_file in &proto_files { + println!("cargo:rerun-if-changed={}", proto_file.display()); + } + + let descriptor_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin"); + prost_build::Config::new() + .file_descriptor_set_path(&descriptor_path) + .compile_well_known_types() + .disable_comments(&["."]) + .bytes(&[".test"]) + .compile_protos(&proto_files, &[root])?; + + let descriptor_set = std::fs::read(descriptor_path)?; + pbjson_build::Builder::new() + .register_descriptors(&descriptor_set)? + .build(&[".test"])?; + + Ok(()) +} diff --git a/pbjson_test/protos/syntax3.proto b/pbjson_test/protos/syntax3.proto new file mode 100644 index 0000000000..73ef4fded7 --- /dev/null +++ b/pbjson_test/protos/syntax3.proto @@ -0,0 +1,72 @@ +syntax = "proto3"; + +package test.syntax3; + +message Empty {} + +message KitchenSink { + // Standard enum + enum Value { + VALUE_UNKNOWN = 0; + VALUE_A = 45; + VALUE_B = 63; + } + + // An enumeration without prefixed variants + enum Prefix { + UNKNOWN = 0; + A = 66; + B = 20; + } + + int32 i32 = 1; + optional int32 optional_i32 = 2; + repeated int32 repeated_i32 = 3; + + uint32 u32 = 4; + optional uint32 optional_u32 = 5; + repeated uint32 repeated_u32 = 6; + + int64 i64 = 7; + optional int64 optional_i64 = 8; + repeated int64 repeated_i64 = 9; + + uint64 u64 = 10; + optional uint64 optional_u64 = 11; + repeated uint64 repeated_u64 = 12; + + Value value = 13; + optional Value optional_value = 14; + repeated Value repeated_value = 15; + + Prefix prefix = 16; + Empty empty = 17; + + map string_dict = 18; + map message_dict = 19; + map enum_dict = 20; + map int64_dict = 21; + map int32_dict = 22; + map integer_dict = 23; + + bool bool = 24; + optional bool optional_bool = 25; + repeated bool repeated_bool = 26; + + oneof one_of { + int32 one_of_i32 = 27; + bool one_of_bool = 28; + Value one_of_value = 29; + Empty one_of_message = 30; + } + + bytes bytes = 31; + optional bytes optional_bytes = 32; + repeated bytes repeated_bytes = 33; + + // Bytes support is currently broken - https://github.com/tokio-rs/prost/issues/531 + // map bytes_dict = 34; + + string string = 35; + optional string optional_string = 36; +} \ No newline at end of file diff --git a/pbjson_test/src/lib.rs b/pbjson_test/src/lib.rs new file mode 100644 index 0000000000..70a9253aff --- /dev/null +++ b/pbjson_test/src/lib.rs @@ -0,0 +1,241 @@ +include!(concat!(env!("OUT_DIR"), "/test.syntax3.rs")); +include!(concat!(env!("OUT_DIR"), "/test.syntax3.serde.rs")); + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_empty() { + let message = Empty {}; + + let encoded = serde_json::to_string(&message).unwrap(); + let _decoded: Empty = serde_json::from_str(&encoded).unwrap(); 
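+        // For illustration: with no fields set, the proto3 JSON mapping omits
+        // default values, so an Empty message encodes to an empty object.
+        assert_eq!(encoded, "{}");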
+ + let err = serde_json::from_str::("343").unwrap_err(); + assert_eq!( + err.to_string().as_str(), + "invalid type: integer `343`, expected struct test.syntax3.Empty at line 1 column 3" + ); + + let err = serde_json::from_str::("{\"foo\": \"bar\"}").unwrap_err(); + assert_eq!( + err.to_string().as_str(), + "unknown field `foo`, there are no fields at line 1 column 6" + ); + } + + #[test] + fn test_kitchen_sink() { + let mut decoded: KitchenSink = serde_json::from_str("{}").unwrap(); + + let verify_encode = |decoded: &KitchenSink, expected: &str| { + assert_eq!(serde_json::to_string(&decoded).unwrap().as_str(), expected); + }; + + let verify_decode = |decoded: &KitchenSink, expected: &str| { + assert_eq!(decoded, &serde_json::from_str(expected).unwrap()); + }; + + let verify = |decoded: &KitchenSink, expected: &str| { + verify_encode(decoded, expected); + verify_decode(decoded, expected); + }; + + verify(&decoded, "{}"); + decoded.i32 = 24; + verify(&decoded, r#"{"i32":24}"#); + decoded.i32 = 0; + verify_decode(&decoded, "{}"); + + // Explicit optional fields can distinguish between no value and default value + decoded.optional_i32 = Some(2); + verify(&decoded, r#"{"optionalI32":2}"#); + + decoded.optional_i32 = Some(0); + verify(&decoded, r#"{"optionalI32":0}"#); + + // Can also decode from string + verify_decode(&decoded, r#"{"optionalI32":"0"}"#); + + decoded.optional_i32 = None; + verify_decode(&decoded, "{}"); + + // 64-bit integers are encoded as strings + decoded.i64 = 123125; + verify(&decoded, r#"{"i64":"123125"}"#); + + decoded.i64 = 0; + verify_decode(&decoded, "{}"); + + decoded.optional_i64 = Some(532); + verify(&decoded, r#"{"optionalI64":"532"}"#); + + decoded.optional_i64 = Some(0); + verify(&decoded, r#"{"optionalI64":"0"}"#); + + // Can also decode from non-string + verify_decode(&decoded, r#"{"optionalI64":0}"#); + + decoded.optional_i64 = None; + verify_decode(&decoded, "{}"); + + decoded.u64 = 34346; + decoded.u32 = 567094456; + decoded.optional_u32 = Some(0); + decoded.optional_u64 = Some(3); + verify( + &decoded, + r#"{"u32":567094456,"optionalU32":0,"u64":"34346","optionalU64":"3"}"#, + ); + + decoded.u64 = 0; + decoded.u32 = 0; + decoded.optional_u32 = None; + decoded.optional_u64 = None; + verify_decode(&decoded, "{}"); + + decoded.repeated_i32 = vec![0, 23, 5, 6, 2, 34]; + verify(&decoded, r#"{"repeatedI32":[0,23,5,6,2,34]}"#); + // Can also mix in some strings + verify_decode(&decoded, r#"{"repeatedI32":[0,"23",5,6,"2",34]}"#); + + decoded.repeated_i32 = vec![]; + verify_decode(&decoded, "{}"); + + decoded.repeated_u64 = vec![0, 532, 2]; + verify(&decoded, r#"{"repeatedU64":["0","532","2"]}"#); + // Can also mix in some non-strings + verify_decode(&decoded, r#"{"repeatedU64":["0",532,"2"]}"#); + + decoded.repeated_u64 = vec![]; + verify_decode(&decoded, "{}"); + + // Enumerations should be encoded as strings + decoded.value = kitchen_sink::Value::A as i32; + verify(&decoded, r#"{"value":"VALUE_A"}"#); + + // Can also use variant number + verify_decode(&decoded, r#"{"value":45}"#); + + decoded.value = kitchen_sink::Value::Unknown as i32; + verify_decode(&decoded, "{}"); + + decoded.optional_value = Some(kitchen_sink::Value::Unknown as i32); + verify(&decoded, r#"{"optionalValue":"VALUE_UNKNOWN"}"#); + + // Can also use variant number + verify_decode(&decoded, r#"{"optionalValue":0}"#); + + decoded.optional_value = None; + verify_decode(&decoded, "{}"); + + decoded + .string_dict + .insert("foo".to_string(), "bar".to_string()); + verify(&decoded, 
r#"{"stringDict":{"foo":"bar"}}"#); + + decoded.string_dict = Default::default(); + verify_decode(&decoded, "{}"); + + decoded + .int32_dict + .insert(343, kitchen_sink::Prefix::A as i32); + // Dictionary keys should always be strings + // Enum dictionary values should be encoded as strings + verify(&decoded, r#"{"int32Dict":{"343":"A"}}"#); + // Enum dictionary values can be decoded from integers + verify_decode(&decoded, r#"{"int32Dict":{"343":66}}"#); + + decoded.int32_dict = Default::default(); + verify_decode(&decoded, "{}"); + + // 64-bit dictionary values should be encoded as strings + decoded.integer_dict.insert(12, 13); + verify(&decoded, r#"{"integerDict":{"12":"13"}}"#); + // 64-bit dictionary values can be decoded from numeric types + verify_decode(&decoded, r#"{"integerDict":{"12":13}}"#); + + decoded.integer_dict = Default::default(); + verify_decode(&decoded, "{}"); + + decoded.one_of = Some(kitchen_sink::OneOf::OneOfI32(0)); + verify(&decoded, r#"{"oneOfI32":0}"#); + // Can also specify string + verify_decode(&decoded, r#"{"oneOfI32":"0"}"#); + + decoded.one_of = Some(kitchen_sink::OneOf::OneOfI32(12)); + verify(&decoded, r#"{"oneOfI32":12}"#); + + decoded.one_of = Some(kitchen_sink::OneOf::OneOfBool(false)); + verify(&decoded, r#"{"oneOfBool":false}"#); + + decoded.one_of = Some(kitchen_sink::OneOf::OneOfBool(true)); + verify(&decoded, r#"{"oneOfBool":true}"#); + + decoded.one_of = Some(kitchen_sink::OneOf::OneOfValue( + kitchen_sink::Value::B as i32, + )); + verify(&decoded, r#"{"oneOfValue":"VALUE_B"}"#); + // Can also specify enum variant + verify_decode(&decoded, r#"{"oneOfValue":63}"#); + + decoded.one_of = None; + verify_decode(&decoded, "{}"); + + decoded.repeated_value = vec![ + kitchen_sink::Value::B as i32, + kitchen_sink::Value::B as i32, + kitchen_sink::Value::A as i32, + ]; + verify( + &decoded, + r#"{"repeatedValue":["VALUE_B","VALUE_B","VALUE_A"]}"#, + ); + verify_decode(&decoded, r#"{"repeatedValue":[63,"VALUE_B","VALUE_A"]}"#); + + decoded.repeated_value = Default::default(); + verify_decode(&decoded, "{}"); + + decoded.bytes = prost::bytes::Bytes::from_static(b"kjkjkj"); + verify(&decoded, r#"{"bytes":"a2pramtq"}"#); + + decoded.bytes = Default::default(); + verify_decode(&decoded, "{}"); + + decoded.optional_bytes = Some(prost::bytes::Bytes::from_static(b"kjkjkj")); + verify(&decoded, r#"{"optionalBytes":"a2pramtq"}"#); + + decoded.optional_bytes = Some(Default::default()); + verify(&decoded, r#"{"optionalBytes":""}"#); + + decoded.optional_bytes = None; + verify_decode(&decoded, "{}"); + + decoded.repeated_bytes = vec![ + prost::bytes::Bytes::from_static(b"sdfsd"), + prost::bytes::Bytes::from_static(b"fghfg"), + ]; + verify(&decoded, r#"{"repeatedBytes":["c2Rmc2Q=","ZmdoZmc="]}"#); + + decoded.repeated_bytes = Default::default(); + verify_decode(&decoded, "{}"); + + // decoded.bytes_dict.insert( + // "test".to_string(), + // prost::bytes::Bytes::from_static(b"asdf"), + // ); + // verify(&decoded, r#"{"bytesDict":{"test":"YXNkZgo="}}"#); + // + // decoded.bytes_dict = Default::default(); + // verify_decode(&decoded, "{}"); + + decoded.string = "test".to_string(); + verify(&decoded, r#"{"string":"test"}"#); + + decoded.string = Default::default(); + verify_decode(&decoded, "{}"); + + decoded.optional_string = Some(String::new()); + verify(&decoded, r#"{"optionalString":""}"#); + } +} diff --git a/predicate/src/predicate.rs b/predicate/src/predicate.rs index 2bbc879298..0cc3f0c16f 100644 --- a/predicate/src/predicate.rs +++ b/predicate/src/predicate.rs @@ 
-161,6 +161,51 @@ impl Predicate { pub fn is_empty(&self) -> bool { self == &EMPTY_PREDICATE } + + /// Add each range [start, stop] of the delete_predicates into the predicate in + /// the form "time < start OR time > stop" to eliminate that range from the query + pub fn add_delete_ranges(&mut self, delete_predicates: &[S]) + where + S: AsRef, + { + for pred in delete_predicates { + let pred = pred.as_ref(); + + if let Some(range) = pred.range { + let expr = col(TIME_COLUMN_NAME) + .lt(lit(range.start)) + .or(col(TIME_COLUMN_NAME).gt(lit(range.end))); + self.exprs.push(expr); + } + } + } + + /// Add a list of disjunctive negated expressions. + /// Example: there are two deletes as follows + /// . Delete_1: WHERE city != "Boston" AND temp = 70 + /// . Delete 2: WHERE state = "NY" AND route != "I90" + /// The negated list will be "NOT(Delete_1)", NOT(Delete_2)" which means + /// NOT(city != "Boston" AND temp = 70), NOT(state = "NY" AND route != "I90") which means + /// [NOT(city = Boston") OR NOT(temp = 70)], [NOT(state = "NY") OR NOT(route != "I90")] + pub fn add_delete_exprs(&mut self, delete_predicates: &[S]) + where + S: AsRef, + { + for pred in delete_predicates { + let pred = pred.as_ref(); + + let mut expr: Option = None; + for exp in &pred.exprs { + match expr { + None => expr = Some(exp.clone().not()), + Some(e) => expr = Some(e.or(exp.clone().not())), + } + } + if let Some(e) = expr { + self.exprs.push(e); + } + } + } } impl fmt::Display for Predicate { diff --git a/query/src/exec/context.rs b/query/src/exec/context.rs index b526a58773..ce72f303fb 100644 --- a/query/src/exec/context.rs +++ b/query/src/exec/context.rs @@ -11,18 +11,19 @@ use datafusion::{ logical_plan::{LogicalPlan, UserDefinedLogicalNode}, physical_plan::{ coalesce_partitions::CoalescePartitionsExec, - collect, displayable, + displayable, planner::{DefaultPhysicalPlanner, ExtensionPlanner}, ExecutionPlan, PhysicalPlanner, SendableRecordBatchStream, }, prelude::*, }; +use futures::TryStreamExt; use observability_deps::tracing::{debug, trace}; use trace::{ctx::SpanContext, span::SpanRecorder}; use crate::exec::{ fieldlist::{FieldList, IntoFieldList}, - query_tracing::send_metrics_to_tracing, + query_tracing::TracedStream, schema_pivot::{SchemaPivotExec, SchemaPivotNode}, seriesset::{SeriesSetConverter, SeriesSetItem}, split::StreamSplitExec, @@ -272,45 +273,63 @@ impl IOxExecutionContext { /// Executes the logical plan using DataFusion on a separate /// thread pool and produces RecordBatches pub async fn collect(&self, physical_plan: Arc) -> Result> { - let ctx = self.child_ctx("collect"); debug!( "Running plan, physical:\n{}", displayable(physical_plan.as_ref()).indent() ); + let ctx = self.child_ctx("collect"); + let stream = ctx.execute_stream(physical_plan).await?; - let res = ctx.run(collect(Arc::clone(&physical_plan))).await; - - // send metrics to tracing, even on error - ctx.save_metrics(physical_plan); - res + ctx.run( + stream + .err_into() // convert to DataFusionError + .try_collect(), + ) + .await } - /// Executes the physical plan and produces a RecordBatchStream to stream - /// over the result that iterates over the results. - pub async fn execute( + /// Executes the physical plan and produces a + /// `SendableRecordBatchStream` to stream over the result that + /// iterates over the results. The creation of the stream is + /// performed in a separate thread pool. 
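+    ///
+    /// Illustrative usage sketch (`ctx` is an `IOxExecutionContext`, `physical_plan`
+    /// a prepared `Arc<dyn ExecutionPlan>`; requires `futures::StreamExt`):
+    ///
+    /// ```ignore
+    /// let mut stream = ctx.execute_stream(physical_plan).await?;
+    /// while let Some(batch) = stream.next().await {
+    ///     let batch = batch?;
+    ///     // each RecordBatch is produced as the plan executes
+    /// }
+    /// ```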
+ pub async fn execute_stream( &self, physical_plan: Arc, ) -> Result { match physical_plan.output_partitioning().partition_count() { 0 => unreachable!(), - 1 => self.execute_partition(physical_plan, 0).await, + 1 => self.execute_stream_partitioned(physical_plan, 0).await, _ => { // Merge into a single partition - self.execute_partition(Arc::new(CoalescePartitionsExec::new(physical_plan)), 0) - .await + self.execute_stream_partitioned( + Arc::new(CoalescePartitionsExec::new(physical_plan)), + 0, + ) + .await } } } - /// Executes a single partition of a physical plan and produces a RecordBatchStream to stream - /// over the result that iterates over the results. - pub async fn execute_partition( + /// Executes a single partition of a physical plan and produces a + /// `SendableRecordBatchStream` to stream over the result that + /// iterates over the results. The creation of the stream is + /// performed in a separate thread pool. + pub async fn execute_stream_partitioned( &self, physical_plan: Arc, partition: usize, ) -> Result { - self.run(async move { physical_plan.execute(partition).await }) - .await + let span = self + .recorder + .span() + .map(|span| span.child("execute_stream_partitioned")); + + self.run(async move { + let stream = physical_plan.execute(partition).await?; + let stream = TracedStream::new(stream, span, physical_plan); + Ok(Box::pin(stream) as _) + }) + .await } /// Executes the SeriesSetPlans on the query executor, in @@ -349,7 +368,7 @@ impl IOxExecutionContext { let physical_plan = ctx.prepare_plan(&plan)?; - let it = ctx.execute(physical_plan).await?; + let it = ctx.execute_stream(physical_plan).await?; SeriesSetConverter::default() .convert( @@ -486,19 +505,4 @@ impl IOxExecutionContext { recorder: self.recorder.child(name), } } - - /// Saves any DataFusion metrics that are currently present in - /// `physical_plan` to the span recorder so they show up in - /// distributed traces (e.g. Jaeger) - /// - /// This function should be invoked after `physical_plan` has - /// fully `collect`ed, meaning that `PhysicalPlan::execute()` has - /// been invoked and the resulting streams have been completely - /// consumed. Calling `save_metrics` metrics prior to this point - /// may result in saving incomplete information. - pub fn save_metrics(&self, physical_plan: Arc) { - if let Some(span) = self.recorder.span() { - send_metrics_to_tracing(span, physical_plan.as_ref()) - } - } } diff --git a/query/src/exec/query_tracing.rs b/query/src/exec/query_tracing.rs index 9dadc5e588..7fce5f545e 100644 --- a/query/src/exec/query_tracing.rs +++ b/query/src/exec/query_tracing.rs @@ -1,15 +1,67 @@ //! This module contains the code to map DataFusion metrics to `Span`s //! for use in distributed tracing (e.g. Jaeger) -use std::{borrow::Cow, fmt}; +use std::{borrow::Cow, fmt, sync::Arc}; +use arrow::record_batch::RecordBatch; use chrono::{DateTime, Utc}; use datafusion::physical_plan::{ metrics::{MetricValue, MetricsSet}, - DisplayFormatType, ExecutionPlan, + DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, }; +use futures::StreamExt; use observability_deps::tracing::debug; -use trace::span::Span; +use trace::span::{Span, SpanRecorder}; + +/// Stream wrapper that records DataFusion `MetricSets` into IOx +/// [`Span`]s when it is dropped. 
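+///
+/// Sketch of the intended wrapping, mirroring `IOxExecutionContext::execute_stream_partitioned`
+/// (`span` is an `Option<Span>`, `physical_plan` an `Arc<dyn ExecutionPlan>`):
+///
+/// ```ignore
+/// let stream = physical_plan.execute(partition).await?;
+/// // DataFusion metrics from `physical_plan` are exported to `span` when this drops
+/// let stream = TracedStream::new(stream, span, physical_plan);
+/// ```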
+pub(crate) struct TracedStream { + inner: SendableRecordBatchStream, + span_recorder: SpanRecorder, + physical_plan: Arc, +} + +impl TracedStream { + /// Return a stream that records DataFusion `MetricSets` from + /// `physical_plan` into `span` when dropped. + pub(crate) fn new( + inner: SendableRecordBatchStream, + span: Option, + physical_plan: Arc, + ) -> Self { + Self { + inner, + span_recorder: SpanRecorder::new(span), + physical_plan, + } + } +} + +impl RecordBatchStream for TracedStream { + fn schema(&self) -> arrow::datatypes::SchemaRef { + self.inner.schema() + } +} + +impl futures::Stream for TracedStream { + type Item = arrow::error::Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.inner.poll_next_unpin(cx) + } +} + +impl Drop for TracedStream { + fn drop(&mut self) { + if let Some(span) = self.span_recorder.span() { + let default_end_time = Utc::now(); + send_metrics_to_tracing(default_end_time, span, self.physical_plan.as_ref()); + } + } +} /// This function translates data in DataFusion `MetricSets` into IOx /// [`Span`]s. It records a snapshot of the current state of the @@ -26,15 +78,7 @@ use trace::span::Span; /// 1. If the ExecutionPlan had no metrics /// 2. The total number of rows produced by the ExecutionPlan (if available) /// 3. The elapsed compute time taken by the ExecutionPlan -pub(crate) fn send_metrics_to_tracing(parent_span: &Span, physical_plan: &dyn ExecutionPlan) { - // The parent span may be open, but since the physical_plan is - // assumed to be fully collected, using `now()` is a conservative - // estimate of the end time - let default_end_time = Utc::now(); - send_metrics_to_tracing_inner(default_end_time, parent_span, physical_plan) -} - -fn send_metrics_to_tracing_inner( +fn send_metrics_to_tracing( default_end_time: DateTime, parent_span: &Span, physical_plan: &dyn ExecutionPlan, @@ -101,7 +145,7 @@ fn send_metrics_to_tracing_inner( // recurse for child in physical_plan.children() { - send_metrics_to_tracing_inner(span_end, &span, child.as_ref()) + send_metrics_to_tracing(span_end, &span, child.as_ref()) } span.export() @@ -173,17 +217,9 @@ mod tests { Metric, }; - use std::{ - num::{NonZeroU128, NonZeroU64}, - sync::Arc, - time::Duration, - }; + use std::{sync::Arc, time::Duration}; - use trace::{ - ctx::{SpanContext, SpanId, TraceId}, - span::{MetaValue, SpanStatus}, - RingBufferTraceCollector, - }; + use trace::{ctx::SpanContext, span::MetaValue, RingBufferTraceCollector}; use super::*; @@ -193,7 +229,7 @@ mod tests { let exec = TestExec::new(name, Default::default()); let traces = TraceBuilder::new(); - send_metrics_to_tracing(&traces.make_span(), &exec); + send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec); let spans = traces.spans(); assert_eq!(spans.len(), 1); @@ -224,7 +260,7 @@ mod tests { exec.new_child("child4", make_time_metricset(None, None)); let traces = TraceBuilder::new(); - send_metrics_to_tracing_inner(ts5, &traces.make_span(), &exec); + send_metrics_to_tracing(ts5, &traces.make_span(), &exec); let spans = traces.spans(); println!("Spans: \n\n{:#?}", spans); @@ -250,7 +286,7 @@ mod tests { exec.metrics = None; let traces = TraceBuilder::new(); - send_metrics_to_tracing(&traces.make_span(), &exec); + send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec); let spans = traces.spans(); assert_eq!(spans.len(), 1); @@ -274,7 +310,7 @@ mod tests { add_elapsed_compute(exec.metrics_mut(), 2000, 2); let traces = TraceBuilder::new(); - 
send_metrics_to_tracing(&traces.make_span(), &exec); + send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec); // aggregated metrics should be reported let spans = traces.spans(); @@ -358,23 +394,7 @@ mod tests { // create a new span connected to the collector fn make_span(&self) -> Span { - let collector = Arc::clone(&self.collector); - - // lifted from make_span in trace/src/span.rs - Span { - name: "foo".into(), - ctx: SpanContext { - trace_id: TraceId(NonZeroU128::new(23948923).unwrap()), - parent_span_id: None, - span_id: SpanId(NonZeroU64::new(3498394).unwrap()), - collector: Some(collector), - }, - start: None, - end: None, - status: SpanStatus::Unknown, - metadata: Default::default(), - events: vec![], - } + SpanContext::new(Arc::clone(&self.collector) as _).child("foo") } /// return all collected spans diff --git a/query/src/frontend/influxrpc.rs b/query/src/frontend/influxrpc.rs index 689c371cb1..e5367ae81d 100644 --- a/query/src/frontend/influxrpc.rs +++ b/query/src/frontend/influxrpc.rs @@ -524,6 +524,8 @@ impl InfluxRpcPlanner { /// The data is sorted on (tag_col1, tag_col2, ...) so that all /// rows for a particular series (groups where all tags are the /// same) occur together in the plan + // NGA todo: may need to add delete predicate here to eliminate deleted data at read time + // https://github.com/influxdata/influxdb_iox/issues/2548 pub fn read_filter(&self, database: &D, predicate: Predicate) -> Result where D: QueryDatabase + 'static, diff --git a/query/src/lib.rs b/query/src/lib.rs index 9352a9a0e1..f1d6d4e8e1 100644 --- a/query/src/lib.rs +++ b/query/src/lib.rs @@ -18,7 +18,7 @@ use internal_types::{ schema::{sort::SortKey, Schema, TIME_COLUMN_NAME}, selection::Selection, }; -use observability_deps::tracing::trace; +use observability_deps::tracing::{debug, trace}; use predicate::predicate::{Predicate, PredicateMatch}; use hashbrown::HashMap; @@ -46,7 +46,7 @@ pub trait QueryChunkMeta: Sized { fn schema(&self) -> Arc; // return a reference to delete predicates of the chunk - fn delete_predicates(&self) -> Arc>; + fn delete_predicates(&self) -> &[Arc]; } /// A `Database` is the main trait implemented by the IOx subsystems @@ -137,6 +137,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync { &self, predicate: &Predicate, selection: Selection<'_>, + delete_predicates: &[Arc], ) -> Result; /// Returns true if data of this chunk is sorted @@ -165,8 +166,10 @@ where self.as_ref().schema() } - fn delete_predicates(&self) -> Arc> { - self.as_ref().delete_predicates() + fn delete_predicates(&self) -> &[Arc] { + let pred = self.as_ref().delete_predicates(); + debug!(?pred, "Delete predicate in QueryChunkMeta"); + pred } } diff --git a/query/src/provider/physical.rs b/query/src/provider/physical.rs index d76bd7217f..9f4bb0ce5a 100644 --- a/query/src/provider/physical.rs +++ b/query/src/provider/physical.rs @@ -116,14 +116,18 @@ impl ExecutionPlan for IOxReadFilterNode { let selection_cols = restrict_selection(selection_cols, &chunk_table_schema); let selection = Selection::Some(&selection_cols); - let stream = chunk.read_filter(&self.predicate, selection).map_err(|e| { - DataFusionError::Execution(format!( - "Error creating scan for table {} chunk {}: {}", - self.table_name, - chunk.id(), - e - )) - })?; + let del_preds = chunk.delete_predicates(); + + let stream = chunk + .read_filter(&self.predicate, selection, del_preds) + .map_err(|e| { + DataFusionError::Execution(format!( + "Error creating scan for table {} chunk {}: {}", + self.table_name, + 
chunk.id(), + e + )) + })?; // all CPU time is now done, pass in baseline metrics to adapter timer.done(); diff --git a/query/src/test.rs b/query/src/test.rs index 7f404ffe7a..d7c2661852 100644 --- a/query/src/test.rs +++ b/query/src/test.rs @@ -25,6 +25,7 @@ use internal_types::{ schema::{builder::SchemaBuilder, merge::SchemaMerger, InfluxColumnType, Schema}, selection::Selection, }; +use observability_deps::tracing::debug; use parking_lot::Mutex; use snafu::Snafu; use std::num::NonZeroU64; @@ -173,6 +174,9 @@ pub struct TestChunk { /// Return value for apply_predicate, if desired predicate_match: Option, + /// Copy of delete predicates passed + delete_predicates: Vec>, + /// Order of this chunk relative to other overlapping chunks. order: ChunkOrder, } @@ -248,6 +252,7 @@ impl TestChunk { table_data: Default::default(), saved_error: Default::default(), predicate_match: Default::default(), + delete_predicates: Default::default(), order: ChunkOrder::new(0), } } @@ -819,6 +824,7 @@ impl QueryChunk for TestChunk { &self, predicate: &Predicate, _selection: Selection<'_>, + _delete_predicates: &[Arc], ) -> Result { self.check_error()?; @@ -908,12 +914,11 @@ impl QueryChunkMeta for TestChunk { } // return a reference to delete predicates of the chunk - fn delete_predicates(&self) -> Arc> { - // Since this is the test chunk and its focus is not (yet) on - // deletion, return an empty delete predicate now which means nothing is deleted - // from this test chunk - let pred: Vec = vec![]; - Arc::new(pred) + fn delete_predicates(&self) -> &[Arc] { + let pred = &self.delete_predicates; + debug!(?pred, "Delete predicate in Test Chunk"); + + pred } } @@ -923,8 +928,9 @@ pub async fn raw_data(chunks: &[Arc]) -> Vec { for c in chunks { let pred = Predicate::default(); let selection = Selection::All; + let delete_predicates: Vec> = vec![]; let mut stream = c - .read_filter(&pred, selection) + .read_filter(&pred, selection, &delete_predicates) .expect("Error in read_filter"); while let Some(b) = stream.next().await { let b = b.expect("Error in stream"); diff --git a/query_tests/Cargo.toml b/query_tests/Cargo.toml index bdc1a6a427..4ac25e1f69 100644 --- a/query_tests/Cargo.toml +++ b/query_tests/Cargo.toml @@ -10,19 +10,19 @@ description = "Tests of the query engine against different database configuratio [dependencies] async-trait = "0.1" data_types = { path = "../data_types" } +datafusion = { path = "../datafusion" } once_cell = { version = "1.4.0", features = ["parking_lot"] } +predicate = { path = "../predicate" } query = { path = "../query" } server = { path = "../server" } [dev-dependencies] arrow = { version = "5.0", features = ["prettyprint"] } arrow_util = { path = "../arrow_util" } -datafusion = { path = "../datafusion" } data_types = { path = "../data_types" } internal_types = { path = "../internal_types" } metric = { path = "../metric" } object_store = { path = "../object_store" } -predicate = { path = "../predicate" } snafu = "0.6.3" tempfile = "3.1.0" test_helpers = { path = "../test_helpers" } diff --git a/query_tests/src/scenarios.rs b/query_tests/src/scenarios.rs index f6d100960f..4c50397d92 100644 --- a/query_tests/src/scenarios.rs +++ b/query_tests/src/scenarios.rs @@ -1,12 +1,13 @@ //! 
This module contains testing scenarios for Db - use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; +use datafusion::logical_plan::{col, lit}; use once_cell::sync::OnceCell; -#[allow(unused_imports, dead_code, unused_macros)] +use predicate::predicate::{Predicate, PredicateBuilder}; + use query::QueryChunk; use async_trait::async_trait; @@ -1198,3 +1199,650 @@ impl DbSetup for ChunkOrder { vec![scenario] } } + +#[derive(Debug)] +/// Setup for delete query test with one table and one chunk moved from MUB to RUB to OS +pub struct DeleteFromMubOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for DeleteFromMubOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the delete predicate is added in MUB and + // is moved with chunk moving + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"]; + // delete predicate + let i: f64 = 1.0; + let expr = col("bar").eq(lit(i)); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 15) + .add_expr(expr) + .build(); + + // delete happens when data in MUB + let _scenario_mub = make_delete_mub(lp_lines.clone(), pred.clone()).await; + + // delete happens when data in MUB then moved to RUB + let scenario_rub = + make_delete_mub_to_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens when data in MUB then moved to RUB and then persisted + let scenario_rub_os = make_delete_mub_to_rub_and_os( + lp_lines.clone(), + pred.clone(), + table_name, + partition_key, + ) + .await; + + // delete happens when data in MUB then moved to RUB, then persisted, and then RUB is unloaded + let scenario_os = + make_delete_mub_to_os(lp_lines.clone(), pred, table_name, partition_key).await; + + // return scenarios to run queries + // NGA todo: add scenario_mub in this after the deleted data is removed in the scan + // right now MUB does not push predicate down so the result is not correct yet + vec![scenario_rub, scenario_rub_os, scenario_os] + } +} + +#[derive(Debug)] +/// Setup for delete query test with one table and one chunk moved from RUB to OS +pub struct DeleteFromRubOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for DeleteFromRubOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the delete predicate is added in RUB + // and is moved with chunk moving + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"]; + // delete predicate + let i: f64 = 1.0; + let expr = col("bar").eq(lit(i)); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 15) + .add_expr(expr) + .build(); + + // delete happens to data in RUB + let scenario_rub = + make_delete_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens to data in RUB then persisted + let scenario_rub_os = + make_delete_rub_to_os(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens to data in RUB then persisted then RUB unloaded + let scenario_os = + make_delete_rub_to_os_and_unload_rub(lp_lines.clone(), pred, table_name, partition_key) + .await; + + // return scenarios to run queries + vec![scenario_rub, scenario_rub_os, scenario_os] + } +} + +#[derive(Debug)] +/// Setup for delete query test with one table 
and one chunk in both RUB and OS +pub struct DeleteFromOsOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for DeleteFromOsOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the delete predicate is added to persisted chunks + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"]; + // delete predicate + let i: f64 = 1.0; + let expr = col("bar").eq(lit(i)); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 15) + .add_expr(expr) + .build(); + + // delete happens after data is persisted but still in RUB + let scenario_rub_os = + make_delete_os_with_rub(lp_lines.clone(), pred.clone(), table_name, partition_key) + .await; + + // delete happens after data is persisted but still in RUB and then unload RUB + let _scenario_rub_os_unload_rub = make_delete_os_with_rub_then_unload_rub( + lp_lines.clone(), + pred.clone(), + table_name, + partition_key, + ) + .await; + + // delete happens after data is persisted and RUB is unloaded + let _scenario_os = make_delete_os(lp_lines.clone(), pred, table_name, partition_key).await; + + // return scenarios to run queries + //vec![scenario_rub_os, scenario_rub_os_unload_rub, scenario_os] + // NGA todo: turn the last 2 scenarios on when #2518 and #2550 are done + vec![scenario_rub_os] + } +} + +#[derive(Debug)] +/// Setup for multi-expression delete query test with one table and one chunk moved from MUB to RUB to OS +pub struct DeleteMultiExprsFromMubOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for DeleteMultiExprsFromMubOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the multi-expression delete predicate is added in MUB and + // is moved with chunk moving + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec![ + "cpu,foo=me bar=1 10", + "cpu,foo=you bar=2 20", + "cpu,foo=me bar=1 30", + "cpu,foo=me bar=1 40", + ]; + // delete predicate + let i: f64 = 1.0; + let expr1 = col("bar").eq(lit(i)); + let expr2 = col("foo").eq(lit("me")); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 32) + .add_expr(expr1) + .add_expr(expr2) + .build(); + + // delete happens when data in MUB + let _scenario_mub = make_delete_mub(lp_lines.clone(), pred.clone()).await; + + // delete happens when data in MUB then moved to RUB + let scenario_rub = + make_delete_mub_to_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens when data in MUB then moved to RUB and then persisted + let scenario_rub_os = make_delete_mub_to_rub_and_os( + lp_lines.clone(), + pred.clone(), + table_name, + partition_key, + ) + .await; + + // delete happens when data in MUB then moved to RUB, then persisted, and then RUB is unloaded + let scenario_os = + make_delete_mub_to_os(lp_lines.clone(), pred, table_name, partition_key).await; + + // return scenarios to run queries + // NGA todo: add scenario_mub in this after the deleted data is removed in the scan + // right now MUB does not push predicate down so the result is not correct yet + vec![scenario_rub, scenario_rub_os, scenario_os] + } +} + +#[derive(Debug)] +/// Setup for multi-expression delete query test with one table and one chunk moved from MUB to RUB to OS +pub struct DeleteMultiExprsFromRubOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for 
DeleteMultiExprsFromRubOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the multi-expression delete predicate is added in MUB and + // is moved with chunk moving + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec![ + "cpu,foo=me bar=1 10", + "cpu,foo=you bar=2 20", + "cpu,foo=me bar=1 30", + "cpu,foo=me bar=1 40", + ]; + // delete predicate + let i: f64 = 1.0; + let expr1 = col("bar").eq(lit(i)); + let expr2 = col("foo").eq(lit("me")); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 32) + .add_expr(expr1) + .add_expr(expr2) + .build(); + + // delete happens to data in RUB + let scenario_rub = + make_delete_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens to data in RUB then persisted + let scenario_rub_os = + make_delete_rub_to_os(lp_lines.clone(), pred.clone(), table_name, partition_key).await; + + // delete happens to data in RUB then persisted then RUB unloaded + let scenario_os = + make_delete_rub_to_os_and_unload_rub(lp_lines.clone(), pred, table_name, partition_key) + .await; + + // return scenarios to run queries + vec![scenario_rub, scenario_rub_os, scenario_os] + } +} + +#[derive(Debug)] +/// Setup for multi-expression delete query test with one table and one chunk moved from MUB to RUB to OS +pub struct DeleteMultiExprsFromOsOneMeasurementOneChunk {} +#[async_trait] +impl DbSetup for DeleteMultiExprsFromOsOneMeasurementOneChunk { + async fn make(&self) -> Vec { + // The main purpose of these scenarios is the multi-expression delete predicate is added in MUB and + // is moved with chunk moving + + // General setup for all scenarios + let partition_key = "1970-01-01T00"; + let table_name = "cpu"; + // chunk data + let lp_lines = vec![ + "cpu,foo=me bar=1 10", + "cpu,foo=you bar=2 20", + "cpu,foo=me bar=1 30", + "cpu,foo=me bar=1 40", + ]; + // delete predicate + let i: f64 = 1.0; + let expr1 = col("bar").eq(lit(i)); + let expr2 = col("foo").eq(lit("me")); + let pred = PredicateBuilder::new() + .table("cpu") + .timestamp_range(0, 32) + .add_expr(expr1) + .add_expr(expr2) + .build(); + + // delete happens after data is persisted but still in RUB + let scenario_rub_os = + make_delete_os_with_rub(lp_lines.clone(), pred.clone(), table_name, partition_key) + .await; + + // delete happens after data is persisted but still in RUB and then unload RUB + let _scenario_rub_os_unload_rub = make_delete_os_with_rub_then_unload_rub( + lp_lines.clone(), + pred.clone(), + table_name, + partition_key, + ) + .await; + + // delete happens after data is persisted and RUB is unloaded + let _scenario_os = make_delete_os(lp_lines.clone(), pred, table_name, partition_key).await; + + // return scenarios to run queries + //vec![scenario_rub_os, scenario_rub_os_unload_rub, scenario_os] + // NGA todo: turn the last 2 scenarios on when #2518band #2550 are done + vec![scenario_rub_os] + } +} + +// NGA todo: Add these scenarios after deleted data is eliminated from scan +// 1. Many deletes, each has one or/and multi expressions +// 2. Many different-type chunks when a delete happens +// 3. 
Combination of above + +async fn make_delete_mub(lp_lines: Vec<&str>, pred: Predicate) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // One open MUB, no RUB, no OS + assert_eq!(count_mutable_buffer_chunks(&db), 1); + assert_eq!(count_read_buffer_chunks(&db), 0); + assert_eq!(count_object_store_chunks(&db), 0); + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // Still one but frozen MUB, no RUB, no OS + assert_eq!(count_mutable_buffer_chunks(&db), 1); + assert_eq!(count_read_buffer_chunks(&db), 0); + assert_eq!(count_object_store_chunks(&db), 0); + + DbScenario { + scenario_name: "Deleted data in MUB".into(), + db, + } +} + +async fn make_delete_mub_to_rub( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // delete data in MUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // move MUB to RUB and the delete predicate will be automatically included in RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // No MUB, one RUB, no OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 1); + assert_eq!(count_object_store_chunks(&db), 0); + + DbScenario { + scenario_name: "Deleted data in RUB moved from MUB".into(), + db, + } +} + +async fn make_delete_mub_to_rub_and_os( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // delete data in MUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // move MUB to RUB and the delete predicate will be automatically included in RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // No MUB, one RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 1); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in RUB and OS".into(), + db, + } +} + +async fn make_delete_mub_to_os( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // delete data in MUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // move MUB to RUB and the delete predicate will be automatically included in RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // remove RUB + db.unload_read_buffer(table_name, partition_key, 1).unwrap(); + // No MUB, no RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 0); + 
assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in OS".into(), + db, + } +} + +async fn make_delete_rub( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // delete data in RUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // No MUB, one RUB, no OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 1); + assert_eq!(count_object_store_chunks(&db), 0); + + DbScenario { + scenario_name: "Deleted data in RUB".into(), + db, + } +} + +async fn make_delete_rub_to_os( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // delete data in RUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // No MUB, one RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 1); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in RUB and then persisted to OS".into(), + db, + } +} + +async fn make_delete_rub_to_os_and_unload_rub( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // delete data in RUB + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // remove RUB + db.unload_read_buffer(table_name, partition_key, 1).unwrap(); + // No MUB, no RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 0); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in RUB then persisted to OS then RUB unloaded".into(), + db, + } +} + +async fn make_delete_os_with_rub( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + 
.unwrap(); + // delete data after persisted but RUB still available + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // No MUB, one RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 1); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in OS with RUB".into(), + db, + } +} + +async fn make_delete_os_with_rub_then_unload_rub( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // delete data after persisted but RUB still available + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // remove RUB + db.unload_read_buffer(table_name, partition_key, 1).unwrap(); + // No MUB, no RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 0); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in OS only but the delete happens before RUB is unloaded" + .into(), + db, + } +} + +async fn make_delete_os( + lp_lines: Vec<&str>, + pred: Predicate, + table_name: &str, + partition_key: &str, +) -> DbScenario { + let db = make_db().await.db; + // create an open MUB + write_lp(&db, &lp_lines.join("\n")).await; + // move MUB to RUB + db.rollover_partition(table_name, partition_key) + .await + .unwrap(); + db.move_chunk_to_read_buffer(table_name, partition_key, 0) + .await + .unwrap(); + // persist RUB and the delete predicate will be automatically included in the OS chunk + db.persist_partition( + table_name, + partition_key, + Instant::now() + Duration::from_secs(1), + ) + .await + .unwrap(); + // remove RUB + db.unload_read_buffer(table_name, partition_key, 1).unwrap(); + // delete data after persisted but RUB still available + db.delete("cpu", Arc::new(pred)).await.unwrap(); + // No MUB, no RUB, one OS + assert_eq!(count_mutable_buffer_chunks(&db), 0); + assert_eq!(count_read_buffer_chunks(&db), 0); + assert_eq!(count_object_store_chunks(&db), 1); + + DbScenario { + scenario_name: "Deleted data in OS and the delete happens after RUB is unloaded".into(), + db, + } +} diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 325de8d0c1..7c49f0408a 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -808,3 +808,68 @@ async fn sql_select_all_different_tags_chunks() { &expected ); } + +#[tokio::test] +async fn sql_select_with_deleted_data_from_one_expr() { + let expected = vec![ + "+-----+--------------------------------+", + "| bar | time |", + "+-----+--------------------------------+", + "| 2 | 1970-01-01T00:00:00.000000020Z |", + "+-----+--------------------------------+", + ]; + + // Data deleted when it is in MUB, and then moved to RUB and OS + run_sql_test_case!( + DeleteFromMubOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); + + // Data deleted when it is in RUB, and then moved OS + run_sql_test_case!( + DeleteFromRubOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); + + // Data deleted when it is in OS + 
run_sql_test_case!( + DeleteFromOsOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); +} + +#[tokio::test] +async fn sql_select_with_deleted_data_from_multi_exprs() { + let expected = vec![ + "+-----+-----+--------------------------------+", + "| bar | foo | time |", + "+-----+-----+--------------------------------+", + "| 1 | me | 1970-01-01T00:00:00.000000040Z |", + "| 2 | you | 1970-01-01T00:00:00.000000020Z |", + "+-----+-----+--------------------------------+", + ]; + + // Data deleted when it is in MUB, and then moved to RUB and OS + run_sql_test_case!( + DeleteMultiExprsFromMubOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); + + // Data deleted when it is in RUB, and then moved OS + run_sql_test_case!( + DeleteMultiExprsFromRubOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); + + // Data deleted when it is in OS + run_sql_test_case!( + DeleteMultiExprsFromOsOneMeasurementOneChunk {}, + "SELECT * from cpu", + &expected + ); +} diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index 6a565e0031..85349b917c 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -2067,6 +2067,40 @@ mod test { assert!(matches!(row_ids, RowIDsOption::All(_))); } + #[test] + fn row_ids_filter_float_trimmed() { + let input = &[100.0, 200.0, 300.0, 2.0, 22.0, 30.0]; + + let col = Column::from(&input[..]); + let mut row_ids = col.row_ids_filter( + &cmp::Operator::Equal, + &Value::from(200.0), + RowIDs::new_bitmap(), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![1]); + + row_ids = col.row_ids_filter( + &cmp::Operator::LT, + &Value::from(64000.0), + RowIDs::new_bitmap(), + ); + assert!(matches!(row_ids, RowIDsOption::All(_))); + + row_ids = col.row_ids_filter( + &cmp::Operator::GTE, + &Value::from(-1_000_000.0), + RowIDs::new_bitmap(), + ); + assert!(matches!(row_ids, RowIDsOption::All(_))); + + row_ids = col.row_ids_filter( + &cmp::Operator::NotEqual, + &Value::from(1_000_000.3), + RowIDs::new_bitmap(), + ); + assert!(matches!(row_ids, RowIDsOption::All(_))); + } + #[test] fn row_ids_range() { let input = &[100_i64, 200, 300, 2, 200, 22, 30]; diff --git a/read_buffer/src/column/encoding/scalar/fixed.rs b/read_buffer/src/column/encoding/scalar/fixed.rs index 6cf6b0b63e..f8d8193a0b 100644 --- a/read_buffer/src/column/encoding/scalar/fixed.rs +++ b/read_buffer/src/column/encoding/scalar/fixed.rs @@ -7,6 +7,7 @@ //! allow results to be emitted as some logical type `L` via a transformation //! `T`. use either::Either; +use observability_deps::tracing::debug; use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; @@ -373,14 +374,36 @@ where Some(self.transcoder.decode(max)) } - fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs { - let value = self.transcoder.encode(value); + fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs { + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter"); + let (value, op) = match self.transcoder.encode_comparable(value, *op) { + Some((value, op)) => (value, op), + None => { + // The value is not encodable. This can happen with the == or != + // operator. In the case of ==, no values in the encoding could + // possible satisfy the expression. In the case of !=, all + // values would satisfy the expression. 
+ dst.clear(); + return match op { + cmp::Operator::Equal => dst, + cmp::Operator::NotEqual => { + dst.add_range(0, self.num_rows()); + dst + } + op => panic!("operator {:?} not expected", op), + }; + } + }; + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr"); + + // N.B, the transcoder may have changed the operator depending on the + // value provided. match op { cmp::Operator::GT => self.row_ids_cmp_order(&value, PartialOrd::gt, dst), cmp::Operator::GTE => self.row_ids_cmp_order(&value, PartialOrd::ge, dst), cmp::Operator::LT => self.row_ids_cmp_order(&value, PartialOrd::lt, dst), cmp::Operator::LTE => self.row_ids_cmp_order(&value, PartialOrd::le, dst), - _ => self.row_ids_equal(&value, op, dst), + _ => self.row_ids_equal(&value, &op, dst), } } @@ -390,8 +413,16 @@ where right: (L, &cmp::Operator), dst: RowIDs, ) -> RowIDs { - let left = (self.transcoder.encode(left.0), left.1); - let right = (self.transcoder.encode(right.0), right.1); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range"); + let left = self + .transcoder + .encode_comparable(left.0, *left.1) + .expect("transcoder must return Some variant"); + let right = self + .transcoder + .encode_comparable(right.0, *right.1) + .expect("transcoder must return Some variant"); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr"); match (&left.1, &right.1) { (cmp::Operator::GT, cmp::Operator::LT) @@ -402,8 +433,8 @@ where | (cmp::Operator::LT, cmp::Operator::GTE) | (cmp::Operator::LTE, cmp::Operator::GT) | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order( - (&left.0, Self::ord_from_op(left.1)), - (&right.0, Self::ord_from_op(right.1)), + (&left.0, Self::ord_from_op(&left.1)), + (&right.0, Self::ord_from_op(&right.1)), dst, ), diff --git a/read_buffer/src/column/encoding/scalar/fixed_null.rs b/read_buffer/src/column/encoding/scalar/fixed_null.rs index 7168a28d75..ed7a2e73bd 100644 --- a/read_buffer/src/column/encoding/scalar/fixed_null.rs +++ b/read_buffer/src/column/encoding/scalar/fixed_null.rs @@ -3,6 +3,7 @@ //! This encoding stores a column of fixed-width numerical values backed by an //! an Arrow array, allowing for storage of NULL values. use either::Either; +use observability_deps::tracing::debug; use std::fmt::Debug; use std::marker::PhantomData; use std::mem::size_of; @@ -227,6 +228,43 @@ where } dst } + + // Identify all row IDs that contain a non-null value. + fn all_non_null_row_ids(&self, mut dst: RowIDs) -> RowIDs { + dst.clear(); + + if self.null_count() == 0 { + dst.add_range(0, self.num_rows()); + return dst; + } + + let mut found = false; + let mut count = 0; + for i in 0..self.num_rows() as usize { + if self.arr.is_null(i) { + if found { + // add the non-null range + let (min, max) = (i as u32 - count, i as u32); + dst.add_range(min, max); + found = false; + count = 0; + } + continue; + } + + if !found { + found = true; + } + count += 1; + } + + // add any remaining range. 
+ if found { + let (min, max) = (self.num_rows() - count, self.num_rows()); + dst.add_range(min, max); + } + dst + } } impl ScalarEncoding for FixedNull @@ -411,14 +449,34 @@ where max.map(|v| self.transcoder.decode(v)) } - fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs { - let value = self.transcoder.encode(value); + fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs { + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter"); + let (value, op) = match self.transcoder.encode_comparable(value, *op) { + Some((value, op)) => (value, op), + None => { + // The value is not encodable. This can happen with the == or != + // operator. In the case of ==, no values in the encoding could + // possible satisfy the expression. In the case of !=, all + // non-null values would satisfy the expression. + dst.clear(); + return match op { + cmp::Operator::Equal => dst, + cmp::Operator::NotEqual => { + dst = self.all_non_null_row_ids(dst); + dst + } + op => panic!("operator {:?} not expected", op), + }; + } + }; + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr"); + match op { - cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst), - cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst), - cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst), - cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst), - _ => self.row_ids_equal(value, op, dst), + cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + _ => self.row_ids_equal(value, &op, dst), } } @@ -428,8 +486,16 @@ where right: (L, &cmp::Operator), dst: RowIDs, ) -> RowIDs { - let left = (self.transcoder.encode(left.0), left.1); - let right = (self.transcoder.encode(right.0), right.1); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range"); + let left = self + .transcoder + .encode_comparable(left.0, *left.1) + .expect("transcoder must return Some variant"); + let right = self + .transcoder + .encode_comparable(right.0, *right.1) + .expect("transcoder must return Some variant"); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr"); match (left.1, right.1) { (cmp::Operator::GT, cmp::Operator::LT) @@ -440,8 +506,8 @@ where | (cmp::Operator::LT, cmp::Operator::GTE) | (cmp::Operator::LTE, cmp::Operator::GT) | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order( - (left.0, Self::ord_from_op(left.1)), - (right.0, Self::ord_from_op(right.1)), + (left.0, Self::ord_from_op(&left.1)), + (right.0, Self::ord_from_op(&right.1)), dst, ), @@ -789,6 +855,38 @@ mod test { assert_eq!(row_ids.to_vec(), vec![1, 2, 4]); } + #[test] + fn row_ids_filter_range_all_non_null() { + let cases = vec![ + (vec![None], vec![]), + (vec![None, None, None], vec![]), + (vec![Some(22)], vec![0_u32]), + (vec![Some(22), Some(3), Some(3)], vec![0, 1, 2]), + (vec![Some(22), None], vec![0]), + ( + vec![Some(22), None, Some(1), None, Some(3), None], + vec![0, 2, 4], + ), + (vec![Some(22), None, None, Some(33)], vec![0, 3]), + (vec![None, None, Some(33)], vec![2]), + ( + vec![None, None, Some(33), None, None, 
Some(3), Some(3), Some(1)], + vec![2, 5, 6, 7], + ), + ]; + + for (i, (data, exp)) in cases.into_iter().enumerate() { + let (v, _) = new_encoding(data); + let dst = RowIDs::new_vector(); + assert_eq!( + v.all_non_null_row_ids(dst).unwrap_vector(), + &exp, + "example {:?} failed", + i + ); + } + } + #[test] fn has_non_null_value() { let (v, _) = new_encoding(vec![None, None]); diff --git a/read_buffer/src/column/encoding/scalar/rle.rs b/read_buffer/src/column/encoding/scalar/rle.rs index fc8c1b3bc2..80d81e777f 100644 --- a/read_buffer/src/column/encoding/scalar/rle.rs +++ b/read_buffer/src/column/encoding/scalar/rle.rs @@ -1,4 +1,5 @@ use either::Either; +use observability_deps::tracing::debug; use crate::column::cmp; use crate::column::RowIDs; @@ -288,18 +289,35 @@ where let left_cmp_result = next.partial_cmp(left.0); let right_cmp_result = next.partial_cmp(right.0); - // TODO(edd): eurgh I still don't understand how I got this to - // be correct. Need to revisit to make it simpler. let left_result_ok = - !(left_cmp_result == Some(left_op.0) || left_cmp_result == Some(left_op.1)); + left_cmp_result == Some(left_op.0) || left_cmp_result == Some(left_op.1); let right_result_ok = - !(right_cmp_result == Some(right_op.0) || right_cmp_result == Some(right_op.1)); + right_cmp_result == Some(right_op.0) || right_cmp_result == Some(right_op.1); - if !(left_result_ok || right_result_ok) { + if left_result_ok && right_result_ok { dst.add_range(curr_logical_row_id, curr_logical_row_id + rl); } - curr_logical_row_id += rl; } + curr_logical_row_id += rl; + } + + dst + } + + fn all_non_null_row_ids(&self, mut dst: RowIDs) -> RowIDs { + dst.clear(); + + if self.null_count() == 0 { + dst.add_range(0, self.num_rows()); + return dst; + } + + let mut curr_logical_row_id = 0; + for (rl, next) in &self.run_lengths { + if next.is_some() { + dst.add_range(curr_logical_row_id, curr_logical_row_id + rl); + } + curr_logical_row_id += rl; } dst @@ -375,15 +393,34 @@ where self.num_rows } - fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs { - let value = self.transcoder.encode(value); + fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs { + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter"); + let (value, op) = match self.transcoder.encode_comparable(value, *op) { + Some((value, op)) => (value, op), + None => { + // The value is not encodable. This can happen with the == or != + // operator. In the case of ==, no values in the encoding could + // possible satisfy the expression. In the case of !=, all + // non-null values would satisfy the expression. 
+ dst.clear(); + return match op { + cmp::Operator::Equal => dst, + cmp::Operator::NotEqual => { + dst = self.all_non_null_row_ids(dst); + dst + } + op => panic!("operator {:?} not expected", op), + }; + } + }; + debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr"); + match op { - cmp::Operator::Equal | cmp::Operator::NotEqual => { - self.row_ids_cmp_equal(value, op, dst) - } - cmp::Operator::LT | cmp::Operator::LTE | cmp::Operator::GT | cmp::Operator::GTE => { - self.row_ids_cmp(value, op, dst) - } + cmp::Operator::GT => self.row_ids_cmp(value, &op, dst), + cmp::Operator::GTE => self.row_ids_cmp(value, &op, dst), + cmp::Operator::LT => self.row_ids_cmp(value, &op, dst), + cmp::Operator::LTE => self.row_ids_cmp(value, &op, dst), + _ => self.row_ids_cmp_equal(value, &op, dst), } } @@ -393,8 +430,16 @@ where right: (L, &cmp::Operator), dst: RowIDs, ) -> RowIDs { - let left = (self.transcoder.encode(left.0), left.1); - let right = (self.transcoder.encode(right.0), right.1); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range"); + let left = self + .transcoder + .encode_comparable(left.0, *left.1) + .expect("transcoder must return Some variant"); + let right = self + .transcoder + .encode_comparable(right.0, *right.1) + .expect("transcoder must return Some variant"); + debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr"); match (&left.1, &right.1) { (cmp::Operator::GT, cmp::Operator::LT) @@ -405,8 +450,8 @@ where | (cmp::Operator::LT, cmp::Operator::GTE) | (cmp::Operator::LTE, cmp::Operator::GT) | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range( - (&left.0, Self::ord_from_op(left.1)), - (&right.0, Self::ord_from_op(right.1)), + (&left.0, Self::ord_from_op(&left.1)), + (&right.0, Self::ord_from_op(&right.1)), dst, ), @@ -622,6 +667,16 @@ mod test { ) } + fn new_encoding_opt( + values: Vec>, + ) -> (RLE>, Arc) { + let mock = Arc::new(MockTranscoder::default()); + ( + RLE::new_from_iter_opt(values.into_iter(), Arc::clone(&mock)), + mock, + ) + } + #[test] fn new_from_iter() { let cases = vec![ @@ -981,6 +1036,38 @@ mod test { } } + #[test] + fn row_ids_filter_range_all_non_null() { + let cases = vec![ + (vec![None], vec![]), + (vec![None, None, None], vec![]), + (vec![Some(22)], vec![0_u32]), + (vec![Some(22), Some(3), Some(3)], vec![0, 1, 2]), + (vec![Some(22), None], vec![0]), + ( + vec![Some(22), None, Some(1), None, Some(3), None], + vec![0, 2, 4], + ), + (vec![Some(22), None, None, Some(33)], vec![0, 3]), + (vec![None, None, Some(33)], vec![2]), + ( + vec![None, None, Some(33), None, None, Some(3), Some(3), Some(1)], + vec![2, 5, 6, 7], + ), + ]; + + for (i, (data, exp)) in cases.into_iter().enumerate() { + let (v, _) = new_encoding_opt(data); + let dst = RowIDs::new_vector(); + assert_eq!( + v.all_non_null_row_ids(dst).unwrap_vector(), + &exp, + "example {:?} failed", + i + ); + } + } + #[test] fn row_ids_filter_range() { let (enc, transcoder) = new_encoding(vec![ @@ -1016,6 +1103,37 @@ mod test { assert_eq!(transcoder.encodings(), calls * 2); } + #[test] + fn row_ids_filter_range_nulls() { + let (enc, transcoder) = new_encoding_opt(vec![ + Some(100), + None, + None, + None, + Some(100), + Some(101), + Some(101), + ]); + + let cases = vec![ + ( + (100, &Operator::GTE), + (240, &Operator::LT), + vec![0, 4, 5, 6], + ), + ((100, &Operator::GT), (240, &Operator::LT), vec![5, 6]), + ((10, &Operator::LT), (-100, &Operator::GT), vec![]), + ((21, &Operator::GTE), (100, 
&Operator::LTE), vec![0, 4]), + ]; + + let calls = cases.len(); + for (left, right, exp) in cases { + let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &exp); + } + assert_eq!(transcoder.encodings(), calls * 2); + } + #[test] fn estimate_rle_size() { let cases = vec![ diff --git a/read_buffer/src/column/encoding/scalar/transcoders.rs b/read_buffer/src/column/encoding/scalar/transcoders.rs index 2628fde983..da5db2c6b7 100644 --- a/read_buffer/src/column/encoding/scalar/transcoders.rs +++ b/read_buffer/src/column/encoding/scalar/transcoders.rs @@ -1,3 +1,4 @@ +use crate::column::cmp::Operator; use std::{ convert::TryFrom, fmt::{Debug, Display}, @@ -13,7 +14,17 @@ use std::{ // `P` is a physical type that is stored directly within an encoding, `L` is // a logical type callers expect to be returned. pub trait Transcoder: Debug + Display { + /// A function that encodes a logical value into a physical representation. fn encode(&self, _: L) -> P; + + /// A function that attempts to encode a logical value, within the context + /// of a comparison operator, into a physical representation. + /// + /// Implementation should return a suitable operator for the physical + /// representation, which may differ from the provided operator. + fn encode_comparable(&self, _: L, _: Operator) -> Option<(P, Operator)>; + + /// A function to decode a physical representation back into a logical value. fn decode(&self, _: P) -> L; } @@ -29,6 +40,10 @@ impl Transcoder for NoOpTranscoder { v } + fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> { + Some((v, op)) + } + fn decode(&self, v: T) -> T { v } @@ -56,13 +71,17 @@ pub struct ByteTrimmer {} impl Transcoder for ByteTrimmer where L: From

<P>,
-    P: TryFrom<L>,
+    P: TryFrom<L> + PartialEq + PartialOrd,
    <P as TryFrom<L>
>::Error: std::fmt::Debug, { fn encode(&self, v: L) -> P { P::try_from(v).unwrap() } + fn encode_comparable(&self, v: L, op: Operator) -> Option<(P, Operator)> { + P::try_from(v).ok().map(|p| (p, op)) + } + fn decode(&self, v: P) -> L { L::from(v) } @@ -91,6 +110,54 @@ macro_rules! make_float_trimmer { v as $type } + fn encode_comparable(&self, v: f64, op: Operator) -> Option<($type, Operator)> { + assert!(v <= <$type>::MAX as f64); + if v == ((v as $type) as f64) { + return Some((v as $type, op)); + } + + match op { + Operator::Equal => { + None // no encoded values will == v + } + Operator::NotEqual => { + None // all encoded values will != v + } + Operator::LT => { + // convert to next highest encodable value. For example + // given '< 23.2` return 24.0 encoded as the physical + // type. < 23.2 is logically equivalent to < 24.0 since + // there are no valid values in the domain (23.2, 24.0). + Some((v.ceil() as $type, op)) + } + Operator::LTE => { + // convert to next highest encodable value and change + // operator to <. + // For example given '<= 23.2` return 24.0 encoded as + // the physical type. <= 23.2 is logically equivalent + // to < 24.0 since there are no valid values in the + // domain [23.2, 24.0). + Some((v.ceil() as $type, Operator::LT)) + } + Operator::GT => { + // convert to next lowest encodable value. For example + // given '> 23.2` return 23.0 encoded as the physical + // type. > 23.2 is logically equivalent to > 23.0 since + // there are no valid values in the domain (23.0, 23.2]. + Some((v.floor() as $type, op)) + } + Operator::GTE => { + // convert to next lowest encodable value and change + // operator to >. + // For example given '>= 23.2` return 23.0 encoded as + // the physical type. >= 23.2 is logically equivalent + // to > 24.0 since there are no valid values in the + // domain [23.2, 24.0). + Some((v.floor() as $type, Operator::GT)) + } + } + } + fn decode(&self, v: $type) -> f64 { v.into() } @@ -119,7 +186,7 @@ impl Display for FloatByteTrimmer { // result. #[cfg(test)] -use std::{sync::atomic::AtomicUsize, sync::atomic::Ordering, sync::Arc}; +use std::{sync::atomic, sync::atomic::AtomicUsize, sync::Arc}; #[cfg(test)] /// A mock implementation of Transcoder that tracks calls to encode and decode. /// This is useful for testing encoder implementations. 
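A note on the FloatByteTrimmer comparison handling added above: when the predicate literal is not exactly representable in the trimmed integer type, the bound is rounded toward the operator's open side and `<=`/`>=` become strict comparisons, so the rewritten predicate selects exactly the same encoded rows. Below is a minimal standalone sketch of that rewrite with simplified, hypothetical names (it is not the crate's actual `Transcoder` trait or `FloatByteTrimmer` type):

// Simplified sketch of the operator rewrite performed for an i32-backed
// float-trimmed encoding.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Op { Eq, NotEq, Lt, Lte, Gt, Gte }

// Returns None when no encoded value can match (==) or when all non-null
// values match (!=); callers handle those two cases separately.
fn encode_comparable_f64_to_i32(v: f64, op: Op) -> Option<(i32, Op)> {
    if v == (v as i32) as f64 {
        // Exactly representable: keep the operator unchanged.
        return Some((v as i32, op));
    }
    match op {
        Op::Eq | Op::NotEq => None,
        // `< 23.2` is equivalent to `< 24` over integer-encoded values.
        Op::Lt => Some((v.ceil() as i32, Op::Lt)),
        // `<= 23.2` is equivalent to `< 24`.
        Op::Lte => Some((v.ceil() as i32, Op::Lt)),
        // `> 23.2` is equivalent to `> 23`.
        Op::Gt => Some((v.floor() as i32, Op::Gt)),
        // `>= 23.2` is equivalent to `> 23`.
        Op::Gte => Some((v.floor() as i32, Op::Gt)),
    }
}

fn main() {
    assert_eq!(encode_comparable_f64_to_i32(23.2, Op::Lte), Some((24, Op::Lt)));
    assert_eq!(encode_comparable_f64_to_i32(23.2, Op::Gte), Some((23, Op::Gt)));
    assert_eq!(encode_comparable_f64_to_i32(23.0, Op::Lte), Some((23, Op::Lte)));
    assert_eq!(encode_comparable_f64_to_i32(23.2, Op::Eq), None);
}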
@@ -127,6 +194,7 @@ use std::{sync::atomic::AtomicUsize, sync::atomic::Ordering, sync::Arc}; pub struct MockTranscoder { encoding_calls: AtomicUsize, decoding_calls: AtomicUsize, + partial_cmp_calls: AtomicUsize, } #[cfg(test)] @@ -135,6 +203,7 @@ impl Default for MockTranscoder { Self { encoding_calls: AtomicUsize::default(), decoding_calls: AtomicUsize::default(), + partial_cmp_calls: AtomicUsize::default(), } } } @@ -142,23 +211,28 @@ impl Default for MockTranscoder { #[cfg(test)] impl MockTranscoder { pub fn encodings(&self) -> usize { - self.encoding_calls.load(Ordering::Relaxed) + self.encoding_calls.load(atomic::Ordering::Relaxed) } pub fn decodings(&self) -> usize { - self.decoding_calls.load(Ordering::Relaxed) + self.decoding_calls.load(atomic::Ordering::Relaxed) } } #[cfg(test)] impl Transcoder for MockTranscoder { fn encode(&self, v: T) -> T { - self.encoding_calls.fetch_add(1, Ordering::Relaxed); + self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed); v } + fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> { + self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed); + Some((v, op)) + } + fn decode(&self, v: T) -> T { - self.decoding_calls.fetch_add(1, Ordering::Relaxed); + self.decoding_calls.fetch_add(1, atomic::Ordering::Relaxed); v } } @@ -166,12 +240,17 @@ impl Transcoder for MockTranscoder { #[cfg(test)] impl Transcoder for Arc { fn encode(&self, v: T) -> T { - self.encoding_calls.fetch_add(1, Ordering::Relaxed); + self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed); v } + fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> { + self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed); + Some((v, op)) + } + fn decode(&self, v: T) -> T { - self.decoding_calls.fetch_add(1, Ordering::Relaxed); + self.decoding_calls.fetch_add(1, atomic::Ordering::Relaxed); v } } diff --git a/read_buffer/src/column/float.rs b/read_buffer/src/column/float.rs index 56022c180f..d0755115f4 100644 --- a/read_buffer/src/column/float.rs +++ b/read_buffer/src/column/float.rs @@ -754,4 +754,303 @@ mod test { //assert_eq!(dst.unwrap_vector(), &exp, "example '{} {:?}' failed", op, v); } } + + #[test] + fn row_ids_filter_float_trimmer() { + let data = vec![100.0, 200.0, 100.0, 300.0, 400.0]; + + let float_trimmer = FloatByteTrimmer {}; + let data_float_trimmed = data + .iter() + .cloned() + .map::(|x| float_trimmer.encode(x)) + .collect::>(); + + let cases: Vec>> = vec![ + Box::new(RLE::::new_from_iter( + data_float_trimmed.iter().cloned(), + float_trimmer, + )), + Box::new(Fixed::::new( + data_float_trimmed.clone(), + FloatByteTrimmer {}, + )), + Box::new(FixedNull::::new( + PrimitiveArray::from(data_float_trimmed), + FloatByteTrimmer {}, + )), + ]; + + for enc in cases { + _row_ids_filter_float_trimmer(enc) + } + } + + fn _row_ids_filter_float_trimmer(enc: Box>) { + // [100.0, 200.0, 100.0, 300.0, 400.0] + let cases = vec![ + (100.0, Operator::Equal, vec![0, 2]), // 100.0, 100.0 + (100.0, Operator::NotEqual, vec![1, 3, 4]), // 200.0, 300.0, 400.0 + (100.0, Operator::LT, vec![]), // + (100.0, Operator::LTE, vec![0, 2]), // 100.0, 100.0 + (100.0, Operator::GT, vec![1, 3, 4]), // 200.0, 300.0, 400.0 + (100.0, Operator::GTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (200.0, Operator::Equal, vec![1]), // 200.0 + (200.0, Operator::NotEqual, vec![0, 2, 3, 4]), // 100.0, 100.0, 300.0, 400.0 + (200.0, Operator::LT, vec![0, 2]), // 100.0, 100.0 + (200.0, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0 + (200.0, Operator::GT, 
vec![3, 4]), // 300.0, 400.0 + (200.0, Operator::GTE, vec![1, 3, 4]), // 200.0, 300.0, 400.0 + (400.0, Operator::Equal, vec![4]), // 400.0 + (400.0, Operator::NotEqual, vec![0, 1, 2, 3]), // 100.0, 200.0, 100.0, 300.0 + (400.0, Operator::LT, vec![0, 1, 2, 3]), // 100.0, 200.0, 100.0, 300.0 + (400.0, Operator::LTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (400.0, Operator::GT, vec![]), // + (400.0, Operator::GTE, vec![4]), // 400.0 + // Values not present in the column + (99.0, Operator::Equal, vec![]), // + (99.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (99.0, Operator::LT, vec![]), // + (99.0, Operator::LTE, vec![]), // + (99.0, Operator::GT, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (99.0, Operator::GTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (200.4, Operator::Equal, vec![]), // + (200.4, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (200.4, Operator::LT, vec![0, 1, 2]), // 100.0, 200.0, 100.0 + (200.4, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0 + (200.4, Operator::GT, vec![3, 4]), // 300.0, 400.0 + (200.4, Operator::GTE, vec![3, 4]), // 300.0, 400.0 + (201.0, Operator::Equal, vec![]), // + (201.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (201.0, Operator::LT, vec![0, 1, 2]), // 100.0, 200.0, 100.0 + (201.0, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0 + (201.0, Operator::GT, vec![3, 4]), // 300.0, 400.0 + (201.0, Operator::GTE, vec![3, 4]), // 300.0, 400.0 + (401.0, Operator::Equal, vec![]), // + (401.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (401.0, Operator::LT, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (401.0, Operator::LTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0 + (401.0, Operator::GT, vec![]), // + (401.0, Operator::GTE, vec![]), // + ]; + + for (v, op, exp) in cases { + let dst = enc.row_ids_filter(v, &op, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + &exp, + "example '{} {:?}' failed for {:?}", + op, + v, + enc.name() + ); + } + } + + #[test] + fn row_ids_filter_float_trimmer_with_nulls() { + let data = vec![Some(100.0), None, None, Some(200.0), None]; + + let float_trimmer = FloatByteTrimmer {}; + + let cases: Vec>> = vec![ + Box::new(RLE::::new_from_iter_opt( + data.iter() + .cloned() + .map(|x| x.map(|v| float_trimmer.encode(v))), + FloatByteTrimmer {}, + )), + Box::new(FixedNull::::new( + data.iter() + .cloned() + .map(|v| v.map(|v| float_trimmer.encode(v))) + .collect(), + FloatByteTrimmer {}, + )), + ]; + + for enc in cases { + _row_ids_filter_float_trimmer_with_nulls(enc) + } + } + + fn _row_ids_filter_float_trimmer_with_nulls(enc: Box>) { + // [100.0, NULL, NULL, 200.0] + let cases = vec![ + (100.0, Operator::Equal, vec![0]), // 100.0 + (100.0, Operator::NotEqual, vec![3]), // 200.0 + (100.0, Operator::LT, vec![]), // + (100.0, Operator::LTE, vec![0]), // 100.0 + (100.0, Operator::GT, vec![3]), // 200.0 + (100.0, Operator::GTE, vec![0, 3]), // 100.0, 200.0 + (200.0, Operator::Equal, vec![3]), // 200.0 + (200.0, Operator::NotEqual, vec![0]), // 100.0 + (200.0, Operator::LT, vec![0]), // 100.0 + (200.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0 + (200.0, Operator::GT, vec![]), // + (200.0, Operator::GTE, vec![3]), // 200.0 + // Values not present in the column + (99.0, Operator::Equal, vec![]), // + (99.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0 + (99.0, Operator::LT, 
vec![]), // + (99.0, Operator::LTE, vec![]), // + (99.0, Operator::GT, vec![0, 3]), // 100.0, 200.0 + (99.0, Operator::GTE, vec![0, 3]), // 100.0, 200.0 + (200.4, Operator::Equal, vec![]), // + (200.4, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0 + (200.4, Operator::LT, vec![0, 3]), // 100.0,200.0 + (200.4, Operator::LTE, vec![0, 3]), // 100.0, 200.0 + (200.4, Operator::GT, vec![]), // + (200.4, Operator::GTE, vec![]), // + (201.0, Operator::Equal, vec![]), // + (201.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0 + (201.0, Operator::LT, vec![0, 3]), // 100.0, 200.0 + (201.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0 + (201.0, Operator::GT, vec![]), // + (201.0, Operator::GTE, vec![]), // + (401.0, Operator::Equal, vec![]), // + (401.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0 + (401.0, Operator::LT, vec![0, 3]), // 100.0, 200.0 + (401.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0 + (401.0, Operator::GT, vec![]), // + (401.0, Operator::GTE, vec![]), // + ]; + + for (v, op, exp) in cases { + let dst = enc.row_ids_filter(v, &op, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + &exp, + "example '{} {:?}' failed for {:?}", + op, + v, + enc.name() + ); + } + } + + #[test] + fn row_ids_filter_range_float_trimmer() { + let data = vec![100.0, 200.0, 100.0, 300.0, 400.0]; + + let float_trimmer = FloatByteTrimmer {}; + let data_float_trimmed = data + .iter() + .cloned() + .map::(|x| float_trimmer.encode(x)) + .collect::>(); + + let cases: Vec>> = vec![ + Box::new(RLE::::new_from_iter( + data_float_trimmed.iter().cloned(), + float_trimmer, + )), + Box::new(Fixed::::new( + data_float_trimmed.clone(), + FloatByteTrimmer {}, + )), + Box::new(FixedNull::::new( + PrimitiveArray::from(data_float_trimmed), + FloatByteTrimmer {}, + )), + ]; + + for enc in cases { + _row_ids_filter_range_float_trimmer(enc) + } + } + + fn _row_ids_filter_range_float_trimmer(enc: Box>) { + // [100.0, 200.0, 100.0, 300.0, 400.0] + let cases = vec![ + ((100.0, &Operator::LT), (99.0, &Operator::GT), vec![]), // + ((100.0, &Operator::LTE), (100.0, &Operator::GTE), vec![0, 2]), // 100.0, 100.0 + ( + (100.0, &Operator::GT), + (400.0, &Operator::LTE), + vec![1, 3, 4], + ), // 200.0, 300.0, 400.0 + ( + (100.0, &Operator::GTE), + (401.0, &Operator::LTE), + vec![0, 1, 2, 3, 4], + ), // 100.0, 200.0, 100.0, 300.0, 400.0 + ((200.0, &Operator::LT), (99.6, &Operator::GT), vec![0, 2]), // 100.0, 100.0 + ((200.0, &Operator::GT), (401.2, &Operator::LTE), vec![3, 4]), // 300.0, 400.0 + ( + (200.0, &Operator::GTE), + (400.9, &Operator::LT), + vec![1, 3, 4], + ), // 200.0, 300.0, 400.0 + ( + (99.8, &Operator::GT), + (500.87, &Operator::LT), + vec![0, 1, 2, 3, 4], + ), // 100.0, 200.0, 100.0, 300.0, 400.0 + ]; + + for (left, right, exp) in cases { + let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + &exp, + "example '{:?} {:?}' failed for {:?}", + left, + right, + enc.name(), + ); + } + } + + #[test] + fn row_ids_filter_range_float_trimmer_with_nulls() { + let data = vec![Some(100.0), None, None, Some(200.0), None]; + + let float_trimmer = FloatByteTrimmer {}; + + let cases: Vec>> = vec![ + Box::new(RLE::::new_from_iter_opt( + data.iter() + .cloned() + .map(|x| x.map(|v| float_trimmer.encode(v))), + FloatByteTrimmer {}, + )), + Box::new(FixedNull::::new( + data.iter() + .cloned() + .map(|v| v.map(|v| float_trimmer.encode(v))) + .collect(), + FloatByteTrimmer {}, + )), + ]; + + for enc in cases { + _row_ids_filter_range_float_trimmer_with_nulls(enc) + } + } + 
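The `NotEqual` expectations in the NULL-column cases above come from the new `all_non_null_row_ids` helpers added to the `FixedNull` and `RLE` encodings earlier in this diff: when a float literal is not encodable, `!=` degenerates to "every non-null row". A rough sketch of the same run-accumulation idea over a plain `Option` slice, using simplified types rather than the crate's `RowIDs` (assumptions: half-open ranges and u32 row ids, as in the encodings above):

// Collect half-open [start, end) ranges of consecutive non-null positions.
fn non_null_ranges<T>(values: &[Option<T>]) -> Vec<(u32, u32)> {
    let mut ranges = Vec::new();
    let mut run_start: Option<u32> = None;
    for (i, v) in values.iter().enumerate() {
        match (v.is_some(), run_start) {
            (true, None) => run_start = Some(i as u32), // start a new run
            (false, Some(start)) => {
                ranges.push((start, i as u32));          // close the run
                run_start = None;
            }
            _ => {}
        }
    }
    if let Some(start) = run_start {
        ranges.push((start, values.len() as u32));       // trailing run
    }
    ranges
}

fn main() {
    let col = vec![Some(100.0), None, None, Some(200.0), None];
    // Rows 0 and 3 are non-null, matching the `NotEqual` expectations above.
    assert_eq!(non_null_ranges(&col), vec![(0, 1), (3, 4)]);
}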
+ fn _row_ids_filter_range_float_trimmer_with_nulls(enc: Box>) { + // [100.0, NULL, NULL, 200.0, NULL] + let cases = vec![ + ((100.0, &Operator::LT), (99.0, &Operator::GT), vec![]), // + ((100.0, &Operator::LTE), (100.0, &Operator::GTE), vec![0]), // 100.0 + ((100.0, &Operator::GT), (400.0, &Operator::LTE), vec![3]), // 200.0 + ((100.0, &Operator::GTE), (401.0, &Operator::LTE), vec![0, 3]), // 100.0, 200.0 + ((200.0, &Operator::LT), (99.6, &Operator::GT), vec![0]), // 100.0 + ((200.0, &Operator::GT), (401.2, &Operator::LTE), vec![]), // + ((99.8, &Operator::GT), (500.87, &Operator::LT), vec![0, 3]), // 100.0, 200.0 + ]; + + for (left, right, exp) in cases { + let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + &exp, + "example '{:?} {:?}' failed for {:?}", + left, + right, + enc.name(), + ); + } + } } diff --git a/server/src/database.rs b/server/src/database.rs index 60b62992a0..1cbbe5eee6 100644 --- a/server/src/database.rs +++ b/server/src/database.rs @@ -21,7 +21,7 @@ use internal_types::freezable::Freezable; use iox_object_store::IoxObjectStore; use observability_deps::tracing::{error, info, warn}; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; -use parquet_file::catalog::api::PreservedCatalog; +use parquet_file::catalog::core::PreservedCatalog; use persistence_windows::checkpoint::ReplayPlan; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::{future::Future, sync::Arc, time::Duration}; @@ -63,7 +63,7 @@ pub enum Error { ))] WipePreservedCatalog { db_name: String, - source: Box, + source: Box, }, #[snafu(display("failed to skip replay for database ({}): {}", db_name, source))] diff --git a/server/src/db.rs b/server/src/db.rs index 1b75e37638..22e65d36dd 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -32,8 +32,9 @@ use iox_object_store::IoxObjectStore; use mutable_buffer::chunk::{ChunkMetrics as MutableBufferChunkMetrics, MBChunk}; use observability_deps::tracing::{debug, error, info}; use parquet_file::catalog::{ - api::{CatalogParquetInfo, CheckpointData, PreservedCatalog}, cleanup::{delete_files as delete_parquet_files, get_unreferenced_parquet_files}, + core::PreservedCatalog, + interface::{CatalogParquetInfo, CheckpointData}, prune::prune_history as prune_catalog_transaction_history, }; use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::PersistenceWindows}; @@ -518,7 +519,7 @@ impl Db { pub async fn delete( self: &Arc, table_name: &str, - delete_predicate: &Predicate, + delete_predicate: Arc, ) -> Result<()> { // get all partitions of this table let table = self @@ -533,7 +534,7 @@ impl Db { // save the delete predicate in the chunk let mut chunk = chunk.write(); chunk - .add_delete_predicate(delete_predicate) + .add_delete_predicate(Arc::clone(&delete_predicate)) .context(AddDeletePredicateError)?; } } @@ -1331,7 +1332,7 @@ mod tests { assert_store_sequenced_entry_failures, db::{ catalog::chunk::ChunkStage, - test_helpers::{run_query, try_write_lp, write_lp}, + test_helpers::{run_query, try_write_lp, write_lp, write_lp_with_time}, }, utils::{make_db, TestDb}, }; @@ -1875,9 +1876,8 @@ mod tests { let test_db = make_db().await; let db = Arc::new(test_db.db); - let time0 = Utc::now(); - write_lp(db.as_ref(), "cpu bar=1 10").await; - let time1 = Utc::now(); + let t_write1 = Utc::now(); + write_lp_with_time(db.as_ref(), "cpu bar=1 10", t_write1).await; let partition_key = "1970-01-01T00"; let mb_chunk = db @@ -1892,14 +1892,12 @@ mod tests { let first_old_rb_write = 
old_rb_chunk.time_of_first_write(); let last_old_rb_write = old_rb_chunk.time_of_last_write(); - assert!(time0 < first_old_rb_write); assert_eq!(first_old_rb_write, last_old_rb_write); - assert!(first_old_rb_write < time1); + assert_eq!(first_old_rb_write, t_write1); // Put new data into the mutable buffer - let time2 = Utc::now(); - write_lp(db.as_ref(), "cpu bar=2 20").await; - let time3 = Utc::now(); + let t_write2 = Utc::now(); + write_lp_with_time(db.as_ref(), "cpu bar=2 20", t_write2).await; // now, compact it let compacted_rb_chunk = db.compact_partition("cpu", partition_key).await.unwrap(); @@ -1917,8 +1915,7 @@ mod tests { let last_compacted_write = compacted_rb_chunk.time_of_last_write(); assert_eq!(first_old_rb_write, first_compacted_write); assert_ne!(last_old_rb_write, last_compacted_write); - assert!(time2 < last_compacted_write); - assert!(last_compacted_write < time3); + assert_eq!(last_compacted_write, t_write2); // data should be readable let expected = vec![ @@ -1935,7 +1932,7 @@ mod tests { async fn collect_read_filter(chunk: &DbChunk) -> Vec { chunk - .read_filter(&Default::default(), Selection::All) + .read_filter(&Default::default(), Selection::All, &[]) .unwrap() .collect::>() .await @@ -2483,12 +2480,16 @@ mod tests { #[tokio::test] async fn partition_chunk_summaries_timestamp() { let db = Arc::new(make_db().await.db); - let start = Utc::now(); - write_lp(&db, "cpu bar=1 1").await; - let after_first_write = Utc::now(); - write_lp(&db, "cpu bar=2 2").await; + + let t_first_write = Utc::now(); + write_lp_with_time(&db, "cpu bar=1 1", t_first_write).await; + + let t_second_write = Utc::now(); + write_lp_with_time(&db, "cpu bar=2 2", t_second_write).await; + + let t_close_before = Utc::now(); db.rollover_partition("cpu", "1970-01-01T00").await.unwrap(); - let after_close = Utc::now(); + let t_close_after = Utc::now(); let mut chunk_summaries = db.chunk_summaries().unwrap(); @@ -2496,59 +2497,18 @@ mod tests { let summary = &chunk_summaries[0]; assert_eq!(summary.id, 0, "summary; {:#?}", summary); - assert!( - summary.time_of_first_write > start, - "summary; {:#?}", - summary - ); - assert!( - summary.time_of_first_write < after_close, - "summary; {:#?}", - summary - ); - - assert!( - summary.time_of_last_write > after_first_write, - "summary; {:#?}", - summary - ); - assert!( - summary.time_of_last_write < after_close, - "summary; {:#?}", - summary - ); - - assert!( - summary.time_closed.unwrap() > after_first_write, - "summary; {:#?}", - summary - ); - assert!( - summary.time_closed.unwrap() < after_close, - "summary; {:#?}", - summary - ); + assert_eq!(summary.time_of_first_write, t_first_write); + assert_eq!(summary.time_of_last_write, t_second_write); + assert!(t_close_before <= summary.time_closed.unwrap()); + assert!(summary.time_closed.unwrap() <= t_close_after); } - fn assert_first_last_times_eq(chunk_summary: &ChunkSummary) { + fn assert_first_last_times_eq(chunk_summary: &ChunkSummary, expected: DateTime) { let first_write = chunk_summary.time_of_first_write; let last_write = chunk_summary.time_of_last_write; assert_eq!(first_write, last_write); - } - - fn assert_first_last_times_between( - chunk_summary: &ChunkSummary, - before: DateTime, - after: DateTime, - ) { - let first_write = chunk_summary.time_of_first_write; - let last_write = chunk_summary.time_of_last_write; - - assert!(before < first_write); - assert!(before < last_write); - assert!(first_write < after); - assert!(last_write < after); + assert_eq!(first_write, expected); } fn 
assert_chunks_times_ordered(before: &ChunkSummary, after: &ChunkSummary) { @@ -2581,21 +2541,17 @@ mod tests { let db = make_db().await.db; // get three chunks: one open, one closed in mb and one close in rb - // TIME 0 --------------------------------------------------------------------------------- - let time0 = Utc::now(); // In open chunk, will end up in rb/os - write_lp(&db, "cpu bar=1 1").await; - // TIME 1 --------------------------------------------------------------------------------- - let time1 = Utc::now(); + let t_write1 = Utc::now(); + write_lp_with_time(&db, "cpu bar=1 1", t_write1).await; + // Move open chunk to closed db.rollover_partition("cpu", "1970-01-01T00").await.unwrap(); - // TIME 2 --------------------------------------------------------------------------------- - let time2 = Utc::now(); + // New open chunk in mb // This point will end up in rb/os - write_lp(&db, "cpu bar=1,baz=2 2").await; - // TIME 3 --------------------------------------------------------------------------------- - let time3 = Utc::now(); + let t_write2 = Utc::now(); + write_lp_with_time(&db, "cpu bar=1,baz=2 2", t_write2).await; // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2604,19 +2560,15 @@ mod tests { // Each chunk has one write, so both chunks should have first write == last write let closed_mb_t3 = chunk_summaries[0].clone(); assert_eq!(closed_mb_t3.storage, ChunkStorage::ClosedMutableBuffer); - assert_first_last_times_eq(&closed_mb_t3); - assert_first_last_times_between(&closed_mb_t3, time0, time1); + assert_first_last_times_eq(&closed_mb_t3, t_write1); let open_mb_t3 = chunk_summaries[1].clone(); assert_eq!(open_mb_t3.storage, ChunkStorage::OpenMutableBuffer); - assert_first_last_times_eq(&open_mb_t3); - assert_first_last_times_between(&open_mb_t3, time2, time3); + assert_first_last_times_eq(&open_mb_t3, t_write2); assert_chunks_times_ordered(&closed_mb_t3, &open_mb_t3); // This point makes a new open mb chunk and will end up in the closed mb chunk - write_lp(&db, "cpu bar=1,baz=2,frob=3 400000000000000").await; - // TIME 4 --------------------------------------------------------------------------------- - // we don't need to check this value with anything because no timestamps - // should be between time3 and time4 + let t_write3 = Utc::now(); + write_lp_with_time(&db, "cpu bar=1,baz=2,frob=3 400000000000000", t_write3).await; // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2639,9 +2591,6 @@ mod tests { db.move_chunk_to_read_buffer("cpu", "1970-01-01T00", 0) .await .unwrap(); - // TIME 5 --------------------------------------------------------------------------------- - // we don't need to check this value with anything because no timestamps - // should be between time4 and time5 // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2668,9 +2617,6 @@ mod tests { ) .await .unwrap(); - // TIME 6 --------------------------------------------------------------------------------- - // we don't need to check this value with anything because no timestamps - // should be between time5 and time6 // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2692,8 +2638,6 @@ mod tests { // Move open chunk to closed 
db.rollover_partition("cpu", "1970-01-05T15").await.unwrap(); - // TIME 7 --------------------------------------------------------------------------------- - let time7 = Utc::now(); // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2710,9 +2654,8 @@ mod tests { // New open chunk in mb // This point will stay in this open mb chunk - write_lp(&db, "cpu bar=1,baz=3,blargh=3 400000000000000").await; - // TIME 8 --------------------------------------------------------------------------------- - let time8 = Utc::now(); + let t_write4 = Utc::now(); + write_lp_with_time(&db, "cpu bar=1,baz=3,blargh=3 400000000000000", t_write4).await; // Check first/last write times on the chunks at this point let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return"); @@ -2730,8 +2673,7 @@ mod tests { // times should be the same let open_mb_t8 = chunk_summaries[2].clone(); assert_eq!(open_mb_t8.storage, ChunkStorage::OpenMutableBuffer); - assert_first_last_times_eq(&open_mb_t8); - assert_first_last_times_between(&open_mb_t8, time7, time8); + assert_first_last_times_eq(&open_mb_t8, t_write4); let lifecycle_action = None; diff --git a/server/src/db/catalog/chunk.rs b/server/src/db/catalog/chunk.rs index dcba0b4576..bf474d3403 100644 --- a/server/src/db/catalog/chunk.rs +++ b/server/src/db/catalog/chunk.rs @@ -16,7 +16,7 @@ use internal_types::{access::AccessRecorder, schema::Schema}; use mutable_buffer::chunk::{snapshot::ChunkSnapshot as MBChunkSnapshot, MBChunk}; use observability_deps::tracing::debug; use parquet_file::chunk::ParquetChunk; -use predicate::predicate::{Predicate, PredicateBuilder}; +use predicate::predicate::Predicate; use read_buffer::RBChunk; use tracker::{TaskRegistration, TaskTracker}; @@ -80,7 +80,7 @@ pub struct ChunkMetadata { pub schema: Arc, /// Delete predicates of this chunk - pub delete_predicates: Arc>, + pub delete_predicates: Vec>, } /// Different memory representations of a frozen chunk. @@ -307,14 +307,14 @@ impl CatalogChunk { time_of_last_write: DateTime, schema: Arc, metrics: ChunkMetrics, - delete_predicates: Arc>, + delete_predicates: Vec>, order: ChunkOrder, ) -> Self { let stage = ChunkStage::Frozen { meta: Arc::new(ChunkMetadata { table_summary: Arc::new(chunk.table_summary()), schema, - delete_predicates: Arc::clone(&delete_predicates), + delete_predicates, }), representation: ChunkStageFrozenRepr::ReadBuffer(Arc::new(chunk)), }; @@ -342,7 +342,7 @@ impl CatalogChunk { time_of_first_write: DateTime, time_of_last_write: DateTime, metrics: ChunkMetrics, - delete_predicates: Arc>, + delete_predicates: Vec>, order: ChunkOrder, ) -> Self { assert_eq!(chunk.table_name(), addr.table_name.as_ref()); @@ -469,30 +469,24 @@ impl CatalogChunk { } } - pub fn add_delete_predicate(&mut self, delete_predicate: &Predicate) -> Result<()> { + pub fn add_delete_predicate(&mut self, delete_predicate: Arc) -> Result<()> { + debug!( + ?delete_predicate, + "Input delete predicate to CatalogChunk add_delete_predicate" + ); match &mut self.stage { ChunkStage::Open { mb_chunk: _ } => { // Freeze/close this chunk and add delete_predicate to its frozen one self.freeze_with_predicate(delete_predicate)?; } - ChunkStage::Frozen { meta, .. } => { + ChunkStage::Frozen { meta, .. } | ChunkStage::Persisted { meta, .. 
} => {
                 // Add the delete_predicate into the chunk's metadata
-                let mut del_preds: Vec<Predicate> = (*meta.delete_predicates).clone();
-                del_preds.push(delete_predicate.clone());
+                let mut del_preds = meta.delete_predicates.clone();
+                del_preds.push(delete_predicate);
                 *meta = Arc::new(ChunkMetadata {
                     table_summary: Arc::clone(&meta.table_summary),
                     schema: Arc::clone(&meta.schema),
-                    delete_predicates: Arc::new(del_preds),
-                });
-            }
-            ChunkStage::Persisted { meta, .. } => {
-                // Add the delete_predicate into the chunk's metadata
-                let mut del_preds: Vec<Predicate> = (*meta.delete_predicates).clone();
-                del_preds.push(delete_predicate.clone());
-                *meta = Arc::new(ChunkMetadata {
-                    table_summary: Arc::clone(&meta.table_summary),
-                    schema: Arc::clone(&meta.schema),
-                    delete_predicates: Arc::new(del_preds),
+                    delete_predicates: del_preds,
                 });
             }
         }
@@ -500,14 +494,23 @@ impl CatalogChunk {
         Ok(())
     }
 
-    pub fn delete_predicates(&mut self) -> Arc<Vec<Predicate>> {
+    pub fn delete_predicates(&mut self) -> &[Arc<Predicate>] {
         match &self.stage {
             ChunkStage::Open { mb_chunk: _ } => {
                 // no delete predicate for open chunk
-                Arc::new(vec![])
+                debug!("delete_predicates of Open chunk is empty");
+                &[]
+            }
+            ChunkStage::Frozen { meta, .. } => {
+                let preds = &meta.delete_predicates;
+                debug!(?preds, "delete_predicates of Frozen chunk");
+                preds
+            }
+            ChunkStage::Persisted { meta, .. } => {
+                let preds = &meta.delete_predicates;
+                debug!(?preds, "delete_predicates of Persisted chunk");
+                preds
             }
-            ChunkStage::Frozen { meta, .. } => Arc::clone(&meta.delete_predicates),
-            ChunkStage::Persisted { meta, .. } => Arc::clone(&meta.delete_predicates),
         }
     }
 
@@ -679,7 +682,14 @@ impl CatalogChunk {
     ///
     /// This only works for chunks in the _open_ stage (chunk is converted) and the _frozen_ stage
     /// (no-op) and will fail for other stages.
-    pub fn freeze_with_predicate(&mut self, delete_predicate: &Predicate) -> Result<()> {
+    pub fn freeze_with_predicate(&mut self, delete_predicate: Arc<Predicate>) -> Result<()> {
+        self.freeze_with_delete_predicates(vec![delete_predicate])
+    }
+
+    fn freeze_with_delete_predicates(
+        &mut self,
+        delete_predicates: Vec<Arc<Predicate>>,
+    ) -> Result<()> {
         match &self.stage {
             ChunkStage::Open { mb_chunk, ..
} => { debug!(%self.addr, row_count=mb_chunk.rows(), "freezing chunk"); @@ -692,7 +702,7 @@ impl CatalogChunk { let metadata = ChunkMetadata { table_summary: Arc::new(mb_chunk.table_summary()), schema: s.full_schema(), - delete_predicates: Arc::new(vec![delete_predicate.clone()]), + delete_predicates, }; self.stage = ChunkStage::Frozen { @@ -714,7 +724,7 @@ impl CatalogChunk { } pub fn freeze(&mut self) -> Result<()> { - self.freeze_with_predicate(&PredicateBuilder::default().build()) + self.freeze_with_delete_predicates(vec![]) } /// Set the chunk to the Moving state, returning a handle to the underlying storage @@ -776,7 +786,7 @@ impl CatalogChunk { *meta = Arc::new(ChunkMetadata { table_summary: Arc::clone(&meta.table_summary), schema, - delete_predicates: Arc::clone(&meta.delete_predicates), + delete_predicates: meta.delete_predicates.clone(), }); match &representation { @@ -1151,7 +1161,7 @@ mod tests { expected_exprs1.push(e); // Add a delete predicate into a chunk the open chunk = delete simulation for open chunk - chunk.add_delete_predicate(&del_pred1).unwrap(); + chunk.add_delete_predicate(Arc::new(del_pred1)).unwrap(); // chunk must be in frozen stage now assert_eq!(chunk.stage().name(), "Frozen"); // chunk must have a delete predicate @@ -1182,7 +1192,7 @@ mod tests { let mut expected_exprs2 = vec![]; let e = col("cost").not_eq(lit(15)); expected_exprs2.push(e); - chunk.add_delete_predicate(&del_pred2).unwrap(); + chunk.add_delete_predicate(Arc::new(del_pred2)).unwrap(); // chunk still must be in frozen stage now assert_eq!(chunk.stage().name(), "Frozen"); // chunk must have 2 delete predicates @@ -1248,7 +1258,7 @@ mod tests { now, now, ChunkMetrics::new_unregistered(), - Arc::new(vec![] as Vec), + vec![], ChunkOrder::new(6), ) } diff --git a/server/src/db/catalog/partition.rs b/server/src/db/catalog/partition.rs index 3c989e6a7f..bcd07e8435 100644 --- a/server/src/db/catalog/partition.rs +++ b/server/src/db/catalog/partition.rs @@ -176,7 +176,7 @@ impl Partition { time_of_first_write: DateTime, time_of_last_write: DateTime, schema: Arc, - delete_predicates: Arc>, + delete_predicates: Vec>, chunk_order: ChunkOrder, ) -> (u32, Arc>) { let chunk_id = Self::pick_next(&mut self.next_chunk_id, "Chunk ID Overflow"); @@ -231,7 +231,7 @@ impl Partition { chunk: Arc, time_of_first_write: DateTime, time_of_last_write: DateTime, - delete_predicates: Arc>, + delete_predicates: Vec>, chunk_order: ChunkOrder, ) -> Arc> { assert_eq!(chunk.table_name(), self.table_name()); @@ -246,7 +246,7 @@ impl Partition { time_of_first_write, time_of_last_write, self.metrics.new_chunk_metrics(), - Arc::clone(&delete_predicates), + delete_predicates, chunk_order, )), ); diff --git a/server/src/db/chunk.rs b/server/src/db/chunk.rs index 854c219180..b9386d93c4 100644 --- a/server/src/db/chunk.rs +++ b/server/src/db/chunk.rs @@ -121,7 +121,7 @@ impl DbChunk { let meta = ChunkMetadata { table_summary: Arc::new(mb_chunk.table_summary()), schema: snapshot.full_schema(), - delete_predicates: Arc::new(vec![]), //todo: consider to use the one of the given chunk if appropriate + delete_predicates: vec![], // open chunk does not have delete predicate }; (state, Arc::new(meta)) } @@ -224,6 +224,19 @@ impl DbChunk { pub fn time_of_last_write(&self) -> DateTime { self.time_of_last_write } + + pub fn to_rub_negated_predicates( + delete_predicates: &[Arc], + ) -> Result> { + let mut rub_preds: Vec = vec![]; + for pred in delete_predicates { + let rub_pred = to_read_buffer_predicate(pred).context(PredicateConversion)?; + 
rub_preds.push(rub_pred); + } + + debug!(?rub_preds, "RUB delete predicates"); + Ok(rub_preds) + } } impl QueryChunk for DbChunk { @@ -314,11 +327,11 @@ impl QueryChunk for DbChunk { Ok(pred_result) } - // NGA todo: add delete predicate here to eliminate data at query time fn read_filter( &self, predicate: &Predicate, selection: Selection<'_>, + delete_predicates: &[Arc], ) -> Result { // Predicate is not required to be applied for correctness. We only pushed it down // when possible for performance gain @@ -326,6 +339,24 @@ impl QueryChunk for DbChunk { debug!(?predicate, "Input Predicate to read_filter"); self.access_recorder.record_access_now(); + debug!(?delete_predicates, "Input Delete Predicates to read_filter"); + + // add negated deleted ranges to the predicate + let mut pred_with_deleted_ranges = predicate.clone(); + pred_with_deleted_ranges.add_delete_ranges(delete_predicates); + debug!( + ?pred_with_deleted_ranges, + "Input Predicate plus deleted ranges" + ); + + // add negated deleted predicates + let mut pred_wth_deleted_exprs = pred_with_deleted_ranges.clone(); + pred_wth_deleted_exprs.add_delete_exprs(delete_predicates); + debug!( + ?pred_wth_deleted_exprs, + "Input Predicate plus deleted ranges and deleted predicates" + ); + match &self.state { State::MutableBuffer { chunk, .. } => { let batch = chunk.read_filter(selection).context(MutableBufferChunk)?; @@ -339,12 +370,16 @@ impl QueryChunk for DbChunk { Ok(predicate) => predicate, Err(_) => read_buffer::Predicate::default(), }; - debug!(?rb_predicate, "Predicate pushed down to RUB"); - // TODO: add collection of delete predicates associated with - // this chunk. - let read_results = chunk.read_filter(rb_predicate, selection, vec![]); + // combine all delete expressions to RUB's negated ones + let negated_delete_exprs = Self::to_rub_negated_predicates(delete_predicates)?; + debug!( + ?negated_delete_exprs, + "Negated Predicate pushed down to RUB" + ); + + let read_results = chunk.read_filter(rb_predicate, selection, negated_delete_exprs); let schema = chunk .read_filter_table_schema(selection) @@ -357,13 +392,11 @@ impl QueryChunk for DbChunk { schema.into(), ))) } - State::ParquetFile { chunk, .. } => { - chunk - .read_filter(predicate, selection) - .context(ParquetFileChunkError { - chunk_id: self.id(), - }) - } + State::ParquetFile { chunk, .. 
} => chunk + .read_filter(&pred_wth_deleted_exprs, selection) + .context(ParquetFileChunkError { + chunk_id: self.id(), + }), } } @@ -503,8 +536,11 @@ impl QueryChunkMeta for DbChunk { } // return a reference to delete predicates of the chunk - fn delete_predicates(&self) -> Arc> { - Arc::clone(&self.meta.delete_predicates) + fn delete_predicates(&self) -> &[Arc] { + let pred = &self.meta.delete_predicates; + debug!(?pred, "Delete predicate in DbChunk"); + + pred } } @@ -514,7 +550,7 @@ mod tests { use crate::{ db::{ catalog::chunk::{CatalogChunk, ChunkStage}, - test_helpers::write_lp, + test_helpers::{write_lp, write_lp_with_time}, }, utils::make_db, }; @@ -527,7 +563,7 @@ mod tests { let t2 = chunk.access_recorder().get_metrics(); snapshot - .read_filter(&Default::default(), Selection::All) + .read_filter(&Default::default(), Selection::All, &[]) .unwrap(); let t3 = chunk.access_recorder().get_metrics(); @@ -608,9 +644,8 @@ mod tests { async fn parquet_records_access() { let db = make_db().await.db; - let before_creation = Utc::now(); - write_lp(&db, "cpu,tag=1 bar=1 1").await; - let after_creation = Utc::now(); + let creation_time = Utc::now(); + write_lp_with_time(&db, "cpu,tag=1 bar=1 1", creation_time).await; let id = db .persist_partition( @@ -632,8 +667,7 @@ mod tests { let first_write = chunk.time_of_first_write(); let last_write = chunk.time_of_last_write(); assert_eq!(first_write, last_write); - assert!(before_creation < first_write); - assert!(last_write < after_creation); + assert_eq!(first_write, creation_time); test_chunk_access(&chunk).await } diff --git a/server/src/db/lifecycle/compact.rs b/server/src/db/lifecycle/compact.rs index 576fa328a3..b661dfc546 100644 --- a/server/src/db/lifecycle/compact.rs +++ b/server/src/db/lifecycle/compact.rs @@ -45,7 +45,7 @@ pub(crate) fn compact_chunks( let mut input_rows = 0; let mut time_of_first_write: Option> = None; let mut time_of_last_write: Option> = None; - let mut delete_predicates: Vec = vec![]; + let mut delete_predicates: Vec> = vec![]; let mut min_order = ChunkOrder::MAX; let query_chunks = chunks .into_iter() @@ -66,8 +66,7 @@ pub(crate) fn compact_chunks( .map(|prev_last| prev_last.max(candidate_last)) .or(Some(candidate_last)); - let mut preds = (*chunk.delete_predicates()).clone(); - delete_predicates.append(&mut preds); + delete_predicates.extend(chunk.delete_predicates().iter().cloned()); min_order = min_order.min(chunk.order()); @@ -103,7 +102,7 @@ pub(crate) fn compact_chunks( ReorgPlanner::new().compact_plan(schema, query_chunks.iter().map(Arc::clone), key)?; let physical_plan = ctx.prepare_plan(&plan)?; - let stream = ctx.execute(physical_plan).await?; + let stream = ctx.execute_stream(physical_plan).await?; let rb_chunk = collect_rub(stream, &addr, metric_registry.as_ref()) .await? 
.expect("chunk has zero rows"); @@ -119,7 +118,7 @@ pub(crate) fn compact_chunks( time_of_first_write, time_of_last_write, schema, - Arc::new(delete_predicates), + delete_predicates, min_order, ) }; @@ -148,7 +147,7 @@ pub(crate) fn compact_chunks( #[cfg(test)] mod tests { use super::*; - use crate::{db::test_helpers::write_lp, utils::make_db}; + use crate::{db::test_helpers::write_lp_with_time, utils::make_db}; use data_types::chunk_metadata::ChunkStorage; use lifecycle::{LockableChunk, LockablePartition}; use query::QueryDatabase; @@ -158,15 +157,13 @@ mod tests { let test_db = make_db().await; let db = test_db.db; - let time0 = Utc::now(); - - write_lp(db.as_ref(), "cpu,tag1=cupcakes bar=1 10").await; - write_lp(db.as_ref(), "cpu,tag1=asfd,tag2=foo bar=2 20").await; - write_lp(db.as_ref(), "cpu,tag1=bingo,tag2=foo bar=2 10").await; - write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 20").await; - write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 10").await; - - let time1 = Utc::now(); + let t_first_write = Utc::now(); + write_lp_with_time(db.as_ref(), "cpu,tag1=cupcakes bar=1 10", t_first_write).await; + write_lp_with_time(db.as_ref(), "cpu,tag1=asfd,tag2=foo bar=2 20", Utc::now()).await; + write_lp_with_time(db.as_ref(), "cpu,tag1=bingo,tag2=foo bar=2 10", Utc::now()).await; + write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 20", Utc::now()).await; + let t_last_write = Utc::now(); + write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 10", t_last_write).await; let partition_keys = db.partition_keys().unwrap(); assert_eq!(partition_keys.len(), 1); @@ -182,9 +179,8 @@ mod tests { let (_, fut) = compact_chunks(partition.upgrade(), vec![chunk.upgrade()]).unwrap(); // NB: perform the write before spawning the background task that performs the compaction - let time2 = Utc::now(); - write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 40").await; - let time3 = Utc::now(); + let t_later_write = Utc::now(); + write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 40", t_later_write).await; tokio::spawn(fut).await.unwrap().unwrap().unwrap(); let mut chunk_summaries: Vec<_> = db_partition.read().chunk_summaries().collect(); @@ -194,16 +190,14 @@ mod tests { let mub_summary = &chunk_summaries[0]; let first_mub_write = mub_summary.time_of_first_write; let last_mub_write = mub_summary.time_of_last_write; - assert!(time2 < first_mub_write); assert_eq!(first_mub_write, last_mub_write); - assert!(first_mub_write < time3); + assert_eq!(first_mub_write, t_later_write); let rub_summary = &chunk_summaries[1]; let first_rub_write = rub_summary.time_of_first_write; let last_rub_write = rub_summary.time_of_last_write; - assert!(time0 < first_rub_write); - assert!(first_rub_write < last_rub_write); - assert!(last_rub_write < time1); + assert_eq!(first_rub_write, t_first_write); + assert_eq!(last_rub_write, t_last_write); let summaries: Vec<_> = chunk_summaries .iter() diff --git a/server/src/db/lifecycle/error.rs b/server/src/db/lifecycle/error.rs index 4f9a15838b..036edea016 100644 --- a/server/src/db/lifecycle/error.rs +++ b/server/src/db/lifecycle/error.rs @@ -49,7 +49,7 @@ pub enum Error { #[snafu(display("Error while commiting transaction on preserved catalog: {}", source))] CommitError { - source: parquet_file::catalog::api::Error, + source: parquet_file::catalog::core::Error, }, #[snafu(display("Cannot write chunk: {}", addr))] diff --git a/server/src/db/lifecycle/move_chunk.rs b/server/src/db/lifecycle/move_chunk.rs index e0ddf48b36..2dbe716eb1 100644 --- 
a/server/src/db/lifecycle/move_chunk.rs +++ b/server/src/db/lifecycle/move_chunk.rs @@ -54,7 +54,7 @@ pub fn move_chunk_to_read_buffer( ReorgPlanner::new().compact_plan(schema, query_chunks.iter().map(Arc::clone), key)?; let physical_plan = ctx.prepare_plan(&plan)?; - let stream = ctx.execute(physical_plan).await?; + let stream = ctx.execute_stream(physical_plan).await?; let rb_chunk = collect_rub( stream, &addr.clone().into_partition(), diff --git a/server/src/db/lifecycle/persist.rs b/server/src/db/lifecycle/persist.rs index 8bf5bddc78..7fa38818f7 100644 --- a/server/src/db/lifecycle/persist.rs +++ b/server/src/db/lifecycle/persist.rs @@ -52,7 +52,7 @@ pub fn persist_chunks( let mut time_of_first_write: Option> = None; let mut time_of_last_write: Option> = None; let mut query_chunks = vec![]; - let mut delete_predicates: Vec = vec![]; + let mut delete_predicates: Vec> = vec![]; let mut min_order = ChunkOrder::MAX; for mut chunk in chunks { // Sanity-check @@ -72,8 +72,7 @@ pub fn persist_chunks( .map(|prev_last| prev_last.max(candidate_last)) .or(Some(candidate_last)); - let mut preds = (*chunk.delete_predicates()).clone(); - delete_predicates.append(&mut preds); + delete_predicates.extend(chunk.delete_predicates().iter().cloned()); min_order = min_order.min(chunk.order()); @@ -112,8 +111,10 @@ pub fn persist_chunks( "Expected split plan to produce exactly 2 partitions" ); - let to_persist_stream = ctx.execute_partition(Arc::clone(&physical_plan), 0).await?; - let remainder_stream = ctx.execute_partition(physical_plan, 1).await?; + let to_persist_stream = ctx + .execute_stream_partitioned(Arc::clone(&physical_plan), 0) + .await?; + let remainder_stream = ctx.execute_stream_partitioned(physical_plan, 1).await?; let (to_persist, remainder) = futures::future::try_join( collect_rub(to_persist_stream, &addr, metric_registry.as_ref()), @@ -131,7 +132,6 @@ pub fn persist_chunks( partition_write.force_drop_chunk(id) } - let del_preds = Arc::new(delete_predicates); // Upsert remainder to catalog if let Some(remainder) = remainder { partition_write.create_rub_chunk( @@ -139,11 +139,13 @@ pub fn persist_chunks( time_of_first_write, time_of_last_write, Arc::clone(&schema), - Arc::clone(&del_preds), + delete_predicates.clone(), min_order, ); } + // NGA todo: we hit this error if there are rows but they are deleted + // Need to think a way to handle this (https://github.com/influxdata/influxdb_iox/issues/2546) let to_persist = to_persist.expect("should be rows to persist"); let (new_chunk_id, new_chunk) = partition_write.create_rub_chunk( @@ -151,7 +153,7 @@ pub fn persist_chunks( time_of_first_write, time_of_last_write, schema, - del_preds, + delete_predicates, min_order, ); let to_persist = LockableCatalogChunk { diff --git a/server/src/db/lifecycle/write.rs b/server/src/db/lifecycle/write.rs index 1b28f8f1e2..bfd7b7df71 100644 --- a/server/src/db/lifecycle/write.rs +++ b/server/src/db/lifecycle/write.rs @@ -17,7 +17,7 @@ use data_types::{chunk_metadata::ChunkLifecycleAction, job::Job}; use internal_types::selection::Selection; use observability_deps::tracing::{debug, warn}; use parquet_file::{ - catalog::api::CatalogParquetInfo, + catalog::interface::CatalogParquetInfo, chunk::{ChunkMetrics as ParquetChunkMetrics, ParquetChunk}, metadata::IoxMetadata, storage::Storage, @@ -26,7 +26,7 @@ use persistence_windows::{ checkpoint::{DatabaseCheckpoint, PartitionCheckpoint, PersistCheckpointBuilder}, persistence_windows::FlushHandle, }; -use query::QueryChunk; +use query::{QueryChunk, QueryChunkMeta}; use 
snafu::ResultExt; use std::{future::Future, sync::Arc}; use tracker::{TaskTracker, TrackedFuture, TrackedFutureExt}; @@ -89,7 +89,11 @@ pub(super) fn write_chunk_to_object_store( // Get RecordBatchStream of data from the read buffer chunk let stream = db_chunk - .read_filter(&Default::default(), Selection::All) + .read_filter( + &Default::default(), + Selection::All, + db_chunk.delete_predicates(), + ) .expect("read filter should be infallible"); // check that the upcoming state change will very likely succeed diff --git a/server/src/db/load.rs b/server/src/db/load.rs index 1afd1b0eb0..d0cdd1817d 100644 --- a/server/src/db/load.rs +++ b/server/src/db/load.rs @@ -1,11 +1,17 @@ //! Functionality to load a [`Catalog`](crate::db::catalog::Catalog) and other information from a -//! [`PreservedCatalog`](parquet_file::catalog::api::PreservedCatalog). +//! [`PreservedCatalog`](parquet_file::catalog::core::PreservedCatalog). use super::catalog::{chunk::ChunkStage, table::TableSchemaUpsertHandle, Catalog}; use iox_object_store::{IoxObjectStore, ParquetFilePath}; use observability_deps::tracing::{error, info}; use parquet_file::{ - catalog::api::{CatalogParquetInfo, CatalogState, ChunkCreationFailed, PreservedCatalog}, + catalog::{ + core::PreservedCatalog, + interface::{ + CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError, + ChunkCreationFailed, + }, + }, chunk::{ChunkMetrics as ParquetChunkMetrics, ParquetChunk}, }; use persistence_windows::checkpoint::{ReplayPlan, ReplayPlanner}; @@ -22,17 +28,17 @@ pub enum Error { #[snafu(display("Cannot create new empty preserved catalog: {}", source))] CannotCreateCatalog { - source: parquet_file::catalog::api::Error, + source: parquet_file::catalog::core::Error, }, #[snafu(display("Cannot load preserved catalog: {}", source))] CannotLoadCatalog { - source: parquet_file::catalog::api::Error, + source: parquet_file::catalog::core::Error, }, #[snafu(display("Cannot wipe preserved catalog: {}", source))] CannotWipeCatalog { - source: parquet_file::catalog::api::Error, + source: parquet_file::catalog::core::Error, }, } pub type Result = std::result::Result; @@ -166,8 +172,10 @@ impl CatalogState for Loader { &mut self, iox_object_store: Arc, info: CatalogParquetInfo, - ) -> parquet_file::catalog::api::Result<()> { - use parquet_file::catalog::api::{MetadataExtractFailed, ReplayPlanError, SchemaError}; + ) -> Result<(), CatalogStateAddError> { + use parquet_file::catalog::interface::{ + MetadataExtractFailed, ReplayPlanError, SchemaError, + }; // extract relevant bits from parquet file metadata let iox_md = info @@ -212,15 +220,13 @@ impl CatalogState for Loader { .get_or_create_partition(&iox_md.table_name, &iox_md.partition_key); let mut partition = partition.write(); if partition.chunk(iox_md.chunk_id).is_some() { - return Err( - parquet_file::catalog::api::Error::ParquetFileAlreadyExists { path: info.path }, - ); + return Err(CatalogStateAddError::ParquetFileAlreadyExists { path: info.path }); } let schema_handle = TableSchemaUpsertHandle::new(&table_schema, &parquet_chunk.schema()) .map_err(|e| Box::new(e) as _) .context(SchemaError { path: info.path })?; - let delete_predicates: Arc> = Arc::new(vec![]); // NGA todo: After Marco save delete predicate into the catalog, it will need to extract into this variable + let delete_predicates: Vec> = vec![]; // NGA todo: After Marco saves delete predicate into the catalog, it will need to get extracted into this variable partition.insert_object_store_only_chunk( iox_md.chunk_id, 
parquet_chunk, @@ -234,7 +240,7 @@ impl CatalogState for Loader { Ok(()) } - fn remove(&mut self, path: &ParquetFilePath) -> parquet_file::catalog::api::Result<()> { + fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> { let mut removed_any = false; for partition in self.catalog.partitions() { @@ -261,7 +267,7 @@ impl CatalogState for Loader { if removed_any { Ok(()) } else { - Err(parquet_file::catalog::api::Error::ParquetFileDoesNotExist { path: path.clone() }) + Err(CatalogStateRemoveError::ParquetFileDoesNotExist { path: path.clone() }) } } } @@ -273,7 +279,7 @@ mod tests { use data_types::{server_id::ServerId, DatabaseName}; use object_store::ObjectStore; use parquet_file::catalog::{ - api::CheckpointData, + interface::CheckpointData, test_helpers::{assert_catalog_state_implementation, TestCatalogState}, }; use std::convert::TryFrom; diff --git a/server/src/lib.rs b/server/src/lib.rs index 10f29cd73b..804aee59d2 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -1195,7 +1195,7 @@ mod tests { path::{parsed::DirsAndFileName, ObjectStorePath}, ObjectStore, ObjectStoreApi, }; - use parquet_file::catalog::{api::PreservedCatalog, test_helpers::TestCatalogState}; + use parquet_file::catalog::{core::PreservedCatalog, test_helpers::TestCatalogState}; use query::{exec::ExecutionContextProvider, frontend::sql::SqlQueryPlanner, QueryDatabase}; use std::{ convert::{TryFrom, TryInto}, diff --git a/src/commands/database/partition.rs b/src/commands/database/partition.rs index 22046f8046..72d8dd9988 100644 --- a/src/commands/database/partition.rs +++ b/src/commands/database/partition.rs @@ -1,6 +1,5 @@ //! This module implements the `partition` CLI command use data_types::chunk_metadata::ChunkSummary; -use data_types::job::Operation; use generated_types::google::FieldViolation; use influxdb_iox_client::{ connection::Connection, @@ -10,7 +9,7 @@ use influxdb_iox_client::{ PersistPartitionError, UnloadPartitionChunkError, }, }; -use std::convert::{TryFrom, TryInto}; +use std::convert::TryFrom; use structopt::StructOpt; use thiserror::Error; @@ -283,10 +282,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { chunk_id, } = close_chunk; - let operation: Operation = client + let operation = client .close_partition_chunk(db_name, table_name, partition_key, chunk_id) - .await? - .try_into()?; + .await?; serde_json::to_writer_pretty(std::io::stdout(), &operation)?; } diff --git a/src/commands/database/recover.rs b/src/commands/database/recover.rs index 2df1dee01b..5b9b8aa6eb 100644 --- a/src/commands/database/recover.rs +++ b/src/commands/database/recover.rs @@ -1,6 +1,3 @@ -use std::convert::TryInto; - -use data_types::job::Operation; use generated_types::google::FieldViolation; use influxdb_iox_client::{connection::Connection, management}; use snafu::{ResultExt, Snafu}; @@ -74,12 +71,10 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { return Err(Error::NeedsTheForceError); } - let operation: Operation = client + let operation = client .wipe_persisted_catalog(db_name) .await - .context(WipeError)? 
- .try_into() - .context(InvalidResponse)?; + .context(WipeError)?; serde_json::to_writer_pretty(std::io::stdout(), &operation).context(WritingJson)?; } diff --git a/src/commands/operations.rs b/src/commands/operations.rs index 44d284f0f2..af741e446d 100644 --- a/src/commands/operations.rs +++ b/src/commands/operations.rs @@ -1,11 +1,8 @@ -use data_types::job::Operation; -use generated_types::google::FieldViolation; use influxdb_iox_client::{ connection::Connection, management, operations::{self, Client}, }; -use std::convert::TryInto; use structopt::StructOpt; use thiserror::Error; @@ -15,9 +12,6 @@ pub enum Error { #[error("Client error: {0}")] ClientError(#[from] operations::Error), - #[error("Received invalid response: {0}")] - InvalidResponse(#[from] FieldViolation), - #[error("Failed to create dummy job: {0}")] CreateDummyJobError(#[from] management::CreateDummyJobError), @@ -68,29 +62,16 @@ enum Command { pub async fn command(connection: Connection, config: Config) -> Result<()> { match config.command { Command::List => { - let result: Result, _> = Client::new(connection) - .list_operations() - .await? - .into_iter() - .map(|c| c.operation()) - .map(TryInto::try_into) - .collect(); - let operations = result?; + let operations = Client::new(connection).list_operations().await?; serde_json::to_writer_pretty(std::io::stdout(), &operations)?; } Command::Get { id } => { - let operation: Operation = Client::new(connection) - .get_operation(id) - .await? - .try_into()?; + let operation = Client::new(connection).get_operation(id).await?; serde_json::to_writer_pretty(std::io::stdout(), &operation)?; } Command::Wait { id, nanos } => { let timeout = nanos.map(std::time::Duration::from_nanos); - let operation: Operation = Client::new(connection) - .wait_operation(id, timeout) - .await? - .try_into()?; + let operation = Client::new(connection).wait_operation(id, timeout).await?; serde_json::to_writer_pretty(std::io::stdout(), &operation)?; } Command::Cancel { id } => { @@ -98,10 +79,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> { println!("Ok"); } Command::Test { nanos } => { - let operation: Operation = management::Client::new(connection) + let operation = management::Client::new(connection) .create_dummy_job(nanos) - .await? 
- .try_into()?; + .await?; serde_json::to_writer_pretty(std::io::stdout(), &operation)?; } } diff --git a/src/influxdb_ioxd.rs b/src/influxdb_ioxd.rs index 1a22ffea0e..afcffc61bd 100644 --- a/src/influxdb_ioxd.rs +++ b/src/influxdb_ioxd.rs @@ -753,9 +753,11 @@ mod tests { child(prepare_sql_span, "prepare_plan").unwrap(); let collect_span = child(ctx_span, "collect").unwrap(); + let execute_span = child(collect_span, "execute_stream_partitioned").unwrap(); + let coalesce_span = child(execute_span, "CoalescePartitionsEx").unwrap(); // validate spans from DataFusion ExecutionPlan are present - child(collect_span, "ProjectionExec: expr").unwrap(); + child(coalesce_span, "ProjectionExec: expr").unwrap(); let database_not_found = root_spans[3]; assert_eq!(database_not_found.status, SpanStatus::Err); diff --git a/src/influxdb_ioxd/http.rs b/src/influxdb_ioxd/http.rs index 76ac71d79a..04d4df9e20 100644 --- a/src/influxdb_ioxd/http.rs +++ b/src/influxdb_ioxd/http.rs @@ -28,7 +28,7 @@ use data_types::{ }; use influxdb_iox_client::format::QueryOutputFormat; use influxdb_line_protocol::parse_lines; -use query::{exec::ExecutionContextProvider, QueryDatabase}; +use query::exec::ExecutionContextProvider; use server::{ApplicationState, ConnectionManager, Error, Server as AppServer}; // External crates @@ -392,7 +392,6 @@ where .get("/health", health::) .get("/metrics", handle_metrics::) .get("/iox/api/v1/databases/:name/query", query::) - .get("/api/v1/partitions", list_partitions::) .get("/debug/pprof", pprof_home::) .get("/debug/pprof/profile", pprof_profile::) .get("/debug/pprof/allocs", pprof_heappy_profile::) @@ -644,43 +643,6 @@ async fn handle_metrics( Ok(Response::new(Body::from(body))) } -#[derive(Deserialize, Debug)] -/// Arguments in the query string of the request to /partitions -struct DatabaseInfo { - org: String, - bucket: String, -} - -#[tracing::instrument(level = "debug")] -async fn list_partitions( - req: Request, -) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>().expect("server state").app_server); - - let query = req.uri().query().context(ExpectedQueryString {})?; - - let info: DatabaseInfo = serde_urlencoded::from_str(query).context(InvalidQueryString { - query_string: query, - })?; - - let db_name = - org_and_bucket_to_database(&info.org, &info.bucket).context(BucketMappingError)?; - - let db = server.db(&db_name)?; - - let partition_keys = - db.partition_keys() - .map_err(|e| Box::new(e) as _) - .context(BucketByName { - org: &info.org, - bucket_name: &info.bucket, - })?; - - let result = serde_json::to_string(&partition_keys).context(JsonGenerationError)?; - - Ok(Response::new(Body::from(result))) -} - #[derive(Deserialize, Debug)] /// Arguments in the query string of the request to /snapshot struct SnapshotInfo { diff --git a/src/influxdb_ioxd/rpc/management.rs b/src/influxdb_ioxd/rpc/management.rs index 278ee303ac..e9ac840c8d 100644 --- a/src/influxdb_ioxd/rpc/management.rs +++ b/src/influxdb_ioxd/rpc/management.rs @@ -631,7 +631,7 @@ where del_predicate.exprs.push(expr); } - db.delete(&table_name, &del_predicate) + db.delete(&table_name, Arc::new(del_predicate)) .await .map_err(default_db_error_handler)?; } diff --git a/tests/end_to_end_cases/management_api.rs b/tests/end_to_end_cases/management_api.rs index e8c025ff0e..5bbcee73cf 100644 --- a/tests/end_to_end_cases/management_api.rs +++ b/tests/end_to_end_cases/management_api.rs @@ -9,7 +9,6 @@ use generated_types::{ }; use influxdb_iox_client::{ management::{Client, CreateDatabaseError}, - operations, 
write::WriteError, }; @@ -880,20 +879,16 @@ async fn test_close_partition_chunk() { assert_eq!(chunks[0].storage, ChunkStorage::OpenMutableBuffer as i32); // Move the chunk to read buffer - let operation = management_client + let iox_operation = management_client .close_partition_chunk(&db_name, table_name, partition_key, 0) .await .expect("new partition chunk"); - println!("Operation response is {:?}", operation); - let operation_id = operation.id(); - - let meta = operations::ClientOperation::try_new(operation) - .unwrap() - .metadata(); + println!("Operation response is {:?}", iox_operation); + let operation_id = iox_operation.operation.id(); // ensure we got a legit job description back - if let Some(Job::CloseChunk(close_chunk)) = meta.job { + if let Some(Job::CloseChunk(close_chunk)) = iox_operation.metadata.job { assert_eq!(close_chunk.db_name, db_name); assert_eq!(close_chunk.partition_key, partition_key); assert_eq!(close_chunk.chunk_id, 0); @@ -1020,20 +1015,16 @@ async fn test_wipe_preserved_catalog() { // Recover by wiping preserved catalog // - let operation = management_client + let iox_operation = management_client .wipe_persisted_catalog(&db_name) .await .expect("wipe persisted catalog"); - println!("Operation response is {:?}", operation); - let operation_id = operation.id(); - - let meta = operations::ClientOperation::try_new(operation) - .unwrap() - .metadata(); + println!("Operation response is {:?}", iox_operation); + let operation_id = iox_operation.operation.id(); // ensure we got a legit job description back - if let Some(Job::WipePreservedCatalog(wipe_persisted_catalog)) = meta.job { + if let Some(Job::WipePreservedCatalog(wipe_persisted_catalog)) = iox_operation.metadata.job { assert_eq!(wipe_persisted_catalog.db_name, db_name); } else { panic!("unexpected job returned") diff --git a/tests/end_to_end_cases/management_cli.rs b/tests/end_to_end_cases/management_cli.rs index ce03987073..8236398a34 100644 --- a/tests/end_to_end_cases/management_cli.rs +++ b/tests/end_to_end_cases/management_cli.rs @@ -1,13 +1,12 @@ -use std::sync::Arc; use std::time::Duration; use assert_cmd::Command; use predicates::prelude::*; -use data_types::chunk_metadata::ChunkAddr; -use data_types::{ - chunk_metadata::ChunkStorage, - job::{Job, Operation}, +use data_types::chunk_metadata::ChunkStorage; +use generated_types::google::longrunning::IoxOperation; +use generated_types::influxdata::iox::management::v1::{ + operation_metadata::Job, CloseChunk, WipePreservedCatalog, }; use test_helpers::make_temp_file; use write_buffer::maybe_skip_kafka_integration; @@ -112,7 +111,7 @@ async fn test_create_database() { .and(predicate::str::contains(format!(r#""name": "{}"#, db))) // validate the defaults have been set reasonably .and(predicate::str::contains("%Y-%m-%d %H:00:00")) - .and(predicate::str::contains(r#""bufferSizeHard": 104857600"#)) + .and(predicate::str::contains(r#""bufferSizeHard": "104857600""#)) .and(predicate::str::contains("lifecycleRules")), ); } @@ -147,7 +146,7 @@ async fn test_create_database_size() { .assert() .success() .stdout( - predicate::str::contains(r#""bufferSizeHard": 1000"#) + predicate::str::contains(r#""bufferSizeHard": "1000""#) .and(predicate::str::contains("lifecycleRules")), ); } @@ -765,7 +764,7 @@ async fn test_close_partition_chunk() { let lp_data = vec!["cpu,region=west user=23.2 100"]; load_lp(addr, &db_name, lp_data); - let stdout: Operation = serde_json::from_slice( + let stdout: IoxOperation = serde_json::from_slice( &Command::cargo_bin("influxdb_iox") 
.unwrap() .arg("database") @@ -784,18 +783,16 @@ async fn test_close_partition_chunk() { ) .expect("Expected JSON output"); - let expected_job = Job::CompactChunk { - chunk: ChunkAddr { - db_name: Arc::from(db_name.as_str()), - table_name: Arc::from("cpu"), - partition_key: Arc::from("cpu"), - chunk_id: 0, - }, - }; + let expected_job = Job::CloseChunk(CloseChunk { + db_name, + table_name: "cpu".to_string(), + partition_key: "cpu".to_string(), + chunk_id: 0, + }); assert_eq!( Some(expected_job), - stdout.job, + stdout.metadata.job, "operation was {:#?}", stdout ); @@ -828,7 +825,7 @@ async fn test_wipe_persisted_catalog() { let server_fixture = fixture_broken_catalog(&db_name).await; let addr = server_fixture.grpc_base(); - let stdout: Operation = serde_json::from_slice( + let stdout: IoxOperation = serde_json::from_slice( &Command::cargo_bin("influxdb_iox") .unwrap() .arg("database") @@ -845,13 +842,11 @@ async fn test_wipe_persisted_catalog() { ) .expect("Expected JSON output"); - let expected_job = Job::WipePreservedCatalog { - db_name: Arc::from(db_name.as_str()), - }; + let expected_job = Job::WipePreservedCatalog(WipePreservedCatalog { db_name }); assert_eq!( Some(expected_job), - stdout.job, + stdout.metadata.job, "operation was {:#?}", stdout ); diff --git a/tests/end_to_end_cases/operations_api.rs b/tests/end_to_end_cases/operations_api.rs index b42ce4a887..798ddad7b9 100644 --- a/tests/end_to_end_cases/operations_api.rs +++ b/tests/end_to_end_cases/operations_api.rs @@ -17,7 +17,7 @@ async fn test_operations() { let nanos = vec![Duration::from_secs(20).as_nanos() as _, 1]; - let operation = management_client + let iox_operation = management_client .create_dummy_job(nanos.clone()) .await .expect("create dummy job failed"); @@ -28,20 +28,15 @@ async fn test_operations() { .expect("list operations failed"); assert_eq!(running_ops.len(), 1); - assert_eq!(running_ops[0].name(), operation.name); + assert_eq!(running_ops[0].operation.name, iox_operation.operation.name); - let id = operation.name.parse().expect("not an integer"); + let id = iox_operation.operation.id(); + let iox_operation = operations_client.get_operation(id).await.unwrap(); - let meta = operations_client - .client_operation(id) - .await - .unwrap() - .metadata(); + let job = iox_operation.metadata.job.expect("expected a job"); - let job = meta.job.expect("expected a job"); - - assert_eq!(meta.total_count, 2); - assert_eq!(meta.pending_count, 1); + assert_eq!(iox_operation.metadata.total_count, 2); + assert_eq!(iox_operation.metadata.pending_count, 1); assert_eq!( job, operation_metadata::Job::Dummy(Dummy { @@ -51,14 +46,14 @@ async fn test_operations() { ); // Check wait times out correctly - let fetched = operations_client + let iox_operation = operations_client .wait_operation(id, Some(Duration::from_micros(10))) .await .expect("failed to wait operation"); - assert!(!fetched.done); + assert!(!iox_operation.operation.done); // Shouldn't specify wall_nanos as not complete - assert_eq!(meta.wall_nanos, 0); + assert_eq!(iox_operation.metadata.wall_nanos, 0); let wait = tokio::spawn(async move { let mut operations_client = server_fixture.operations_client(); @@ -74,18 +69,15 @@ async fn test_operations() { .expect("failed to cancel operation"); let waited = wait.await.unwrap(); - let meta = operations::ClientOperation::try_new(waited.clone()) - .unwrap() - .metadata(); - assert!(waited.done); - assert!(meta.wall_nanos > 0); - assert!(meta.cpu_nanos > 0); - assert_eq!(meta.pending_count, 0); - assert_eq!(meta.total_count, 
2); - assert_eq!(meta.cancelled_count, 1); + assert!(waited.operation.done); + assert!(waited.metadata.wall_nanos > 0); + assert!(waited.metadata.cpu_nanos > 0); + assert_eq!(waited.metadata.pending_count, 0); + assert_eq!(waited.metadata.total_count, 2); + assert_eq!(waited.metadata.cancelled_count, 1); - match waited.result { + match waited.operation.result { Some(operations::generated_types::operation::Result::Error(status)) => { assert_eq!(status.code, tonic::Code::Cancelled as i32) } diff --git a/tests/end_to_end_cases/operations_cli.rs b/tests/end_to_end_cases/operations_cli.rs index 24e63c1c22..0284e55a69 100644 --- a/tests/end_to_end_cases/operations_cli.rs +++ b/tests/end_to_end_cases/operations_cli.rs @@ -1,6 +1,7 @@ use crate::common::server_fixture::ServerFixture; use assert_cmd::Command; -use data_types::job::{Job, Operation, OperationStatus}; +use generated_types::google::longrunning::IoxOperation; +use generated_types::influxdata::iox::management::v1::{operation_metadata::Job, Dummy}; use predicates::prelude::*; #[tokio::test] @@ -9,7 +10,7 @@ async fn test_start_stop() { let addr = server_fixture.grpc_base(); let duration = std::time::Duration::from_secs(10).as_nanos() as u64; - let stdout: Operation = serde_json::from_slice( + let stdout: IoxOperation = serde_json::from_slice( &Command::cargo_bin("influxdb_iox") .unwrap() .arg("operation") @@ -24,13 +25,13 @@ async fn test_start_stop() { ) .expect("expected JSON output"); - assert_eq!(stdout.total_count, 1); - match stdout.job { - Some(Job::Dummy { nanos, .. }) => assert_eq!(nanos, vec![duration]), - _ => panic!("expected dummy job got {:?}", stdout.job), + assert_eq!(stdout.metadata.total_count, 1); + match stdout.metadata.job { + Some(Job::Dummy(Dummy { nanos, .. })) => assert_eq!(nanos, vec![duration]), + _ => panic!("expected dummy job got {:?}", stdout.metadata.job), } - let operations: Vec = serde_json::from_slice( + let operations: Vec = serde_json::from_slice( &Command::cargo_bin("influxdb_iox") .unwrap() .arg("operation") @@ -45,33 +46,33 @@ async fn test_start_stop() { .expect("expected JSON output"); assert_eq!(operations.len(), 1); - match &operations[0].job { - Some(Job::Dummy { nanos, .. }) => { + match &operations[0].metadata.job { + Some(Job::Dummy(Dummy { nanos, .. 
})) => { assert_eq!(nanos.len(), 1); assert_eq!(nanos[0], duration); } - _ => panic!("expected dummy job got {:?}", &operations[0].job), + _ => panic!("expected dummy job got {:?}", &operations[0].metadata.job), } - let id = operations[0].id; + let name = &operations[0].operation.name; Command::cargo_bin("influxdb_iox") .unwrap() .arg("operation") .arg("cancel") - .arg(id.to_string()) + .arg(name.clone()) .arg("--host") .arg(addr) .assert() .success() .stdout(predicate::str::contains("Ok")); - let completed: Operation = serde_json::from_slice( + let completed: IoxOperation = serde_json::from_slice( &Command::cargo_bin("influxdb_iox") .unwrap() .arg("operation") .arg("wait") - .arg(id.to_string()) + .arg(name.to_string()) .arg("--host") .arg(addr) .assert() @@ -81,9 +82,8 @@ async fn test_start_stop() { ) .expect("expected JSON output"); - assert_eq!(completed.pending_count, 0); - assert_eq!(completed.total_count, 1); - assert_eq!(completed.cancelled_count, 1); - assert_eq!(completed.status, OperationStatus::Cancelled); - assert_eq!(&completed.job, &operations[0].job) + assert_eq!(completed.metadata.pending_count, 0); + assert_eq!(completed.metadata.total_count, 1); + assert_eq!(completed.metadata.cancelled_count, 1); + assert_eq!(&completed.metadata.job, &operations[0].metadata.job) } diff --git a/tests/end_to_end_cases/persistence.rs b/tests/end_to_end_cases/persistence.rs index 46a6f8c23e..2061e71642 100644 --- a/tests/end_to_end_cases/persistence.rs +++ b/tests/end_to_end_cases/persistence.rs @@ -2,7 +2,6 @@ use itertools::Itertools; use arrow_util::assert_batches_eq; use data_types::chunk_metadata::ChunkStorage; -use influxdb_iox_client::operations; use crate::{ common::server_fixture::ServerFixture, @@ -125,11 +124,11 @@ async fn test_full_lifecycle() { .await .unwrap() .iter() - .any(|operation| match operation.metadata().job { + .any(|operation| match &operation.metadata.job { Some(Job::CompactChunks(CompactChunks { db_name: operation_db_name, .. 
- })) => operation_db_name == db_name, + })) => operation_db_name == &db_name, _ => false, }); assert!(performed_compaction); @@ -269,20 +268,16 @@ async fn create_readbuffer_chunk(fixture: &ServerFixture, db_name: &str) -> u32 assert_eq!(chunks[0].storage, ChunkStorage::OpenMutableBuffer); // Move the chunk to read buffer - let operation = management_client + let iox_operation = management_client .close_partition_chunk(db_name, table_name, partition_key, 0) .await .expect("new partition chunk"); - println!("Operation response is {:?}", operation); - let operation_id = operation.id(); - - let meta = operations::ClientOperation::try_new(operation) - .unwrap() - .metadata(); + println!("Operation response is {:?}", iox_operation); + let operation_id = iox_operation.operation.id(); // ensure we got a legit job description back - if let Some(Job::CloseChunk(close_chunk)) = meta.job { + if let Some(Job::CloseChunk(close_chunk)) = iox_operation.metadata.job { assert_eq!(close_chunk.db_name, db_name); assert_eq!(close_chunk.partition_key, partition_key); assert_eq!(close_chunk.chunk_id, 0); diff --git a/tests/end_to_end_cases/scenario.rs b/tests/end_to_end_cases/scenario.rs index 8dfd6a7ebb..03c889ee71 100644 --- a/tests/end_to_end_cases/scenario.rs +++ b/tests/end_to_end_cases/scenario.rs @@ -556,12 +556,8 @@ where } if t_start.elapsed() >= wait_time { - let operations = fixture.operations_client().list_operations().await.unwrap(); - let mut operations: Vec<_> = operations - .into_iter() - .map(|x| (x.name().parse::().unwrap(), x.metadata())) - .collect(); - operations.sort_by_key(|x| x.0); + let mut operations = fixture.operations_client().list_operations().await.unwrap(); + operations.sort_by(|a, b| a.operation.name.cmp(&b.operation.name)); panic!( "Could not find {} within {:?}.\nChunks were: {:#?}\nOperations were: {:#?}", diff --git a/tests/end_to_end_cases/sql_cli.rs b/tests/end_to_end_cases/sql_cli.rs index 6c4a128926..e670710d1b 100644 --- a/tests/end_to_end_cases/sql_cli.rs +++ b/tests/end_to_end_cases/sql_cli.rs @@ -306,17 +306,20 @@ async fn test_sql_observer_operations() { let partition_key = "cpu"; let table_name = "cpu"; // Move the chunk to read buffer - let operation = management_client + let iox_operation = management_client .close_partition_chunk(&db_name, table_name, partition_key, 0) .await .expect("new partition chunk"); - println!("Operation response is {:?}", operation); + println!("Operation response is {:?}", iox_operation); // wait for the job to be done fixture .operations_client() - .wait_operation(operation.id(), Some(std::time::Duration::from_secs(1))) + .wait_operation( + iox_operation.operation.id(), + Some(std::time::Duration::from_secs(1)), + ) .await .expect("failed to wait operation"); diff --git a/tests/end_to_end_cases/system_tables.rs b/tests/end_to_end_cases/system_tables.rs index 530911f236..594ad0dba8 100644 --- a/tests/end_to_end_cases/system_tables.rs +++ b/tests/end_to_end_cases/system_tables.rs @@ -27,12 +27,12 @@ async fn test_operations() { .expect("write succeded"); // Move the chunk to read buffer - let operation = management_client + let iox_operation = management_client .close_partition_chunk(&db_name1, table_name, partition_key, 0) .await .expect("new partition chunk"); - let operation_id = operation.id(); + let operation_id = iox_operation.operation.id(); operations_client .wait_operation(operation_id, Some(std::time::Duration::from_secs(1))) .await diff --git a/trace/src/ctx.rs b/trace/src/ctx.rs index 94d4a41f47..22a7739044 100644 --- 
a/trace/src/ctx.rs
+++ b/trace/src/ctx.rs
@@ -55,6 +55,22 @@ pub struct SpanContext {
 }
 
 impl SpanContext {
+    /// Create a new root span context, sent to `collector`. The
+    /// new span context has a random trace_id and span_id, and thus
+    /// is not connected to any existing span or trace.
+    pub fn new(collector: Arc<dyn TraceCollector>) -> Self {
+        let mut rng = rand::thread_rng();
+        let trace_id: u128 = rng.gen_range(1..u128::MAX);
+        let span_id: u64 = rng.gen_range(1..u64::MAX);
+
+        Self {
+            trace_id: TraceId(NonZeroU128::new(trace_id).unwrap()),
+            parent_span_id: None,
+            span_id: SpanId(NonZeroU64::new(span_id).unwrap()),
+            collector: Some(collector),
+        }
+    }
+
     /// Creates a new child of the Span described by this TraceContext
     pub fn child(&self, name: impl Into<Cow<'static, str>>) -> Span {
         Span {
@@ -73,3 +89,22 @@ impl SpanContext {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::RingBufferTraceCollector;
+
+    #[test]
+    fn test_new() {
+        // two newly created spans should not have duplicated trace or span ids
+        let collector = Arc::new(RingBufferTraceCollector::new(5)) as _;
+
+        let ctx1 = SpanContext::new(Arc::clone(&collector));
+        let ctx2 = SpanContext::new(collector);
+
+        assert_ne!(ctx1.trace_id, ctx2.trace_id);
+        assert_ne!(ctx1.span_id, ctx2.span_id);
+    }
+}
diff --git a/trace/src/span.rs b/trace/src/span.rs
index 55c5808a92..953cc7b99e 100644
--- a/trace/src/span.rs
+++ b/trace/src/span.rs
@@ -201,29 +201,14 @@ impl<'a> Drop for SpanRecorder {
 
 #[cfg(test)]
 mod tests {
-    use std::num::{NonZeroU128, NonZeroU64};
     use std::sync::Arc;
 
-    use crate::ctx::{SpanId, TraceId};
     use crate::{RingBufferTraceCollector, TraceCollector};
 
     use super::*;
 
     fn make_span(collector: Arc<dyn TraceCollector>) -> Span {
-        Span {
-            name: "foo".into(),
-            ctx: SpanContext {
-                trace_id: TraceId(NonZeroU128::new(23948923).unwrap()),
-                parent_span_id: None,
-                span_id: SpanId(NonZeroU64::new(3498394).unwrap()),
-                collector: Some(collector),
-            },
-            start: None,
-            end: None,
-            status: SpanStatus::Unknown,
-            metadata: Default::default(),
-            events: vec![],
-        }
+        SpanContext::new(collector).child("foo")
    }
 
    #[test]