Merge branch 'main' into cn/restore

pull/24376/head
kodiakhq[bot] 2021-09-17 17:52:56 +00:00 committed by GitHub
commit 23cc980d9e
83 changed files with 4546 additions and 986 deletions

Cargo.lock (generated)

@ -1318,13 +1318,13 @@ dependencies = [
"data_types",
"futures",
"google_types",
"influxdb_line_protocol",
"num_cpus",
"observability_deps",
"pbjson",
"pbjson_build",
"proc-macro2",
"prost",
"prost-build",
"prost-types",
"regex",
"serde",
"serde_json",
@ -1392,6 +1392,8 @@ version = "0.1.0"
dependencies = [
"bytes",
"chrono",
"pbjson",
"pbjson_build",
"prost",
"prost-build",
"serde",
@ -2896,6 +2898,39 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58"
[[package]]
name = "pbjson"
version = "0.1.0"
dependencies = [
"base64 0.13.0",
"bytes",
"serde",
]
[[package]]
name = "pbjson_build"
version = "0.1.0"
dependencies = [
"heck",
"itertools 0.10.1",
"pbjson_test",
"prost",
"prost-types",
"tempfile",
]
[[package]]
name = "pbjson_test"
version = "0.1.0"
dependencies = [
"pbjson",
"pbjson_build",
"prost",
"prost-build",
"serde",
"serde_json",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"


@ -57,6 +57,9 @@ members = [
"observability_deps",
"packers",
"panic_logging",
"pbjson",
"pbjson_build",
"pbjson_test",
"persistence_windows",
"predicate",
"query",


@ -117,45 +117,3 @@ impl Job {
}
}
}
/// The status of a running operation
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub enum OperationStatus {
/// A task associated with the operation is running
Running,
/// All tasks associated with the operation have finished successfully
Success,
/// The operation was cancelled and no associated tasks are running
Cancelled,
/// An operation error was returned
Errored,
}
/// A group of asynchronous tasks being performed by an IOx server
///
/// TODO: Temporary until prost adds JSON support
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Operation {
/// ID of the running operation
pub id: usize,
// The total number of created tasks
pub total_count: u64,
// The number of pending tasks
pub pending_count: u64,
// The number of tasks that completed successfully
pub success_count: u64,
// The number of tasks that returned an error
pub error_count: u64,
// The number of tasks that were cancelled
pub cancelled_count: u64,
// The number of tasks that did not run to completion (e.g. panic)
pub dropped_count: u64,
/// Wall time spent executing this operation
pub wall_time: std::time::Duration,
/// CPU time spent executing this operation
pub cpu_time: std::time::Duration,
/// Additional job metadata
pub job: Option<Job>,
/// The status of the running operation
pub status: OperationStatus,
}


@ -7,16 +7,12 @@ edition = "2018"
[dependencies] # In alphabetical order
bytes = { version = "1.0", features = ["serde"] }
data_types = { path = "../data_types" }
# See docs/regenerating_flatbuffers.md about updating generated code when updating the
# version of the flatbuffers crate
#flatbuffers = "2"
futures = "0.3"
google_types = { path = "../google_types" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
observability_deps = { path = "../observability_deps" }
num_cpus = "1.13.0"
observability_deps = { path = "../observability_deps" }
pbjson = { path = "../pbjson" }
prost = "0.8"
prost-types = "0.8"
regex = "1.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.67"
@ -31,3 +27,4 @@ chrono = { version = "0.4", features = ["serde"] }
proc-macro2 = "=1.0.27"
tonic-build = "0.5"
prost-build = "0.8"
pbjson_build = { path = "../pbjson_build" }


@ -64,12 +64,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
config
.compile_well_known_types()
.disable_comments(&[".google"])
// approximates jsonpb. This is still not enough to deal with the special cases like Any
// Tracking issue for proper jsonpb support in prost: https://github.com/danburkert/prost/issues/277
.type_attribute(
".",
"#[derive(serde::Serialize,serde::Deserialize)] #[serde(rename_all = \"camelCase\")]",
)
.extern_path(".google.protobuf", "::google_types::protobuf")
.bytes(&[".influxdata.iox.catalog.v1.AddParquet.metadata"]);
@ -79,5 +73,11 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
.format(true)
.compile_with_config(config, &proto_files, &[root.into()])?;
let descriptor_set = std::fs::read(descriptor_path)?;
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".influxdata", ".google.longrunning", ".google.rpc"])?;
Ok(())
}


@ -27,12 +27,11 @@ message OperationMetadata {
// The number of tasks that did not run to completion (e.g. panic)
uint64 dropped_count = 16;
reserved 6;
// What kind of job is it?
oneof job {
Dummy dummy = 5;
/* historical artifact
PersistSegment persist_segment = 6;
*/
CloseChunk close_chunk = 7;
WriteChunk write_chunk = 8;
WipePreservedCatalog wipe_preserved_catalog = 9;


@ -5,10 +5,17 @@ pub use google_types::*;
pub mod rpc {
include!(concat!(env!("OUT_DIR"), "/google.rpc.rs"));
include!(concat!(env!("OUT_DIR"), "/google.rpc.serde.rs"));
}
pub mod longrunning {
include!(concat!(env!("OUT_DIR"), "/google.longrunning.rs"));
include!(concat!(env!("OUT_DIR"), "/google.longrunning.serde.rs"));
use crate::google::{FieldViolation, FieldViolationExt};
use crate::influxdata::iox::management::v1::{OperationMetadata, OPERATION_METADATA};
use prost::{bytes::Bytes, Message};
use std::convert::TryFrom;
impl Operation {
/// Return the IOx operation `id`. This `id` can
@ -19,6 +26,46 @@ pub mod longrunning {
.parse()
.expect("Internal error: id returned from server was not an integer")
}
/// Decodes an IOx `OperationMetadata` metadata payload
pub fn iox_metadata(&self) -> Result<OperationMetadata, FieldViolation> {
let metadata = self
.metadata
.as_ref()
.ok_or_else(|| FieldViolation::required("metadata"))?;
if !crate::protobuf_type_url_eq(&metadata.type_url, OPERATION_METADATA) {
return Err(FieldViolation {
field: "metadata.type_url".to_string(),
description: "Unexpected field type".to_string(),
});
}
Message::decode(Bytes::clone(&metadata.value)).field("metadata.value")
}
}
/// Groups together an `Operation` with a decoded `OperationMetadata`
///
/// When serialized this will serialize the encoded Any field on `Operation` along
/// with its decoded representation as `OperationMetadata`
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct IoxOperation {
/// The `Operation` message returned from the API
pub operation: Operation,
/// The decoded `Operation::metadata` contained within `IoxOperation::operation`
pub metadata: OperationMetadata,
}
impl TryFrom<Operation> for IoxOperation {
type Error = FieldViolation;
fn try_from(operation: Operation) -> Result<Self, Self::Error> {
Ok(Self {
metadata: operation.iox_metadata()?,
operation,
})
}
}
}
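
A minimal sketch (not part of this change) of how a caller can combine the new IoxOperation conversion with the pbjson-generated serde impls; the crate paths are taken from this diff and serde_json is assumed to be available:

use std::convert::TryFrom;

use generated_types::google::longrunning::{IoxOperation, Operation};
use generated_types::google::FieldViolation;

// Decode the `Any` metadata payload and serialize the combined view to JSON.
fn to_json(operation: Operation) -> Result<String, FieldViolation> {
    let iox_operation = IoxOperation::try_from(operation)?;
    Ok(serde_json::to_string(&iox_operation).expect("JSON serialization should not fail"))
}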


@ -1,11 +1,5 @@
use crate::google::{longrunning, protobuf::Any, FieldViolation, FieldViolationExt};
use crate::influxdata::iox::management::v1 as management;
use crate::protobuf_type_url_eq;
use data_types::chunk_metadata::ChunkAddr;
use data_types::job::{Job, OperationStatus};
use data_types::partition_metadata::PartitionAddr;
use std::convert::TryFrom;
use std::sync::Arc;
use data_types::job::Job;
impl From<Job> for management::operation_metadata::Job {
fn from(job: Job) -> Self {
@ -61,142 +55,3 @@ impl From<Job> for management::operation_metadata::Job {
}
}
}
impl From<management::operation_metadata::Job> for Job {
fn from(value: management::operation_metadata::Job) -> Self {
use management::operation_metadata::Job;
match value {
Job::Dummy(management::Dummy { nanos, db_name }) => Self::Dummy {
nanos,
db_name: (!db_name.is_empty()).then(|| Arc::from(db_name.as_str())),
},
Job::CloseChunk(management::CloseChunk {
db_name,
partition_key,
table_name,
chunk_id,
}) => Self::CompactChunk {
chunk: ChunkAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
chunk_id,
},
},
Job::WriteChunk(management::WriteChunk {
db_name,
partition_key,
table_name,
chunk_id,
}) => Self::WriteChunk {
chunk: ChunkAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
chunk_id,
},
},
Job::WipePreservedCatalog(management::WipePreservedCatalog { db_name }) => {
Self::WipePreservedCatalog {
db_name: Arc::from(db_name.as_str()),
}
}
Job::CompactChunks(management::CompactChunks {
db_name,
partition_key,
table_name,
chunks,
}) => Self::CompactChunks {
partition: PartitionAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
},
chunks,
},
Job::PersistChunks(management::PersistChunks {
db_name,
partition_key,
table_name,
chunks,
}) => Self::PersistChunks {
partition: PartitionAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
},
chunks,
},
Job::DropChunk(management::DropChunk {
db_name,
partition_key,
table_name,
chunk_id,
}) => Self::DropChunk {
chunk: ChunkAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
chunk_id,
},
},
Job::DropPartition(management::DropPartition {
db_name,
partition_key,
table_name,
}) => Self::DropPartition {
partition: PartitionAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from(table_name.as_str()),
partition_key: Arc::from(partition_key.as_str()),
},
},
}
}
}
impl TryFrom<longrunning::Operation> for data_types::job::Operation {
type Error = FieldViolation;
fn try_from(operation: longrunning::Operation) -> Result<Self, Self::Error> {
let metadata: Any = operation
.metadata
.ok_or_else(|| FieldViolation::required("metadata"))?;
if !protobuf_type_url_eq(&metadata.type_url, management::OPERATION_METADATA) {
return Err(FieldViolation {
field: "metadata.type_url".to_string(),
description: "Unexpected field type".to_string(),
});
}
let meta: management::OperationMetadata =
prost::Message::decode(metadata.value).field("metadata.value")?;
let status = match &operation.result {
None => OperationStatus::Running,
Some(longrunning::operation::Result::Response(_)) => OperationStatus::Success,
Some(longrunning::operation::Result::Error(status)) => {
if status.code == tonic::Code::Cancelled as i32 {
OperationStatus::Cancelled
} else {
OperationStatus::Errored
}
}
};
Ok(Self {
id: operation.name.parse().field("name")?,
total_count: meta.total_count,
pending_count: meta.pending_count,
success_count: meta.success_count,
error_count: meta.error_count,
cancelled_count: meta.cancelled_count,
dropped_count: meta.dropped_count,
wall_time: std::time::Duration::from_nanos(meta.wall_nanos),
cpu_time: std::time::Duration::from_nanos(meta.cpu_nanos),
job: meta.job.map(Into::into),
status,
})
}
}


@ -10,6 +10,10 @@ pub mod influxdata {
pub mod platform {
pub mod storage {
include!(concat!(env!("OUT_DIR"), "/influxdata.platform.storage.rs"));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.platform.storage.serde.rs"
));
// Can't implement `Default` because `prost::Message` implements `Default`
impl TimestampRange {
@ -27,6 +31,10 @@ pub mod influxdata {
pub mod catalog {
pub mod v1 {
include!(concat!(env!("OUT_DIR"), "/influxdata.iox.catalog.v1.rs"));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.catalog.v1.serde.rs"
));
}
}
@ -37,12 +45,20 @@ pub mod influxdata {
"influxdata.iox.management.v1.OperationMetadata";
include!(concat!(env!("OUT_DIR"), "/influxdata.iox.management.v1.rs"));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.management.v1.serde.rs"
));
}
}
pub mod write {
pub mod v1 {
include!(concat!(env!("OUT_DIR"), "/influxdata.iox.write.v1.rs"));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.write.v1.serde.rs"
));
}
}
}
@ -50,6 +66,7 @@ pub mod influxdata {
pub mod pbdata {
pub mod v1 {
include!(concat!(env!("OUT_DIR"), "/influxdata.pbdata.v1.rs"));
include!(concat!(env!("OUT_DIR"), "/influxdata.pbdata.v1.serde.rs"));
}
}
}
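
For context, a hedged sketch of what the newly included *.serde.rs files provide: generated types now follow the proto3 JSON mapping (lowerCamelCase field names, 64-bit integers as strings, default values omitted). The field chosen here is illustrative and serde_json is assumed as a dependency:

use generated_types::influxdata::iox::management::v1::OperationMetadata;

fn main() {
    let meta = OperationMetadata {
        total_count: 3,
        ..Default::default()
    };

    // lowerCamelCase name, u64 rendered as a string, default-valued fields omitted.
    let json = serde_json::to_string(&meta).unwrap();
    assert!(json.contains(r#""totalCount":"3""#));

    // The generated Deserialize impl accepts the same representation back.
    let back: OperationMetadata = serde_json::from_str(&json).unwrap();
    assert_eq!(back, meta);
}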


@ -8,8 +8,10 @@ edition = "2018"
[dependencies] # In alphabetical order
bytes = { version = "1.0", features = ["serde"] }
chrono = "0.4"
pbjson = { path = "../pbjson" }
prost = "0.8"
serde = { version = "1.0", features = ["derive"] }
[build-dependencies] # In alphabetical order
prost-build = "0.8"
pbjson_build = { path = "../pbjson_build" }


@ -1,6 +1,7 @@
//! Compiles Protocol Buffers and FlatBuffers schema definitions into
//! native Rust types.
use std::env;
use std::path::PathBuf;
type Error = Box<dyn std::error::Error>;
@ -16,16 +17,18 @@ fn main() -> Result<()> {
println!("cargo:rerun-if-changed={}", proto_file.display());
}
let descriptor_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin");
prost_build::Config::new()
.file_descriptor_set_path(&descriptor_path)
.compile_well_known_types()
.disable_comments(&["."])
// approximates jsonpb. This is still not enough to deal with the special cases like Any.
.type_attribute(
".google",
"#[derive(serde::Serialize,serde::Deserialize)] #[serde(rename_all = \"camelCase\")]",
)
.bytes(&[".google"])
.compile_protos(&proto_files, &[root])?;
let descriptor_set = std::fs::read(descriptor_path)?;
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".google"])?;
Ok(())
}


@ -17,6 +17,7 @@ mod pb {
use std::convert::{TryFrom, TryInto};
include!(concat!(env!("OUT_DIR"), "/google.protobuf.rs"));
include!(concat!(env!("OUT_DIR"), "/google.protobuf.serde.rs"));
impl TryFrom<Duration> for std::time::Duration {
type Error = std::num::TryFromIntError;


@ -3,8 +3,8 @@ use thiserror::Error;
use self::generated_types::{management_service_client::ManagementServiceClient, *};
use crate::connection::Connection;
use ::generated_types::google::longrunning::Operation;
use crate::google::{longrunning::IoxOperation, FieldViolation};
use std::convert::TryInto;
use std::num::NonZeroU32;
@ -200,6 +200,10 @@ pub enum CreateDummyJobError {
#[error("Server returned an empty response")]
EmptyResponse,
/// Response payload was invalid
#[error("Invalid response: {0}")]
InvalidResponse(#[from] FieldViolation),
/// Client received an unexpected error from the server
#[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
ServerError(tonic::Status),
@ -284,6 +288,10 @@ pub enum ClosePartitionChunkError {
#[error("Server unavailable: {}", .0.message())]
Unavailable(tonic::Status),
/// Response payload was invalid
#[error("Invalid response: {0}")]
InvalidResponse(#[from] FieldViolation),
/// Client received an unexpected error from the server
#[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
ServerError(tonic::Status),
@ -336,6 +344,10 @@ pub enum WipePersistedCatalogError {
#[error("Server returned an empty response")]
EmptyResponse,
/// Response payload was invalid
#[error("Invalid response: {0}")]
InvalidResponse(#[from] FieldViolation),
/// Client received an unexpected error from the server
#[error("Unexpected server error: {}: {}", .0.code(), .0.message())]
ServerError(tonic::Status),
@ -842,7 +854,7 @@ impl Client {
pub async fn create_dummy_job(
&mut self,
nanos: Vec<u64>,
) -> Result<Operation, CreateDummyJobError> {
) -> Result<IoxOperation, CreateDummyJobError> {
let response = self
.inner
.create_dummy_job(CreateDummyJobRequest { nanos })
@ -852,7 +864,8 @@ impl Client {
Ok(response
.into_inner()
.operation
.ok_or(CreateDummyJobError::EmptyResponse)?)
.ok_or(CreateDummyJobError::EmptyResponse)?
.try_into()?)
}
/// Closes the specified chunk in the specified partition and
@ -865,7 +878,7 @@ impl Client {
table_name: impl Into<String> + Send,
partition_key: impl Into<String> + Send,
chunk_id: u32,
) -> Result<Operation, ClosePartitionChunkError> {
) -> Result<IoxOperation, ClosePartitionChunkError> {
let db_name = db_name.into();
let partition_key = partition_key.into();
let table_name = table_name.into();
@ -888,7 +901,8 @@ impl Client {
Ok(response
.into_inner()
.operation
.ok_or(ClosePartitionChunkError::EmptyResponse)?)
.ok_or(ClosePartitionChunkError::EmptyResponse)?
.try_into()?)
}
/// Unload chunk from read buffer but keep it in object store.
@ -929,7 +943,7 @@ impl Client {
pub async fn wipe_persisted_catalog(
&mut self,
db_name: impl Into<String> + Send,
) -> Result<Operation, WipePersistedCatalogError> {
) -> Result<IoxOperation, WipePersistedCatalogError> {
let db_name = db_name.into();
let response = self
@ -947,7 +961,8 @@ impl Client {
Ok(response
.into_inner()
.operation
.ok_or(WipePersistedCatalogError::EmptyResponse)?)
.ok_or(WipePersistedCatalogError::EmptyResponse)?
.try_into()?)
}
/// Skip replay of an uninitialized database.


@ -1,11 +1,9 @@
use thiserror::Error;
use ::generated_types::{
google::FieldViolation, influxdata::iox::management::v1 as management, protobuf_type_url_eq,
};
use self::generated_types::{operations_client::OperationsClient, *};
use crate::connection::Connection;
use std::convert::TryInto;
/// Re-export generated_types
pub mod generated_types {
pub use generated_types::google::longrunning::*;
@ -16,7 +14,7 @@ pub mod generated_types {
pub enum Error {
/// Client received an invalid response
#[error("Invalid server response: {}", .0)]
InvalidResponse(#[from] FieldViolation),
InvalidResponse(#[from] ::generated_types::google::FieldViolation),
/// Operation was not found
#[error("Operation not found: {}", .0)]
@ -66,7 +64,7 @@ impl Client {
}
/// Get information about all client operations
pub async fn list_operations(&mut self) -> Result<Vec<ClientOperation>> {
pub async fn list_operations(&mut self) -> Result<Vec<IoxOperation>> {
Ok(self
.inner
.list_operations(ListOperationsRequest::default())
@ -75,12 +73,12 @@ impl Client {
.into_inner()
.operations
.into_iter()
.map(|o| ClientOperation::try_new(o).unwrap())
.collect())
.map(TryInto::try_into)
.collect::<Result<_, _>>()?)
}
/// Get information about a specific operation
pub async fn get_operation(&mut self, id: usize) -> Result<Operation> {
pub async fn get_operation(&mut self, id: usize) -> Result<IoxOperation> {
Ok(self
.inner
.get_operation(GetOperationRequest {
@ -91,7 +89,8 @@ impl Client {
tonic::Code::NotFound => Error::NotFound(id),
_ => Error::ServerError(e),
})?
.into_inner())
.into_inner()
.try_into()?)
}
/// Cancel a given operation
@ -115,7 +114,7 @@ impl Client {
&mut self,
id: usize,
timeout: Option<std::time::Duration>,
) -> Result<Operation> {
) -> Result<IoxOperation> {
Ok(self
.inner
.wait_operation(WaitOperationRequest {
@ -127,50 +126,7 @@ impl Client {
tonic::Code::NotFound => Error::NotFound(id),
_ => Error::ServerError(e),
})?
.into_inner())
}
/// Return the Client Operation
pub async fn client_operation(&mut self, id: usize) -> Result<ClientOperation> {
let operation = self.get_operation(id).await?;
ClientOperation::try_new(operation)
}
}
/// IOx's Client Operation
#[derive(Debug, Clone)]
pub struct ClientOperation {
inner: generated_types::Operation,
}
impl ClientOperation {
/// Create a new Client Operation
pub fn try_new(operation: generated_types::Operation) -> Result<Self> {
if operation.metadata.is_some() {
let metadata = operation.metadata.clone().unwrap();
if !protobuf_type_url_eq(&metadata.type_url, management::OPERATION_METADATA) {
return Err(Error::WrongOperationMetaData);
}
} else {
return Err(Error::NotFound(0));
}
Ok(Self { inner: operation })
}
/// Return Metadata for this client operation
pub fn metadata(&self) -> management::OperationMetadata {
prost::Message::decode(self.inner.metadata.clone().unwrap().value)
.expect("failed to decode metadata")
}
/// Return name of this operation
pub fn name(&self) -> &str {
&self.inner.name
}
/// Return the inner's Operation
pub fn operation(self) -> Operation {
self.inner
.into_inner()
.try_into()?)
}
}
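
A rough usage sketch (not taken from the repository) of the reworked operations client: get_operation now yields an IoxOperation, so the decoded metadata is available without the removed ClientOperation wrapper. Error handling is simplified and the connection is assumed to have been built elsewhere:

use influxdb_iox_client::connection::Connection;
use influxdb_iox_client::operations::{Client, Error};

// `connection` would normally come from influxdb_iox_client::connection::Builder.
async fn print_operation(connection: Connection, id: usize) -> Result<(), Error> {
    let mut client = Client::new(connection);

    // The decoded OperationMetadata rides along with the raw Operation.
    let operation = client.get_operation(id).await?;
    println!(
        "name={} total_count={}",
        operation.operation.name, operation.metadata.total_count
    );
    Ok(())
}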


@ -77,6 +77,15 @@ pub enum Error {
/// Underlying `agent` module error that caused this problem
source: agent::Error,
},
/// Error that may happen when constructing an agent's writer
#[snafu(display("Could not create writer for agent `{}`, caused by:\n{}", name, source))]
CouldNotCreateAgentWriter {
/// The name of the relevant agent
name: String,
/// Underlying `write` module error that caused this problem
source: write::Error,
},
}
type Result<T, E = Error> = std::result::Result<T, E>;
@ -135,7 +144,9 @@ pub async fn generate<T: DataGenRng>(
)
.context(CouldNotCreateAgent { name: &agent_name })?;
let agent_points_writer = points_writer_builder.build_for_agent(&agent_name);
let agent_points_writer = points_writer_builder
.build_for_agent(&agent_name)
.context(CouldNotCreateAgentWriter { name: &agent_name })?;
handles.push(tokio::task::spawn(async move {
agent.generate_all(agent_points_writer, batch_size).await


@ -10,14 +10,23 @@ use std::{
};
use std::{
fs,
fs::OpenOptions,
fs::{File, OpenOptions},
io::BufWriter,
path::{Path, PathBuf},
};
use tracing::info;
/// Errors that may happen while writing points.
#[derive(Snafu, Debug)]
pub enum Error {
/// Error that may happen when writing line protocol to a file
#[snafu(display("Could open line protocol file {}: {}", filename.display(), source))]
CantOpenLineProtocolFile {
/// The location of the file we tried to open
filename: PathBuf,
/// Underlying IO error that caused this problem
source: std::io::Error,
},
/// Error that may happen when writing line protocol to a no-op sink
#[snafu(display("Could not generate line protocol: {}", source))]
CantWriteToNoOp {
@ -174,7 +183,7 @@ impl PointsWriterBuilder {
/// Create a writer out of this writer's configuration for a particular
/// agent that runs in a separate thread/task.
pub fn build_for_agent(&mut self, agent_name: &str) -> PointsWriter {
pub fn build_for_agent(&mut self, agent_name: &str) -> Result<PointsWriter> {
let inner_writer = match &mut self.config {
PointsWriterConfig::Api {
client,
@ -189,7 +198,16 @@ impl PointsWriterBuilder {
let mut filename = dir_path.clone();
filename.push(agent_name);
filename.set_extension("txt");
InnerPointsWriter::File(filename)
let file = OpenOptions::new()
.append(true)
.create(true)
.open(&filename)
.context(CantOpenLineProtocolFile { filename })?;
let file = BufWriter::new(file);
InnerPointsWriter::File { file }
}
PointsWriterConfig::NoOp { perform_write } => InnerPointsWriter::NoOp {
perform_write: *perform_write,
@ -204,7 +222,7 @@ impl PointsWriterBuilder {
PointsWriterConfig::Stdout => InnerPointsWriter::Stdout,
};
PointsWriter { inner_writer }
Ok(PointsWriter { inner_writer })
}
}
@ -228,7 +246,9 @@ enum InnerPointsWriter {
org: String,
bucket: String,
},
File(PathBuf),
File {
file: BufWriter<File>,
},
NoOp {
perform_write: bool,
},
@ -250,22 +270,12 @@ impl InnerPointsWriter {
.await
.context(CantWriteToApi)?;
}
Self::File(filename) => {
info!("Opening file {:?}", filename);
let num_points = points.len();
let file = OpenOptions::new()
.append(true)
.create(true)
.open(&filename)
.context(CantWriteToLineProtocolFile)?;
let mut file = std::io::BufWriter::new(file);
Self::File { file } => {
for point in points {
point
.write_data_point_to(&mut file)
.write_data_point_to(&mut *file)
.context(CantWriteToLineProtocolFile)?;
}
info!("Wrote {} points to {:?}", num_points, filename);
}
Self::NoOp { perform_write } => {
if *perform_write {


@ -1,7 +1,6 @@
//! Methods to cleanup the object store.
use std::{collections::HashSet, sync::Arc};
use crate::catalog::api::{CatalogParquetInfo, CatalogState, PreservedCatalog};
use futures::TryStreamExt;
use iox_object_store::{IoxObjectStore, ParquetFilePath};
use object_store::{ObjectStore, ObjectStoreApi};
@ -9,6 +8,11 @@ use observability_deps::tracing::info;
use parking_lot::Mutex;
use snafu::{ResultExt, Snafu};
use crate::catalog::{
core::PreservedCatalog,
interface::{CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError},
};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Error from read operation while cleaning object store: {}", source))]
@ -22,7 +26,7 @@ pub enum Error {
},
#[snafu(display("Error from catalog loading while cleaning object store: {}", source))]
CatalogLoadError { source: crate::catalog::api::Error },
CatalogLoadError { source: crate::catalog::core::Error },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -124,12 +128,12 @@ impl CatalogState for TracerCatalogState {
&mut self,
_iox_object_store: Arc<IoxObjectStore>,
info: CatalogParquetInfo,
) -> crate::catalog::api::Result<()> {
) -> Result<(), CatalogStateAddError> {
self.files.lock().insert(info.path);
Ok(())
}
fn remove(&mut self, _path: &ParquetFilePath) -> crate::catalog::api::Result<()> {
fn remove(&mut self, _path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> {
// Do NOT remove the file since we still need it for time travel
Ok(())
}


@ -1,10 +1,13 @@
//! Catalog preservation and transaction handling.
use crate::{
catalog::internals::{
proto_io::{load_transaction_proto, store_transaction_proto},
proto_parse,
types::{FileType, TransactionKey},
catalog::{
interface::{CatalogParquetInfo, CatalogState, CheckpointData},
internals::{
proto_io::{load_transaction_proto, store_transaction_proto},
proto_parse,
types::{FileType, TransactionKey},
},
},
metadata::IoxParquetMetaData,
};
@ -113,55 +116,9 @@ pub enum Error {
#[snafu(display("Upgrade path not implemented/supported: {}", format))]
UnsupportedUpgrade { format: String },
#[snafu(display("Parquet already exists in catalog: {:?}", path))]
ParquetFileAlreadyExists { path: ParquetFilePath },
#[snafu(display("Parquet does not exist in catalog: {:?}", path))]
ParquetFileDoesNotExist { path: ParquetFilePath },
#[snafu(display("Cannot decode parquet metadata: {}", source))]
MetadataDecodingFailed { source: crate::metadata::Error },
#[snafu(
display("Cannot extract metadata from {:?}: {}", path, source),
visibility(pub)
)]
MetadataExtractFailed {
source: crate::metadata::Error,
path: ParquetFilePath,
},
#[snafu(
display("Schema for {:?} does not work with existing schema: {}", path, source),
visibility(pub)
)]
SchemaError {
source: Box<dyn std::error::Error + Send + Sync>,
path: ParquetFilePath,
},
#[snafu(
display(
"Internal error: Using checkpoints from {:?} leads to broken replay plan: {}, catalog likely broken",
path,
source
),
visibility(pub)
)]
ReplayPlanError {
source: Box<dyn std::error::Error + Send + Sync>,
path: ParquetFilePath,
},
#[snafu(
display("Cannot create parquet chunk from {:?}: {}", path, source),
visibility(pub)
)]
ChunkCreationFailed {
source: crate::chunk::Error,
path: ParquetFilePath,
},
#[snafu(display("Catalog already exists"))]
AlreadyExists {},
@ -177,44 +134,20 @@ pub enum Error {
#[snafu(display("Cannot commit transaction: {}", source))]
CommitError { source: Box<Error> },
#[snafu(display("Cannot add parquet file during load: {}", source))]
AddError {
source: crate::catalog::interface::CatalogStateAddError,
},
#[snafu(display("Cannot remove parquet file during load: {}", source))]
RemoveError {
source: crate::catalog::interface::CatalogStateRemoveError,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Struct containing all information that a catalog received for a new parquet file.
#[derive(Debug, Clone)]
pub struct CatalogParquetInfo {
/// Path within this database.
pub path: ParquetFilePath,
/// Size of the parquet file, in bytes
pub file_size_bytes: usize,
/// Associated parquet metadata.
pub metadata: Arc<IoxParquetMetaData>,
}
/// Abstraction over how the in-memory state of the catalog works.
pub trait CatalogState {
/// Input to create a new empty instance.
///
/// See [`new_empty`](Self::new_empty) for details.
type EmptyInput: Send;
/// Create empty state w/o any known files.
fn new_empty(db_name: &str, data: Self::EmptyInput) -> Self;
/// Add parquet file to state.
fn add(
&mut self,
iox_object_store: Arc<IoxObjectStore>,
info: CatalogParquetInfo,
) -> Result<()>;
/// Remove parquet file from state.
fn remove(&mut self, path: &ParquetFilePath) -> Result<()>;
}
/// In-memory view of the preserved catalog.
pub struct PreservedCatalog {
// We need an RWLock AND a semaphore, so that readers are NOT blocked during an open
@ -605,20 +538,22 @@ impl OpenTransaction {
let metadata = Arc::new(metadata);
state.add(
Arc::clone(iox_object_store),
CatalogParquetInfo {
path,
file_size_bytes,
metadata,
},
)?;
state
.add(
Arc::clone(iox_object_store),
CatalogParquetInfo {
path,
file_size_bytes,
metadata,
},
)
.context(AddError)?;
}
proto::transaction::action::Action::RemoveParquet(a) => {
let path =
proto_parse::parse_dirs_and_filename(a.path.as_ref().context(PathRequired)?)
.context(ProtobufParseError)?;
state.remove(&path)?;
state.remove(&path).context(RemoveError)?;
}
};
Ok(())
@ -737,20 +672,6 @@ impl OpenTransaction {
}
}
/// Structure that holds all information required to create a checkpoint.
///
/// Note that while checkpoints are addressed using the same schema as we use for transactions
/// (revision counter, UUID), they contain the changes up to (and including) the transaction
/// they refer to.
#[derive(Debug)]
pub struct CheckpointData {
/// List of all Parquet files that are currently (i.e. by the current version) tracked by the
/// catalog.
///
/// If a file was once added but later removed it MUST NOT appear in the result.
pub files: HashMap<ParquetFilePath, CatalogParquetInfo>,
}
/// Handle for an open uncommitted transaction.
///
/// Dropping this object w/o calling [`commit`](Self::commit) will issue a warning.


@ -223,8 +223,7 @@ mod tests {
use crate::{
catalog::{
api::{CatalogParquetInfo, PreservedCatalog},
test_helpers::TestCatalogState,
core::PreservedCatalog, interface::CatalogParquetInfo, test_helpers::TestCatalogState,
},
test_utils::{chunk_addr, make_iox_object_store, make_metadata, TestSize},
};


@ -0,0 +1,99 @@
//! Abstract interfaces to make different users work with the preserved catalog.
use std::{collections::HashMap, sync::Arc};
use iox_object_store::{IoxObjectStore, ParquetFilePath};
use snafu::Snafu;
use crate::metadata::IoxParquetMetaData;
/// Struct containing all information that a catalog received for a new parquet file.
#[derive(Debug, Clone)]
pub struct CatalogParquetInfo {
/// Path within this database.
pub path: ParquetFilePath,
/// Size of the parquet file, in bytes
pub file_size_bytes: usize,
/// Associated parquet metadata.
pub metadata: Arc<IoxParquetMetaData>,
}
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum CatalogStateAddError {
#[snafu(display("Cannot extract metadata from {:?}: {}", path, source))]
MetadataExtractFailed {
source: crate::metadata::Error,
path: ParquetFilePath,
},
#[snafu(display("Schema for {:?} does not work with existing schema: {}", path, source))]
SchemaError {
source: Box<dyn std::error::Error + Send + Sync>,
path: ParquetFilePath,
},
#[snafu(
display(
"Internal error: Using checkpoints from {:?} leads to broken replay plan: {}, catalog likely broken",
path,
source
),
)]
ReplayPlanError {
source: Box<dyn std::error::Error + Send + Sync>,
path: ParquetFilePath,
},
#[snafu(display("Cannot create parquet chunk from {:?}: {}", path, source))]
ChunkCreationFailed {
source: crate::chunk::Error,
path: ParquetFilePath,
},
#[snafu(display("Parquet already exists in catalog: {:?}", path))]
ParquetFileAlreadyExists { path: ParquetFilePath },
}
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum CatalogStateRemoveError {
#[snafu(display("Parquet does not exist in catalog: {:?}", path))]
ParquetFileDoesNotExist { path: ParquetFilePath },
}
/// Abstraction over how the in-memory state of the catalog works.
pub trait CatalogState {
/// Input to create a new empty instance.
///
/// See [`new_empty`](Self::new_empty) for details.
type EmptyInput: Send;
/// Create empty state w/o any known files.
fn new_empty(db_name: &str, data: Self::EmptyInput) -> Self;
/// Add parquet file to state.
fn add(
&mut self,
iox_object_store: Arc<IoxObjectStore>,
info: CatalogParquetInfo,
) -> Result<(), CatalogStateAddError>;
/// Remove parquet file from state.
fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError>;
}
/// Structure that holds all information required to create a checkpoint.
///
/// Note that while checkpoints are addressed using the same schema as we use for transactions
/// (revision counter, UUID), they contain the changes up to (and including) the transaction
/// they refer to.
#[derive(Debug)]
pub struct CheckpointData {
/// List of all Parquet files that are currently (i.e. by the current version) tracked by the
/// catalog.
///
/// If a file was once added but later removed it MUST NOT appear in the result.
pub files: HashMap<ParquetFilePath, CatalogParquetInfo>,
}
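
To make the new trait boundary concrete, a minimal CatalogState implementation sketch in the spirit of the TracerCatalogState used by the cleanup module above; it assumes the parquet_file crate paths shown in this diff:

use std::{collections::HashSet, sync::Arc};

use iox_object_store::{IoxObjectStore, ParquetFilePath};
use parquet_file::catalog::interface::{
    CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError,
};

/// Toy state that only tracks which parquet files are currently referenced.
#[derive(Debug, Default)]
struct PathSet {
    files: HashSet<ParquetFilePath>,
}

impl CatalogState for PathSet {
    type EmptyInput = ();

    fn new_empty(_db_name: &str, _data: Self::EmptyInput) -> Self {
        Self::default()
    }

    fn add(
        &mut self,
        _iox_object_store: Arc<IoxObjectStore>,
        info: CatalogParquetInfo,
    ) -> Result<(), CatalogStateAddError> {
        self.files.insert(info.path);
        Ok(())
    }

    fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> {
        self.files.remove(path);
        Ok(())
    }
}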


@ -1,6 +1,7 @@
pub mod api;
pub mod cleanup;
pub mod core;
pub mod dump;
pub mod interface;
mod internals;
pub mod prune;
pub mod rebuild;


@ -8,7 +8,7 @@ use object_store::{ObjectStore, ObjectStoreApi};
use snafu::{ResultExt, Snafu};
use crate::catalog::{
api::{ProtoIOError, ProtoParseError},
core::{ProtoIOError, ProtoParseError},
internals::{proto_io::load_transaction_proto, proto_parse::parse_timestamp},
};
@ -33,7 +33,7 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Prune history of [`PreservedCatalog`](crate::catalog::api::PreservedCatalog).
/// Prune history of [`PreservedCatalog`](crate::catalog::core::PreservedCatalog).
///
/// This deletes all transactions and checkpoints that were started prior to `before`. Note that this only deletes data
that is safe to delete when time travel to `before` is allowed. For example, imagine the following transactions:
@ -133,8 +133,7 @@ fn is_checkpoint_or_zero(path: &TransactionFilePath) -> bool {
mod tests {
use crate::{
catalog::{
api::{CheckpointData, PreservedCatalog},
test_helpers::TestCatalogState,
core::PreservedCatalog, interface::CheckpointData, test_helpers::TestCatalogState,
},
test_utils::make_iox_object_store,
};


@ -7,13 +7,17 @@ use observability_deps::tracing::error;
use snafu::{ResultExt, Snafu};
use crate::{
catalog::api::{CatalogParquetInfo, CatalogState, PreservedCatalog},
catalog::{
core::PreservedCatalog,
interface::{CatalogParquetInfo, CatalogState},
},
metadata::IoxParquetMetaData,
};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Cannot create new empty catalog: {}", source))]
NewEmptyFailure { source: crate::catalog::api::Error },
NewEmptyFailure { source: crate::catalog::core::Error },
#[snafu(display("Cannot read store: {}", source))]
ReadFailure { source: object_store::Error },
@ -25,13 +29,15 @@ pub enum Error {
},
#[snafu(display("Cannot add file to transaction: {}", source))]
FileRecordFailure { source: crate::catalog::api::Error },
FileRecordFailure {
source: crate::catalog::interface::CatalogStateAddError,
},
#[snafu(display("Cannot commit transaction: {}", source))]
CommitFailure { source: crate::catalog::api::Error },
CommitFailure { source: crate::catalog::core::Error },
#[snafu(display("Cannot create checkpoint: {}", source))]
CheckpointFailure { source: crate::catalog::api::Error },
CheckpointFailure { source: crate::catalog::core::Error },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -164,7 +170,7 @@ async fn read_parquet(
mod tests {
use super::*;
use crate::{
catalog::{api::PreservedCatalog, test_helpers::TestCatalogState},
catalog::{core::PreservedCatalog, test_helpers::TestCatalogState},
metadata::IoxMetadata,
storage::{MemWriter, Storage},
test_utils::{


@ -12,7 +12,11 @@ use snafu::ResultExt;
use crate::{
catalog::{
api::{CatalogParquetInfo, CatalogState, CheckpointData, PreservedCatalog},
core::PreservedCatalog,
interface::{
CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError,
CheckpointData,
},
internals::{
proto_io::{load_transaction_proto, store_transaction_proto},
types::TransactionKey,
@ -61,8 +65,8 @@ impl TestCatalogState {
}
/// Inserts a file into this catalog state
pub fn insert(&mut self, info: CatalogParquetInfo) -> crate::catalog::api::Result<()> {
use crate::catalog::api::{Error, MetadataExtractFailed};
pub fn insert(&mut self, info: CatalogParquetInfo) -> Result<(), CatalogStateAddError> {
use crate::catalog::interface::MetadataExtractFailed;
let iox_md = info
.metadata
@ -80,7 +84,7 @@ impl TestCatalogState {
match partition.chunks.entry(iox_md.chunk_id) {
Occupied(o) => {
return Err(Error::ParquetFileAlreadyExists {
return Err(CatalogStateAddError::ParquetFileAlreadyExists {
path: o.get().path.clone(),
});
}
@ -104,13 +108,11 @@ impl CatalogState for TestCatalogState {
&mut self,
_iox_object_store: Arc<IoxObjectStore>,
info: CatalogParquetInfo,
) -> crate::catalog::api::Result<()> {
) -> Result<(), CatalogStateAddError> {
self.insert(info)
}
fn remove(&mut self, path: &ParquetFilePath) -> crate::catalog::api::Result<()> {
use crate::catalog::api::Error;
fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> {
let partitions = self
.tables
.values_mut()
@ -136,7 +138,7 @@ impl CatalogState for TestCatalogState {
}
match removed {
0 => Err(Error::ParquetFileDoesNotExist { path: path.clone() }),
0 => Err(CatalogStateRemoveError::ParquetFileDoesNotExist { path: path.clone() }),
_ => Ok(()),
}
}
@ -173,8 +175,6 @@ where
S: CatalogState + Debug + Send + Sync,
F: Fn(&S) -> CheckpointData + Send,
{
use crate::catalog::api::Error;
// empty state
let iox_object_store = make_iox_object_store().await;
let (_catalog, mut state) =
@ -317,11 +317,17 @@ where
},
)
.unwrap_err();
assert!(matches!(err, Error::ParquetFileAlreadyExists { .. }));
assert!(matches!(
err,
CatalogStateAddError::ParquetFileAlreadyExists { .. }
));
// does not exist as has a different UUID
let err = state.remove(&path).unwrap_err();
assert!(matches!(err, Error::ParquetFileDoesNotExist { .. }));
assert!(matches!(
err,
CatalogStateRemoveError::ParquetFileDoesNotExist { .. }
));
}
assert_checkpoint(&state, &f, &expected);
@ -340,7 +346,10 @@ where
},
)
.unwrap_err();
assert!(matches!(err, Error::ParquetFileAlreadyExists { .. }));
assert!(matches!(
err,
CatalogStateAddError::ParquetFileAlreadyExists { .. }
));
// this transaction will still work
let (path, metadata) =
@ -369,12 +378,18 @@ where
},
)
.unwrap_err();
assert!(matches!(err, Error::ParquetFileAlreadyExists { .. }));
assert!(matches!(
err,
CatalogStateAddError::ParquetFileAlreadyExists { .. }
));
// does not exist - as different UUID
let path = ParquetFilePath::new(&chunk_addr(7));
let err = state.remove(&path).unwrap_err();
assert!(matches!(err, Error::ParquetFileDoesNotExist { .. }));
assert!(matches!(
err,
CatalogStateRemoveError::ParquetFileDoesNotExist { .. }
));
// this still works
let (path, _) = expected.remove(&7).unwrap();
@ -382,7 +397,10 @@ where
// recently removed
let err = state.remove(&path).unwrap_err();
assert!(matches!(err, Error::ParquetFileDoesNotExist { .. }));
assert!(matches!(
err,
CatalogStateRemoveError::ParquetFileDoesNotExist { .. }
));
}
assert_checkpoint(&state, &f, &expected);
}


@ -120,7 +120,7 @@ use thrift::protocol::{TCompactInputProtocol, TCompactOutputProtocol, TOutputPro
/// For breaking changes, this will change.
///
/// **Important: When changing this structure, consider bumping the
/// [catalog transaction version](crate::catalog::api::TRANSACTION_VERSION)!**
/// [catalog transaction version](crate::catalog::core::TRANSACTION_VERSION)!**
pub const METADATA_VERSION: u32 = 6;
/// File-level metadata key to store the IOx-specific data.

pbjson/Cargo.toml (new file)

@ -0,0 +1,14 @@
[package]
name = "pbjson"
version = "0.1.0"
authors = ["Raphael Taylor-Davies <r.taylordavies@googlemail.com>"]
edition = "2018"
description = "Utilities for pbjson converion"
[dependencies]
serde = { version = "1.0", features = ["derive"] }
base64 = "0.13"
[dev-dependencies]
bytes = "1.0"

pbjson/src/lib.rs (new file)

@ -0,0 +1,82 @@
#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)]
#![warn(
missing_debug_implementations,
clippy::explicit_iter_loop,
clippy::use_self,
clippy::clone_on_ref_ptr,
clippy::future_not_send
)]
#[doc(hidden)]
pub mod private {
/// Re-export base64
pub use base64;
use serde::Deserialize;
use std::str::FromStr;
/// Used to parse a number from either a string or its raw representation
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq, Hash, Ord, Eq)]
pub struct NumberDeserialize<T>(pub T);
#[derive(Deserialize)]
#[serde(untagged)]
enum Content<'a, T> {
Str(&'a str),
Number(T),
}
impl<'de, T> serde::Deserialize<'de> for NumberDeserialize<T>
where
T: FromStr + serde::Deserialize<'de>,
<T as FromStr>::Err: std::error::Error,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let content = Content::deserialize(deserializer)?;
Ok(Self(match content {
Content::Str(v) => v.parse().map_err(serde::de::Error::custom)?,
Content::Number(v) => v,
}))
}
}
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq, Hash, Ord, Eq)]
pub struct BytesDeserialize<T>(pub T);
impl<'de, T> Deserialize<'de> for BytesDeserialize<T>
where
T: From<Vec<u8>>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
let decoded = base64::decode(s).map_err(serde::de::Error::custom)?;
Ok(Self(decoded.into()))
}
}
#[cfg(test)]
mod tests {
use super::*;
use bytes::Bytes;
use serde::de::value::{BorrowedStrDeserializer, Error};
#[test]
fn test_bytes() {
let raw = vec![2, 5, 62, 2, 5, 7, 8, 43, 5, 8, 4, 23, 5, 7, 7, 3, 2, 5, 196];
let encoded = base64::encode(&raw);
let deserializer = BorrowedStrDeserializer::<'_, Error>::new(&encoded);
let a: Bytes = BytesDeserialize::deserialize(deserializer).unwrap().0;
let b: Vec<u8> = BytesDeserialize::deserialize(deserializer).unwrap().0;
assert_eq!(raw.as_slice(), &a);
assert_eq!(raw.as_slice(), &b);
}
}
}
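
A small sketch of what NumberDeserialize is for, assuming serde and serde_json alongside the pbjson crate declared above: the generated deserializers route 64-bit fields through this helper so that either a JSON string or a bare number is accepted:

use pbjson::private::NumberDeserialize;
use serde::Deserialize;

#[derive(Deserialize)]
struct Wrapper {
    // Mirrors how generated code treats 64-bit fields per the proto3 JSON mapping.
    value: NumberDeserialize<u64>,
}

fn main() {
    let from_string: Wrapper = serde_json::from_str(r#"{"value": "42"}"#).unwrap();
    let from_number: Wrapper = serde_json::from_str(r#"{"value": 42}"#).unwrap();
    assert_eq!(from_string.value.0, 42);
    assert_eq!(from_number.value.0, 42);
}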

pbjson_build/Cargo.toml (new file)

@ -0,0 +1,17 @@
[package]
name = "pbjson_build"
version = "0.1.0"
authors = ["Raphael Taylor-Davies <r.taylordavies@googlemail.com>"]
edition = "2018"
description = "Generates Serialize and Deserialize implementations for prost message types"
[dependencies]
heck = "0.3"
prost = "0.8"
prost-types = "0.8"
itertools = "0.10"
[dev-dependencies]
tempfile = "3.1"
pbjson_test = { path = "../pbjson_test" }


@ -0,0 +1,260 @@
//! This module contains code to parse and extract the protobuf descriptor
//! format for use by the rest of the codebase
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::fmt::{Display, Formatter};
use std::io::{Error, ErrorKind, Result};
use itertools::{EitherOrBoth, Itertools};
use prost_types::{
DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto,
FileDescriptorSet, MessageOptions, OneofDescriptorProto,
};
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)]
pub struct Package(String);
impl Display for Package {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl Package {
pub fn new(s: impl Into<String>) -> Self {
let s = s.into();
assert!(
!s.starts_with('.'),
"package cannot start with \'.\', got \"{}\"",
s
);
Self(s)
}
}
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)]
pub struct TypeName(String);
impl Display for TypeName {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl TypeName {
pub fn new(s: impl Into<String>) -> Self {
let s = s.into();
assert!(
!s.contains('.'),
"type name cannot contain \'.\', got \"{}\"",
s
);
Self(s)
}
pub fn to_snake_case(&self) -> String {
use heck::SnakeCase;
self.0.to_snake_case()
}
pub fn to_camel_case(&self) -> String {
use heck::CamelCase;
self.0.to_camel_case()
}
pub fn to_shouty_snake_case(&self) -> String {
use heck::ShoutySnakeCase;
self.0.to_shouty_snake_case()
}
}
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)]
pub struct TypePath {
package: Package,
path: Vec<TypeName>,
}
impl Display for TypePath {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
self.package.fmt(f)?;
for element in &self.path {
write!(f, ".{}", element)?;
}
Ok(())
}
}
impl TypePath {
pub fn new(package: Package) -> Self {
Self {
package,
path: Default::default(),
}
}
pub fn package(&self) -> &Package {
&self.package
}
pub fn path(&self) -> &[TypeName] {
self.path.as_slice()
}
pub fn child(&self, name: TypeName) -> Self {
let path = self
.path
.iter()
.cloned()
.chain(std::iter::once(name))
.collect();
Self {
package: self.package.clone(),
path,
}
}
pub fn matches_prefix(&self, prefix: &str) -> bool {
let prefix = match prefix.strip_prefix('.') {
Some(prefix) => prefix,
None => return false,
};
if prefix.len() <= self.package.0.len() {
return self.package.0.starts_with(prefix);
}
match prefix.strip_prefix(&self.package.0) {
Some(prefix) => {
let split = prefix.split('.').skip(1);
for zipped in self.path.iter().zip_longest(split) {
match zipped {
EitherOrBoth::Both(a, b) if a.0.as_str() == b => continue,
EitherOrBoth::Left(_) => return true,
_ => return false,
}
}
true
}
None => false,
}
}
}
#[derive(Debug, Clone, Default)]
pub struct DescriptorSet {
descriptors: BTreeMap<TypePath, Descriptor>,
}
impl DescriptorSet {
pub fn new() -> Self {
Self::default()
}
pub fn register_encoded(&mut self, encoded: &[u8]) -> Result<()> {
let descriptors: FileDescriptorSet =
prost::Message::decode(encoded).map_err(|e| Error::new(ErrorKind::InvalidData, e))?;
for file in descriptors.file {
let syntax = match file.syntax.as_deref() {
None | Some("proto2") => Syntax::Proto2,
Some("proto3") => Syntax::Proto3,
Some(s) => panic!("unknown syntax: {}", s),
};
let package = Package::new(file.package.expect("expected package"));
let path = TypePath::new(package);
for descriptor in file.message_type {
self.register_message(&path, descriptor, syntax)
}
for descriptor in file.enum_type {
self.register_enum(&path, descriptor)
}
}
Ok(())
}
pub fn iter(&self) -> impl Iterator<Item = (&TypePath, &Descriptor)> {
self.descriptors.iter()
}
fn register_message(&mut self, path: &TypePath, descriptor: DescriptorProto, syntax: Syntax) {
let name = TypeName::new(descriptor.name.expect("expected name"));
let child_path = path.child(name);
for child_descriptor in descriptor.enum_type {
self.register_enum(&child_path, child_descriptor)
}
for child_descriptor in descriptor.nested_type {
self.register_message(&child_path, child_descriptor, syntax)
}
self.register_descriptor(
child_path.clone(),
Descriptor::Message(MessageDescriptor {
path: child_path,
options: descriptor.options,
one_of: descriptor.oneof_decl,
fields: descriptor.field,
syntax,
}),
);
}
fn register_enum(&mut self, path: &TypePath, descriptor: EnumDescriptorProto) {
let name = TypeName::new(descriptor.name.expect("expected name"));
self.register_descriptor(
path.child(name),
Descriptor::Enum(EnumDescriptor {
values: descriptor.value,
}),
);
}
fn register_descriptor(&mut self, path: TypePath, descriptor: Descriptor) {
match self.descriptors.entry(path) {
Entry::Occupied(o) => panic!("descriptor already registered for {}", o.key()),
Entry::Vacant(v) => v.insert(descriptor),
};
}
}
#[derive(Debug, Clone, Copy)]
pub enum Syntax {
Proto2,
Proto3,
}
#[derive(Debug, Clone)]
pub enum Descriptor {
Enum(EnumDescriptor),
Message(MessageDescriptor),
}
#[derive(Debug, Clone)]
pub struct EnumDescriptor {
pub values: Vec<EnumValueDescriptorProto>,
}
#[derive(Debug, Clone)]
pub struct MessageDescriptor {
pub path: TypePath,
pub options: Option<MessageOptions>,
pub one_of: Vec<OneofDescriptorProto>,
pub fields: Vec<FieldDescriptorProto>,
pub syntax: Syntax,
}
impl MessageDescriptor {
/// Whether this is an auto-generated type for the map field
pub fn is_map(&self) -> bool {
self.options
.as_ref()
.and_then(|options| options.map_entry)
.unwrap_or(false)
}
}
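
An illustrative sketch of the matches_prefix behaviour that decides which registered descriptors get code generated. It assumes the descriptor module is exported from pbjson_build (the crate's lib.rs is not part of this excerpt) and borrows a package name from this diff:

use pbjson_build::descriptor::{Package, TypeName, TypePath};

fn main() {
    let path = TypePath::new(Package::new("influxdata.iox.management.v1"))
        .child(TypeName::new("OperationMetadata"));

    // Prefixes are written with a leading '.', as in the Builder::build calls above.
    assert!(path.matches_prefix(".influxdata"));
    assert!(path.matches_prefix(".influxdata.iox.management.v1.OperationMetadata"));
    assert!(!path.matches_prefix(".google"));
    assert!(!path.matches_prefix("influxdata")); // missing leading dot
}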


@ -0,0 +1,26 @@
///! Contains code to escape strings to avoid collisions with reserved Rust keywords
pub fn escape_ident(mut ident: String) -> String {
// Copied from prost-build::ident
//
// Use a raw identifier if the identifier matches a Rust keyword:
// https://doc.rust-lang.org/reference/keywords.html.
match ident.as_str() {
// 2015 strict keywords.
| "as" | "break" | "const" | "continue" | "else" | "enum" | "false"
| "fn" | "for" | "if" | "impl" | "in" | "let" | "loop" | "match" | "mod" | "move" | "mut"
| "pub" | "ref" | "return" | "static" | "struct" | "trait" | "true"
| "type" | "unsafe" | "use" | "where" | "while"
// 2018 strict keywords.
| "dyn"
// 2015 reserved keywords.
| "abstract" | "become" | "box" | "do" | "final" | "macro" | "override" | "priv" | "typeof"
| "unsized" | "virtual" | "yield"
// 2018 reserved keywords.
| "async" | "await" | "try" => ident.insert_str(0, "r#"),
// the following keywords are not supported as raw identifiers and are therefore suffixed with an underscore.
"self" | "super" | "extern" | "crate" => ident += "_",
_ => (),
};
ident
}
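
A quick sketch of the escaping rules, assuming escape_ident is reachable from outside the crate (the module layout is not shown in this excerpt):

use pbjson_build::escape::escape_ident;

fn main() {
    // Strict and reserved keywords become raw identifiers...
    assert_eq!(escape_ident("type".to_string()), "r#type");
    assert_eq!(escape_ident("async".to_string()), "r#async");
    // ...except the few that cannot be raw, which get an underscore suffix.
    assert_eq!(escape_ident("self".to_string()), "self_");
    // Everything else passes through untouched.
    assert_eq!(escape_ident("job".to_string()), "job");
}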


@ -0,0 +1,129 @@
//! This module contains the actual code generation logic
use std::collections::BTreeMap;
use std::fmt::{Display, Formatter};
use std::io::{Result, Write};
use crate::descriptor::TypePath;
mod enumeration;
mod message;
pub use enumeration::generate_enum;
pub use message::generate_message;
#[derive(Debug, Clone, Copy)]
struct Indent(usize);
impl Display for Indent {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
for _ in 0..self.0 {
write!(f, " ")?;
}
Ok(())
}
}
#[derive(Debug)]
pub struct Config {
pub extern_types: BTreeMap<TypePath, String>,
}
impl Config {
fn rust_type(&self, path: &TypePath) -> String {
if let Some(t) = self.extern_types.get(path) {
return t.clone();
}
let mut ret = String::new();
let path = path.path();
assert!(!path.is_empty(), "path cannot be empty");
for i in &path[..(path.len() - 1)] {
ret.push_str(i.to_snake_case().as_str());
ret.push_str("::");
}
ret.push_str(path.last().unwrap().to_camel_case().as_str());
ret
}
fn rust_variant(&self, enumeration: &TypePath, variant: &str) -> String {
use heck::CamelCase;
assert!(
variant
.chars()
.all(|c| matches!(c, '0'..='9' | 'A'..='Z' | '_')),
"illegal variant - {}",
variant
);
// TODO: Config to disable stripping prefix
let enumeration_name = enumeration.path().last().unwrap().to_shouty_snake_case();
let variant = match variant.strip_prefix(&enumeration_name) {
Some("") => variant,
Some(stripped) => stripped,
None => variant,
};
variant.to_camel_case()
}
}
fn write_fields_array<'a, W: Write, I: Iterator<Item = &'a str>>(
writer: &mut W,
indent: usize,
variants: I,
) -> Result<()> {
writeln!(writer, "{}const FIELDS: &[&str] = &[", Indent(indent))?;
for name in variants {
writeln!(writer, "{}\"{}\",", Indent(indent + 1), name)?;
}
writeln!(writer, "{}];", Indent(indent))?;
writeln!(writer)
}
fn write_serialize_start<W: Write>(indent: usize, rust_type: &str, writer: &mut W) -> Result<()> {
writeln!(
writer,
r#"{indent}impl serde::Serialize for {rust_type} {{
{indent} #[allow(deprecated)]
{indent} fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
{indent} where
{indent} S: serde::Serializer,
{indent} {{"#,
indent = Indent(indent),
rust_type = rust_type
)
}
fn write_serialize_end<W: Write>(indent: usize, writer: &mut W) -> Result<()> {
writeln!(
writer,
r#"{indent} }}
{indent}}}"#,
indent = Indent(indent),
)
}
fn write_deserialize_start<W: Write>(indent: usize, rust_type: &str, writer: &mut W) -> Result<()> {
writeln!(
writer,
r#"{indent}impl<'de> serde::Deserialize<'de> for {rust_type} {{
{indent} #[allow(deprecated)]
{indent} fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
{indent} where
{indent} D: serde::Deserializer<'de>,
{indent} {{"#,
indent = Indent(indent),
rust_type = rust_type
)
}
fn write_deserialize_end<W: Write>(indent: usize, writer: &mut W) -> Result<()> {
writeln!(
writer,
r#"{indent} }}
{indent}}}"#,
indent = Indent(indent),
)
}
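
To make the variant-name convention concrete, a standalone restatement of the rust_variant mapping above (Config itself is internal to the generator). The enum and variant names are hypothetical inputs and heck 0.3 is assumed, as in the crate's dependencies:

use heck::{CamelCase, ShoutySnakeCase};

// Strip the SHOUTY_SNAKE_CASE enum-name prefix that proto variants conventionally
// carry, then convert the remainder to CamelCase, falling back to the full name.
fn rust_variant(enum_name: &str, proto_variant: &str) -> String {
    let prefix = enum_name.to_shouty_snake_case();
    match proto_variant.strip_prefix(&prefix) {
        Some("") | None => proto_variant.to_camel_case(),
        Some(stripped) => stripped.to_camel_case(),
    }
}

fn main() {
    assert_eq!(
        rust_variant("ChunkStorage", "CHUNK_STORAGE_OPEN_MUTABLE_BUFFER"),
        "OpenMutableBuffer"
    );
    assert_eq!(rust_variant("Syntax", "PROTO3"), "Proto3");
}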


@ -0,0 +1,138 @@
//! This module contains the code to generate Serialize and Deserialize
//! implementations for enumeration type
//!
//! An enumeration should be decode-able from the full string variant name
//! or its integer tag number, and should encode to the string representation
use super::{
write_deserialize_end, write_deserialize_start, write_serialize_end, write_serialize_start,
Config, Indent,
};
use crate::descriptor::{EnumDescriptor, TypePath};
use crate::generator::write_fields_array;
use std::io::{Result, Write};
pub fn generate_enum<W: Write>(
config: &Config,
path: &TypePath,
descriptor: &EnumDescriptor,
writer: &mut W,
) -> Result<()> {
let rust_type = config.rust_type(path);
let variants: Vec<_> = descriptor
.values
.iter()
.map(|variant| {
let variant_name = variant.name.clone().unwrap();
let rust_variant = config.rust_variant(path, &variant_name);
(variant_name, rust_variant)
})
.collect();
// Generate Serialize
write_serialize_start(0, &rust_type, writer)?;
writeln!(writer, "{}let variant = match self {{", Indent(2))?;
for (variant_name, rust_variant) in &variants {
writeln!(
writer,
"{}Self::{} => \"{}\",",
Indent(3),
rust_variant,
variant_name
)?;
}
writeln!(writer, "{}}};", Indent(2))?;
writeln!(writer, "{}serializer.serialize_str(variant)", Indent(2))?;
write_serialize_end(0, writer)?;
// Generate Deserialize
write_deserialize_start(0, &rust_type, writer)?;
write_fields_array(writer, 2, variants.iter().map(|(name, _)| name.as_str()))?;
write_visitor(writer, 2, &rust_type, &variants)?;
// Use deserialize_any to allow users to provide integers or strings
writeln!(
writer,
"{}deserializer.deserialize_any(GeneratedVisitor)",
Indent(2)
)?;
write_deserialize_end(0, writer)?;
Ok(())
}
fn write_visitor<W: Write>(
writer: &mut W,
indent: usize,
rust_type: &str,
variants: &[(String, String)],
) -> Result<()> {
// Protobuf supports deserialization of enumerations both from string and integer values
writeln!(
writer,
r#"{indent}struct GeneratedVisitor;
{indent}impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{
{indent} type Value = {rust_type};
{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{
{indent} write!(formatter, "expected one of: {{:?}}", &FIELDS)
{indent} }}
{indent} fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E>
{indent} where
{indent} E: serde::de::Error,
{indent} {{
{indent} use std::convert::TryFrom;
{indent} i32::try_from(v)
{indent} .ok()
{indent} .and_then({rust_type}::from_i32)
{indent} .ok_or_else(|| {{
{indent} serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
{indent} }})
{indent} }}
{indent} fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
{indent} where
{indent} E: serde::de::Error,
{indent} {{
{indent} use std::convert::TryFrom;
{indent} i32::try_from(v)
{indent} .ok()
{indent} .and_then({rust_type}::from_i32)
{indent} .ok_or_else(|| {{
{indent} serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
{indent} }})
{indent} }}
{indent} fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
{indent} where
{indent} E: serde::de::Error,
{indent} {{"#,
indent = Indent(indent),
rust_type = rust_type,
)?;
writeln!(writer, "{}match value {{", Indent(indent + 2))?;
for (variant_name, rust_variant) in variants {
writeln!(
writer,
"{}\"{}\" => Ok({}::{}),",
Indent(indent + 3),
variant_name,
rust_type,
rust_variant
)?;
}
writeln!(
writer,
"{indent}_ => Err(serde::de::Error::unknown_variant(value, FIELDS)),",
indent = Indent(indent + 3)
)?;
writeln!(writer, "{}}}", Indent(indent + 2))?;
writeln!(writer, "{}}}", Indent(indent + 1))?;
writeln!(writer, "{}}}", Indent(indent))
}


@ -0,0 +1,809 @@
//! This module contains the code to generate Serialize and Deserialize
//! implementations for message types
//!
//! The implementation follows the proto3 [JSON mapping][1] with the default options
//!
//! Importantly:
//! - numeric types can be decoded from either a string or number
//! - 32-bit integers and floats are encoded as numbers
//! - 64-bit integers are encoded as strings
//! - repeated fields are encoded as arrays
//! - bytes are base64 encoded (NOT CURRENTLY SUPPORTED)
//! - messages and maps are encoded as objects
//! - fields are lowerCamelCase except where overridden by the proto definition
//! - default values are not emitted on encode
//! - unrecognised fields error on decode
//!
//! Note: This will not generate code to correctly serialize/deserialize well-known-types
//! such as google.protobuf.Any, google.protobuf.Duration, etc... conversions for these
//! special-cased messages will need to be manually implemented. Once done so, however,
//! any messages containing these types will serialize/deserialize correctly
//!
//! [1]: https://developers.google.com/protocol-buffers/docs/proto3#json
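//!
//! As a rough illustration of these rules (a hypothetical message, not something this
//! module defines): a message with a singular `u64` field `total_count = 1` and a
//! repeated enum field `values` would round-trip as JSON along the lines of
//! `{"totalCount":"1","values":["VALUE_A"]}` - the 64-bit integer becomes a string,
//! the field names are lowerCamelCase, and enum variants are encoded by name.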
use std::io::{Result, Write};
use crate::message::{Field, FieldModifier, FieldType, Message, OneOf, ScalarType};
use super::{
write_deserialize_end, write_deserialize_start, write_serialize_end, write_serialize_start,
Config, Indent,
};
use crate::descriptor::TypePath;
use crate::generator::write_fields_array;
pub fn generate_message<W: Write>(
config: &Config,
message: &Message,
writer: &mut W,
) -> Result<()> {
let rust_type = config.rust_type(&message.path);
// Generate Serialize
write_serialize_start(0, &rust_type, writer)?;
write_message_serialize(config, 2, message, writer)?;
write_serialize_end(0, writer)?;
// Generate Deserialize
write_deserialize_start(0, &rust_type, writer)?;
write_deserialize_message(config, 2, message, &rust_type, writer)?;
write_deserialize_end(0, writer)?;
Ok(())
}
fn write_field_empty_predicate<W: Write>(member: &Field, writer: &mut W) -> Result<()> {
match (&member.field_type, &member.field_modifier) {
(_, FieldModifier::Required) => unreachable!(),
(_, FieldModifier::Repeated)
| (FieldType::Map(_, _), _)
| (FieldType::Scalar(ScalarType::String), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::Bytes), FieldModifier::UseDefault) => {
write!(writer, "!self.{}.is_empty()", member.rust_field_name())
}
(_, FieldModifier::Optional) | (FieldType::Message(_), _) => {
write!(writer, "self.{}.is_some()", member.rust_field_name())
}
(FieldType::Scalar(ScalarType::F64), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::F32), FieldModifier::UseDefault) => {
write!(writer, "self.{} != 0.", member.rust_field_name())
}
(FieldType::Scalar(ScalarType::Bool), FieldModifier::UseDefault) => {
write!(writer, "self.{}", member.rust_field_name())
}
(FieldType::Enum(_), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::I64), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::I32), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::U32), FieldModifier::UseDefault)
| (FieldType::Scalar(ScalarType::U64), FieldModifier::UseDefault) => {
write!(writer, "self.{} != 0", member.rust_field_name())
}
}
}
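// As a rough illustration (hypothetical field names), the generated emptiness checks look
// like `!self.tags.is_empty()` for repeated, string, bytes and map fields,
// `self.count != 0` for integer and enum fields, `self.ratio != 0.` for floats,
// `self.flag` for bools, and `self.child.is_some()` for optional and message fields.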
fn write_message_serialize<W: Write>(
config: &Config,
indent: usize,
message: &Message,
writer: &mut W,
) -> Result<()> {
write_struct_serialize_start(indent, message, writer)?;
for field in &message.fields {
write_serialize_field(config, indent, field, writer)?;
}
for one_of in &message.one_ofs {
write_serialize_one_of(indent, config, one_of, writer)?;
}
write_struct_serialize_end(indent, writer)
}
fn write_struct_serialize_start<W: Write>(
indent: usize,
message: &Message,
writer: &mut W,
) -> Result<()> {
writeln!(writer, "{}use serde::ser::SerializeStruct;", Indent(indent))?;
let required_len = message
.fields
.iter()
.filter(|member| member.field_modifier.is_required())
.count();
if required_len != message.fields.len() || !message.one_ofs.is_empty() {
writeln!(writer, "{}let mut len = {};", Indent(indent), required_len)?;
} else {
writeln!(writer, "{}let len = {};", Indent(indent), required_len)?;
}
for field in &message.fields {
if field.field_modifier.is_required() {
continue;
}
write!(writer, "{}if ", Indent(indent))?;
write_field_empty_predicate(field, writer)?;
writeln!(writer, " {{")?;
writeln!(writer, "{}len += 1;", Indent(indent + 1))?;
writeln!(writer, "{}}}", Indent(indent))?;
}
for one_of in &message.one_ofs {
writeln!(
writer,
"{}if self.{}.is_some() {{",
Indent(indent),
one_of.rust_field_name()
)?;
writeln!(writer, "{}len += 1;", Indent(indent + 1))?;
writeln!(writer, "{}}}", Indent(indent))?;
}
if !message.fields.is_empty() || !message.one_ofs.is_empty() {
writeln!(
writer,
"{}let mut struct_ser = serializer.serialize_struct(\"{}\", len)?;",
Indent(indent),
message.path
)?;
} else {
writeln!(
writer,
"{}let struct_ser = serializer.serialize_struct(\"{}\", len)?;",
Indent(indent),
message.path
)?;
}
Ok(())
}
fn write_struct_serialize_end<W: Write>(indent: usize, writer: &mut W) -> Result<()> {
writeln!(writer, "{}struct_ser.end()", Indent(indent))
}
fn write_decode_variant<W: Write>(
config: &Config,
indent: usize,
value: &str,
path: &TypePath,
writer: &mut W,
) -> Result<()> {
writeln!(writer, "{}::from_i32({})", config.rust_type(path), value)?;
write!(
writer,
"{}.ok_or_else(|| serde::ser::Error::custom(format!(\"Invalid variant {{}}\", {})))",
Indent(indent),
value
)
}
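// Sketch of the generated lookup for an assumed enum type `Value` and value expression `v`:
// `Value::from_i32(v).ok_or_else(|| serde::ser::Error::custom(format!("Invalid variant {}", v)))`
// (callers append the trailing `?;`).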
/// Depending on the type of the field, different ways of accessing the field's value
/// are needed - this allows decoupling the type serialization logic from the logic
/// that manipulates its container, e.g. Vec, Option, HashMap
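///
/// For example (hypothetical field name), a plain field `foo` is serialized with
/// `as_ref = "&self.foo"` and `as_unref = raw = "self.foo"`, whereas inside the
/// `if let Some(v)` block generated for an optional field the values become
/// `as_ref = "v"`, `as_unref = "*v"`, `raw = "v"`.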
struct Variable<'a> {
/// A reference to the field's value
as_ref: &'a str,
/// The field's value
as_unref: &'a str,
/// The field without any leading "&" or "*"
raw: &'a str,
}
fn write_serialize_variable<W: Write>(
config: &Config,
indent: usize,
field: &Field,
variable: Variable<'_>,
writer: &mut W,
) -> Result<()> {
match &field.field_type {
FieldType::Scalar(scalar) => write_serialize_scalar_variable(
indent,
*scalar,
field.field_modifier,
variable,
field.json_name(),
writer,
),
FieldType::Enum(path) => {
write!(writer, "{}let v = ", Indent(indent))?;
match field.field_modifier {
FieldModifier::Repeated => {
writeln!(writer, "{}.iter().cloned().map(|v| {{", variable.raw)?;
write!(writer, "{}", Indent(indent + 1))?;
write_decode_variant(config, indent + 2, "v", path, writer)?;
writeln!(writer)?;
write!(
writer,
"{}}}).collect::<Result<Vec<_>, _>>()",
Indent(indent + 1)
)
}
_ => write_decode_variant(config, indent + 1, variable.as_unref, path, writer),
}?;
writeln!(writer, "?;")?;
writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", &v)?;",
Indent(indent),
field.json_name()
)
}
FieldType::Map(_, value_type)
if matches!(
value_type.as_ref(),
FieldType::Scalar(ScalarType::I64)
| FieldType::Scalar(ScalarType::U64)
| FieldType::Enum(_)
) =>
{
writeln!(
writer,
"{}let v: std::collections::HashMap<_, _> = {}.iter()",
Indent(indent),
variable.raw
)?;
match value_type.as_ref() {
FieldType::Scalar(ScalarType::I64) | FieldType::Scalar(ScalarType::U64) => {
writeln!(
writer,
"{}.map(|(k, v)| (k, v.to_string())).collect();",
Indent(indent + 1)
)?;
}
FieldType::Enum(path) => {
writeln!(writer, "{}.map(|(k, v)| {{", Indent(indent + 1))?;
write!(writer, "{}let v = ", Indent(indent + 2))?;
write_decode_variant(config, indent + 3, "*v", path, writer)?;
writeln!(writer, "?;")?;
writeln!(writer, "{}Ok((k, v))", Indent(indent + 2))?;
writeln!(
writer,
"{}}}).collect::<Result<_,_>>()?;",
Indent(indent + 1)
)?;
}
_ => unreachable!(),
}
writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", &v)?;",
Indent(indent),
field.json_name()
)
}
_ => {
writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", {})?;",
Indent(indent),
field.json_name(),
variable.as_ref
)
}
}
}
fn write_serialize_scalar_variable<W: Write>(
indent: usize,
scalar: ScalarType,
field_modifier: FieldModifier,
variable: Variable<'_>,
json_name: String,
writer: &mut W,
) -> Result<()> {
let conversion = match scalar {
ScalarType::I64 | ScalarType::U64 => "ToString::to_string",
ScalarType::Bytes => "pbjson::private::base64::encode",
_ => {
return writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", {})?;",
Indent(indent),
json_name,
variable.as_ref
)
}
};
match field_modifier {
FieldModifier::Repeated => {
writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", &{}.iter().map({}).collect::<Vec<_>>())?;",
Indent(indent),
json_name,
variable.raw,
conversion
)
}
_ => {
writeln!(
writer,
"{}struct_ser.serialize_field(\"{}\", {}(&{}).as_str())?;",
Indent(indent),
json_name,
conversion,
variable.raw,
)
}
}
}
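// For illustration (hypothetical field), a singular `u64` field `total_count` generates roughly
// `struct_ser.serialize_field("totalCount", ToString::to_string(&self.total_count).as_str())?;`
// while a repeated `bytes` field maps each element through `pbjson::private::base64::encode`
// before serializing the resulting `Vec<String>`.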
fn write_serialize_field<W: Write>(
config: &Config,
indent: usize,
field: &Field,
writer: &mut W,
) -> Result<()> {
let as_ref = format!("&self.{}", field.rust_field_name());
let variable = Variable {
as_ref: as_ref.as_str(),
as_unref: &as_ref.as_str()[1..],
raw: &as_ref.as_str()[1..],
};
match &field.field_modifier {
FieldModifier::Required => {
write_serialize_variable(config, indent, field, variable, writer)?;
}
FieldModifier::Optional => {
writeln!(
writer,
"{}if let Some(v) = {}.as_ref() {{",
Indent(indent),
variable.as_unref
)?;
let variable = Variable {
as_ref: "v",
as_unref: "*v",
raw: "v",
};
write_serialize_variable(config, indent + 1, field, variable, writer)?;
writeln!(writer, "{}}}", Indent(indent))?;
}
FieldModifier::Repeated | FieldModifier::UseDefault => {
write!(writer, "{}if ", Indent(indent))?;
write_field_empty_predicate(field, writer)?;
writeln!(writer, " {{")?;
write_serialize_variable(config, indent + 1, field, variable, writer)?;
writeln!(writer, "{}}}", Indent(indent))?;
}
}
Ok(())
}
fn write_serialize_one_of<W: Write>(
indent: usize,
config: &Config,
one_of: &OneOf,
writer: &mut W,
) -> Result<()> {
writeln!(
writer,
"{}if let Some(v) = self.{}.as_ref() {{",
Indent(indent),
one_of.rust_field_name()
)?;
writeln!(writer, "{}match v {{", Indent(indent + 1))?;
for field in &one_of.fields {
writeln!(
writer,
"{}{}::{}(v) => {{",
Indent(indent + 2),
config.rust_type(&one_of.path),
field.rust_type_name(),
)?;
let variable = Variable {
as_ref: "v",
as_unref: "*v",
raw: "v",
};
write_serialize_variable(config, indent + 3, field, variable, writer)?;
writeln!(writer, "{}}}", Indent(indent + 2))?;
}
writeln!(writer, "{}}}", Indent(indent + 1),)?;
writeln!(writer, "{}}}", Indent(indent))
}
fn write_deserialize_message<W: Write>(
config: &Config,
indent: usize,
message: &Message,
rust_type: &str,
writer: &mut W,
) -> Result<()> {
write_deserialize_field_name(2, message, writer)?;
writeln!(writer, "{}struct GeneratedVisitor;", Indent(indent))?;
writeln!(
writer,
r#"{indent}impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{
{indent} type Value = {rust_type};
{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{
{indent} formatter.write_str("struct {name}")
{indent} }}
{indent} fn visit_map<V>(self, mut map: V) -> Result<{rust_type}, V::Error>
{indent} where
{indent} V: serde::de::MapAccess<'de>,
{indent} {{"#,
indent = Indent(indent),
name = message.path,
rust_type = rust_type,
)?;
for field in &message.fields {
writeln!(
writer,
"{}let mut {} = None;",
Indent(indent + 2),
field.rust_field_name(),
)?;
}
for one_of in &message.one_ofs {
writeln!(
writer,
"{}let mut {} = None;",
Indent(indent + 2),
one_of.rust_field_name(),
)?;
}
if !message.fields.is_empty() || !message.one_ofs.is_empty() {
writeln!(
writer,
"{}while let Some(k) = map.next_key()? {{",
Indent(indent + 2)
)?;
writeln!(writer, "{}match k {{", Indent(indent + 3))?;
for field in &message.fields {
write_deserialize_field(config, indent + 4, field, None, writer)?;
}
for one_of in &message.one_ofs {
for field in &one_of.fields {
write_deserialize_field(config, indent + 4, field, Some(one_of), writer)?;
}
}
writeln!(writer, "{}}}", Indent(indent + 3))?;
writeln!(writer, "{}}}", Indent(indent + 2))?;
} else {
writeln!(
writer,
"{}while map.next_key::<GeneratedField>()?.is_some() {{}}",
Indent(indent + 2)
)?;
}
writeln!(writer, "{}Ok({} {{", Indent(indent + 2), rust_type)?;
for field in &message.fields {
match field.field_modifier {
FieldModifier::Required => {
writeln!(
writer,
"{indent}{field}: {field}.ok_or_else(|| serde::de::Error::missing_field(\"{json_name}\"))?,",
indent=Indent(indent + 3),
field= field.rust_field_name(),
json_name= field.json_name()
)?;
}
FieldModifier::UseDefault | FieldModifier::Repeated => {
// Note: this currently does not hydrate optional proto2 fields with defaults
writeln!(
writer,
"{indent}{field}: {field}.unwrap_or_default(),",
indent = Indent(indent + 3),
field = field.rust_field_name()
)?;
}
_ => {
writeln!(
writer,
"{indent}{field},",
indent = Indent(indent + 3),
field = field.rust_field_name()
)?;
}
}
}
for one_of in &message.one_ofs {
writeln!(
writer,
"{indent}{field},",
indent = Indent(indent + 3),
field = one_of.rust_field_name(),
)?;
}
writeln!(writer, "{}}})", Indent(indent + 2))?;
writeln!(writer, "{}}}", Indent(indent + 1))?;
writeln!(writer, "{}}}", Indent(indent))?;
writeln!(
writer,
"{}deserializer.deserialize_struct(\"{}\", FIELDS, GeneratedVisitor)",
Indent(indent),
message.path
)
}
fn write_deserialize_field_name<W: Write>(
indent: usize,
message: &Message,
writer: &mut W,
) -> Result<()> {
let fields: Vec<_> = message
.all_fields()
.map(|field| (field.json_name(), field.rust_type_name()))
.collect();
write_fields_array(writer, indent, fields.iter().map(|(name, _)| name.as_str()))?;
write_fields_enum(writer, indent, fields.iter().map(|(_, name)| name.as_str()))?;
writeln!(
writer,
r#"{indent}impl<'de> serde::Deserialize<'de> for GeneratedField {{
{indent} fn deserialize<D>(deserializer: D) -> Result<GeneratedField, D::Error>
{indent} where
{indent} D: serde::Deserializer<'de>,
{indent} {{
{indent} struct GeneratedVisitor;
{indent} impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {{
{indent} type Value = GeneratedField;
{indent} fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{
{indent} write!(formatter, "expected one of: {{:?}}", &FIELDS)
{indent} }}
{indent} fn visit_str<E>(self, value: &str) -> Result<GeneratedField, E>
{indent} where
{indent} E: serde::de::Error,
{indent} {{"#,
indent = Indent(indent)
)?;
if !fields.is_empty() {
writeln!(writer, "{}match value {{", Indent(indent + 4))?;
for (json_name, type_name) in &fields {
writeln!(
writer,
"{}\"{}\" => Ok(GeneratedField::{}),",
Indent(indent + 5),
json_name,
type_name
)?;
}
writeln!(
writer,
"{}_ => Err(serde::de::Error::unknown_field(value, FIELDS)),",
Indent(indent + 5)
)?;
writeln!(writer, "{}}}", Indent(indent + 4))?;
} else {
writeln!(
writer,
"{}Err(serde::de::Error::unknown_field(value, FIELDS))",
Indent(indent + 4)
)?;
}
writeln!(
writer,
r#"{indent} }}
{indent} }}
{indent} deserializer.deserialize_identifier(GeneratedVisitor)
{indent} }}
{indent}}}"#,
indent = Indent(indent)
)
}
fn write_fields_enum<'a, W: Write, I: Iterator<Item = &'a str>>(
writer: &mut W,
indent: usize,
fields: I,
) -> Result<()> {
writeln!(
writer,
"{}#[allow(clippy::enum_variant_names)]",
Indent(indent)
)?;
writeln!(writer, "{}enum GeneratedField {{", Indent(indent))?;
for type_name in fields {
writeln!(writer, "{}{},", Indent(indent + 1), type_name)?;
}
writeln!(writer, "{}}}", Indent(indent))
}
fn write_deserialize_field<W: Write>(
config: &Config,
indent: usize,
field: &Field,
one_of: Option<&OneOf>,
writer: &mut W,
) -> Result<()> {
let field_name = match one_of {
Some(one_of) => one_of.rust_field_name(),
None => field.rust_field_name(),
};
let json_name = field.json_name();
writeln!(
writer,
"{}GeneratedField::{} => {{",
Indent(indent),
field.rust_type_name()
)?;
writeln!(
writer,
"{}if {}.is_some() {{",
Indent(indent + 1),
field_name
)?;
// Note: this will report a duplicate field if multiple values are specified for a oneof
writeln!(
writer,
"{}return Err(serde::de::Error::duplicate_field(\"{}\"));",
Indent(indent + 2),
json_name
)?;
writeln!(writer, "{}}}", Indent(indent + 1))?;
write!(writer, "{}{} = Some(", Indent(indent + 1), field_name)?;
if let Some(one_of) = one_of {
write!(
writer,
"{}::{}(",
config.rust_type(&one_of.path),
field.rust_type_name()
)?;
}
match &field.field_type {
FieldType::Scalar(scalar) => {
write_encode_scalar_field(indent + 1, *scalar, field.field_modifier, writer)?;
}
FieldType::Enum(path) => match field.field_modifier {
FieldModifier::Repeated => {
write!(
writer,
"map.next_value::<Vec<{}>>()?.into_iter().map(|x| x as i32).collect()",
config.rust_type(path)
)?;
}
_ => {
write!(
writer,
"map.next_value::<{}>()? as i32",
config.rust_type(path)
)?;
}
},
FieldType::Map(key, value) => {
writeln!(writer)?;
write!(
writer,
"{}map.next_value::<std::collections::HashMap<",
Indent(indent + 2),
)?;
let map_k = match key {
ScalarType::Bytes => {
// https://github.com/tokio-rs/prost/issues/531
panic!("bytes are not currently supported as map keys")
}
_ if key.is_numeric() => {
write!(
writer,
"::pbjson::private::NumberDeserialize<{}>",
key.rust_type()
)?;
"k.0"
}
_ => {
write!(writer, "_")?;
"k"
}
};
write!(writer, ", ")?;
let map_v = match value.as_ref() {
FieldType::Scalar(scalar) if scalar.is_numeric() => {
write!(
writer,
"::pbjson::private::NumberDeserialize<{}>",
scalar.rust_type()
)?;
"v.0"
}
FieldType::Scalar(ScalarType::Bytes) => {
// https://github.com/tokio-rs/prost/issues/531
panic!("bytes are not currently supported as map values")
}
FieldType::Enum(path) => {
write!(writer, "{}", config.rust_type(path))?;
"v as i32"
}
FieldType::Map(_, _) => panic!("protobuf disallows nested maps"),
_ => {
write!(writer, "_")?;
"v"
}
};
writeln!(writer, ">>()?")?;
if map_k != "k" || map_v != "v" {
writeln!(
writer,
"{}.into_iter().map(|(k,v)| ({}, {})).collect()",
Indent(indent + 3),
map_k,
map_v,
)?;
}
write!(writer, "{}", Indent(indent + 1))?;
}
_ => {
write!(writer, "map.next_value()?",)?;
}
};
if one_of.is_some() {
write!(writer, ")")?;
}
writeln!(writer, ");")?;
writeln!(writer, "{}}}", Indent(indent))
}
fn write_encode_scalar_field<W: Write>(
indent: usize,
scalar: ScalarType,
field_modifier: FieldModifier,
writer: &mut W,
) -> Result<()> {
let deserializer = match scalar {
ScalarType::Bytes => "BytesDeserialize",
_ if scalar.is_numeric() => "NumberDeserialize",
_ => return write!(writer, "map.next_value()?",),
};
writeln!(writer)?;
match field_modifier {
FieldModifier::Repeated => {
writeln!(
writer,
"{}map.next_value::<Vec<::pbjson::private::{}<_>>>()?",
Indent(indent + 1),
deserializer
)?;
writeln!(
writer,
"{}.into_iter().map(|x| x.0).collect()",
Indent(indent + 2)
)?;
}
_ => {
writeln!(
writer,
"{}map.next_value::<::pbjson::private::{}<_>>()?.0",
Indent(indent + 1),
deserializer
)?;
}
}
write!(writer, "{}", Indent(indent))
}

113
pbjson_build/src/lib.rs Normal file
View File

@ -0,0 +1,113 @@
#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)]
#![warn(
missing_debug_implementations,
clippy::explicit_iter_loop,
clippy::use_self,
clippy::clone_on_ref_ptr,
clippy::future_not_send
)]
use crate::descriptor::{Descriptor, DescriptorSet, Package};
use crate::generator::{generate_enum, generate_message, Config};
use crate::message::resolve_message;
use std::io::{BufWriter, Error, ErrorKind, Result, Write};
use std::path::PathBuf;
mod descriptor;
mod escape;
mod generator;
mod message;
#[derive(Debug, Default)]
pub struct Builder {
descriptors: descriptor::DescriptorSet,
out_dir: Option<PathBuf>,
}
impl Builder {
/// Create a new `Builder`
pub fn new() -> Self {
Self {
descriptors: DescriptorSet::new(),
out_dir: None,
}
}
/// Register an encoded `FileDescriptorSet` with this `Builder`
pub fn register_descriptors(&mut self, descriptors: &[u8]) -> Result<&mut Self> {
self.descriptors.register_encoded(descriptors)?;
Ok(self)
}
/// Generates code for all registered types where `prefixes` contains a prefix of
/// the fully-qualified path of the type
pub fn build<S: AsRef<str>>(&mut self, prefixes: &[S]) -> Result<()> {
let mut output: PathBuf = self.out_dir.clone().map(Ok).unwrap_or_else(|| {
std::env::var_os("OUT_DIR")
.ok_or_else(|| {
Error::new(ErrorKind::Other, "OUT_DIR environment variable is not set")
})
.map(Into::into)
})?;
output.push("FILENAME");
let write_factory = move |package: &Package| {
output.set_file_name(format!("{}.serde.rs", package));
let file = std::fs::OpenOptions::new()
.write(true)
.truncate(true)
.create(true)
.open(&output)?;
Ok(BufWriter::new(file))
};
let writers = generate(&self.descriptors, prefixes, write_factory)?;
for (_, mut writer) in writers {
writer.flush()?;
}
Ok(())
}
}
fn generate<S: AsRef<str>, W: Write, F: FnMut(&Package) -> Result<W>>(
descriptors: &DescriptorSet,
prefixes: &[S],
mut write_factory: F,
) -> Result<Vec<(Package, W)>> {
let config = Config {
extern_types: Default::default(),
};
let iter = descriptors.iter().filter(move |(t, _)| {
prefixes
.iter()
.any(|prefix| t.matches_prefix(prefix.as_ref()))
});
// Exploit the fact that the descriptor set is ordered to group together types from the same package
let mut ret: Vec<(Package, W)> = Vec::new();
for (type_path, descriptor) in iter {
let writer = match ret.last_mut() {
Some((package, writer)) if package == type_path.package() => writer,
_ => {
let package = type_path.package();
ret.push((package.clone(), write_factory(package)?));
&mut ret.last_mut().unwrap().1
}
};
match descriptor {
Descriptor::Enum(descriptor) => generate_enum(&config, type_path, descriptor, writer)?,
Descriptor::Message(descriptor) => {
if let Some(message) = resolve_message(descriptors, descriptor) {
generate_message(&config, &message, writer)?
}
}
}
}
Ok(ret)
}

275
pbjson_build/src/message.rs Normal file
View File

@ -0,0 +1,275 @@
//! The raw descriptor format is not very easy to work with, a fact not helped
//! by prost making almost all members of the proto2 syntax messages optional
//!
//! This module therefore extracts a slightly less obtuse representation of a
//! message that can be used by the code generation logic
use prost_types::{
field_descriptor_proto::{Label, Type},
FieldDescriptorProto,
};
use crate::descriptor::{Descriptor, DescriptorSet, MessageDescriptor, Syntax, TypeName, TypePath};
use crate::escape::escape_ident;
#[derive(Debug, Clone, Copy)]
pub enum ScalarType {
F64,
F32,
I32,
I64,
U32,
U64,
Bool,
String,
Bytes,
}
impl ScalarType {
pub fn rust_type(&self) -> &'static str {
match self {
ScalarType::F64 => "f64",
ScalarType::F32 => "f32",
ScalarType::I32 => "i32",
ScalarType::I64 => "i64",
ScalarType::U32 => "u32",
ScalarType::U64 => "u64",
ScalarType::Bool => "bool",
ScalarType::String => "String",
ScalarType::Bytes => "Vec<u8>",
}
}
pub fn is_numeric(&self) -> bool {
matches!(
self,
ScalarType::F64
| ScalarType::F32
| ScalarType::I32
| ScalarType::I64
| ScalarType::U32
| ScalarType::U64
)
}
}
#[derive(Debug, Clone)]
pub enum FieldType {
Scalar(ScalarType),
Enum(TypePath),
Message(TypePath),
Map(ScalarType, Box<FieldType>),
}
#[derive(Debug, Clone, Copy)]
pub enum FieldModifier {
Required,
Optional,
UseDefault,
Repeated,
}
impl FieldModifier {
pub fn is_required(&self) -> bool {
matches!(self, Self::Required)
}
}
#[derive(Debug, Clone)]
pub struct Field {
pub name: String,
pub json_name: Option<String>,
pub field_modifier: FieldModifier,
pub field_type: FieldType,
}
impl Field {
pub fn rust_type_name(&self) -> String {
use heck::CamelCase;
self.name.to_camel_case()
}
pub fn rust_field_name(&self) -> String {
use heck::SnakeCase;
escape_ident(self.name.to_snake_case())
}
pub fn json_name(&self) -> String {
use heck::MixedCase;
self.json_name
.clone()
.unwrap_or_else(|| self.name.to_mixed_case())
}
}
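// For example, a proto field named `optional_i64` with no explicit json_name in the
// descriptor maps to rust_field_name() == "optional_i64", rust_type_name() == "OptionalI64"
// (used for the GeneratedField variant), and json_name() == "optionalI64".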
#[derive(Debug, Clone)]
pub struct OneOf {
pub name: String,
pub path: TypePath,
pub fields: Vec<Field>,
}
impl OneOf {
pub fn rust_field_name(&self) -> String {
use heck::SnakeCase;
escape_ident(self.name.to_snake_case())
}
}
#[derive(Debug, Clone)]
pub struct Message {
pub path: TypePath,
pub fields: Vec<Field>,
pub one_ofs: Vec<OneOf>,
}
impl Message {
pub fn all_fields(&self) -> impl Iterator<Item = &Field> + '_ {
self.fields
.iter()
.chain(self.one_ofs.iter().flat_map(|one_of| one_of.fields.iter()))
}
}
/// Resolve the provided message descriptor into a slightly less obtuse representation
///
/// Returns None if the provided message is auto-generated
pub fn resolve_message(
descriptors: &DescriptorSet,
message: &MessageDescriptor,
) -> Option<Message> {
if message.is_map() {
return None;
}
let mut fields = Vec::new();
let mut one_of_fields = vec![Vec::new(); message.one_of.len()];
for field in &message.fields {
let field_type = field_type(descriptors, field);
let field_modifier = field_modifier(message, field, &field_type);
let resolved = Field {
name: field.name.clone().expect("expected field to have name"),
json_name: field.json_name.clone(),
field_type,
field_modifier,
};
// Treat synthetic one-of as normal
let proto3_optional = field.proto3_optional.unwrap_or(false);
match (field.oneof_index, proto3_optional) {
(Some(idx), false) => one_of_fields[idx as usize].push(resolved),
_ => fields.push(resolved),
}
}
let mut one_ofs = Vec::new();
for (fields, descriptor) in one_of_fields.into_iter().zip(&message.one_of) {
// Might be empty in the event of a synthetic one-of
if !fields.is_empty() {
let name = descriptor.name.clone().expect("oneof with no name");
let path = message.path.child(TypeName::new(&name));
one_ofs.push(OneOf { name, path, fields })
}
}
Some(Message {
path: message.path.clone(),
fields,
one_ofs,
})
}
fn field_modifier(
message: &MessageDescriptor,
field: &FieldDescriptorProto,
field_type: &FieldType,
) -> FieldModifier {
let label = Label::from_i32(field.label.expect("expected label")).expect("valid label");
if field.proto3_optional.unwrap_or(false) {
assert_eq!(label, Label::Optional);
return FieldModifier::Optional;
}
if field.oneof_index.is_some() {
assert_eq!(label, Label::Optional);
return FieldModifier::Optional;
}
if matches!(field_type, FieldType::Map(_, _)) {
assert_eq!(label, Label::Repeated);
return FieldModifier::Repeated;
}
match label {
Label::Optional => match message.syntax {
Syntax::Proto2 => FieldModifier::Optional,
Syntax::Proto3 => match field_type {
FieldType::Message(_) => FieldModifier::Optional,
_ => FieldModifier::UseDefault,
},
},
Label::Required => FieldModifier::Required,
Label::Repeated => FieldModifier::Repeated,
}
}
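// In short: map and `repeated` fields become Repeated; proto3 `optional` fields and oneof
// members become Optional; proto2 `required` becomes Required; remaining proto2 singular
// fields are Optional; and plain proto3 singular fields are Optional for messages and
// UseDefault for everything else.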
fn field_type(descriptors: &DescriptorSet, field: &FieldDescriptorProto) -> FieldType {
match field.type_name.as_ref() {
Some(type_name) => resolve_type(descriptors, type_name.as_str()),
None => {
let scalar =
match Type::from_i32(field.r#type.expect("expected type")).expect("valid type") {
Type::Double => ScalarType::F64,
Type::Float => ScalarType::F32,
Type::Int64 | Type::Sfixed64 | Type::Sint64 => ScalarType::I64,
Type::Int32 | Type::Sfixed32 | Type::Sint32 => ScalarType::I32,
Type::Uint64 | Type::Fixed64 => ScalarType::U64,
Type::Uint32 | Type::Fixed32 => ScalarType::U32,
Type::Bool => ScalarType::Bool,
Type::String => ScalarType::String,
Type::Bytes => ScalarType::Bytes,
Type::Message | Type::Enum | Type::Group => panic!("no type name specified"),
};
FieldType::Scalar(scalar)
}
}
}
fn resolve_type(descriptors: &DescriptorSet, type_name: &str) -> FieldType {
assert!(
type_name.starts_with('.'),
"pbjson does not currently support resolving relative types"
);
let maybe_descriptor = descriptors
.iter()
.find(|(path, _)| path.matches_prefix(type_name));
match maybe_descriptor {
Some((path, Descriptor::Enum(_))) => FieldType::Enum(path.clone()),
Some((path, Descriptor::Message(descriptor))) => match descriptor.is_map() {
true => {
assert_eq!(descriptor.fields.len(), 2, "expected map to have 2 fields");
let key = &descriptor.fields[0];
let value = &descriptor.fields[1];
assert_eq!("key", key.name());
assert_eq!("value", value.name());
let key_type = match field_type(descriptors, key) {
FieldType::Scalar(scalar) => scalar,
_ => panic!("non scalar map key"),
};
let value_type = field_type(descriptors, value);
FieldType::Map(key_type, Box::new(value_type))
}
// Note: This may actually be a group but it is non-trivial to detect this,
// they're deprecated, and pbjson doesn't need to be able to distinguish
false => FieldType::Message(path.clone()),
},
None => panic!("failed to resolve type: {}", type_name),
}
}

18
pbjson_test/Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
[package]
name = "pbjson_test"
version = "0.1.0"
authors = ["Raphael Taylor-Davies <r.taylordavies@googlemail.com>"]
edition = "2018"
description = "Test resources for pbjson converion"
[dependencies]
prost = "0.8"
pbjson = { path = "../pbjson" }
serde = { version = "1.0", features = ["derive"] }
[dev-dependencies]
serde_json = "1.0"
[build-dependencies]
prost-build = "0.8"
pbjson_build = { path = "../pbjson_build" }

33
pbjson_test/build.rs Normal file
View File

@ -0,0 +1,33 @@
//! Compiles Protocol Buffers definitions into native Rust types
use std::env;
use std::path::PathBuf;
type Error = Box<dyn std::error::Error>;
type Result<T, E = Error> = std::result::Result<T, E>;
fn main() -> Result<()> {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos");
let proto_files = vec![root.join("syntax3.proto")];
// Tell cargo to recompile if any of these proto files are changed
for proto_file in &proto_files {
println!("cargo:rerun-if-changed={}", proto_file.display());
}
let descriptor_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin");
prost_build::Config::new()
.file_descriptor_set_path(&descriptor_path)
.compile_well_known_types()
.disable_comments(&["."])
.bytes(&[".test"])
.compile_protos(&proto_files, &[root])?;
let descriptor_set = std::fs::read(descriptor_path)?;
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".test"])?;
Ok(())
}

View File

@ -0,0 +1,72 @@
syntax = "proto3";
package test.syntax3;
message Empty {}
message KitchenSink {
// Standard enum
enum Value {
VALUE_UNKNOWN = 0;
VALUE_A = 45;
VALUE_B = 63;
}
// An enumeration without prefixed variants
enum Prefix {
UNKNOWN = 0;
A = 66;
B = 20;
}
int32 i32 = 1;
optional int32 optional_i32 = 2;
repeated int32 repeated_i32 = 3;
uint32 u32 = 4;
optional uint32 optional_u32 = 5;
repeated uint32 repeated_u32 = 6;
int64 i64 = 7;
optional int64 optional_i64 = 8;
repeated int64 repeated_i64 = 9;
uint64 u64 = 10;
optional uint64 optional_u64 = 11;
repeated uint64 repeated_u64 = 12;
Value value = 13;
optional Value optional_value = 14;
repeated Value repeated_value = 15;
Prefix prefix = 16;
Empty empty = 17;
map<string, string> string_dict = 18;
map<string, Empty> message_dict = 19;
map<string, Prefix> enum_dict = 20;
map<int64, Prefix> int64_dict = 21;
map<int32, Prefix> int32_dict = 22;
map<int32, uint64> integer_dict = 23;
bool bool = 24;
optional bool optional_bool = 25;
repeated bool repeated_bool = 26;
oneof one_of {
int32 one_of_i32 = 27;
bool one_of_bool = 28;
Value one_of_value = 29;
Empty one_of_message = 30;
}
bytes bytes = 31;
optional bytes optional_bytes = 32;
repeated bytes repeated_bytes = 33;
// Bytes support is currently broken - https://github.com/tokio-rs/prost/issues/531
// map<string, bytes> bytes_dict = 34;
string string = 35;
optional string optional_string = 36;
}

241
pbjson_test/src/lib.rs Normal file
View File

@ -0,0 +1,241 @@
include!(concat!(env!("OUT_DIR"), "/test.syntax3.rs"));
include!(concat!(env!("OUT_DIR"), "/test.syntax3.serde.rs"));
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty() {
let message = Empty {};
let encoded = serde_json::to_string(&message).unwrap();
let _decoded: Empty = serde_json::from_str(&encoded).unwrap();
let err = serde_json::from_str::<Empty>("343").unwrap_err();
assert_eq!(
err.to_string().as_str(),
"invalid type: integer `343`, expected struct test.syntax3.Empty at line 1 column 3"
);
let err = serde_json::from_str::<Empty>("{\"foo\": \"bar\"}").unwrap_err();
assert_eq!(
err.to_string().as_str(),
"unknown field `foo`, there are no fields at line 1 column 6"
);
}
#[test]
fn test_kitchen_sink() {
let mut decoded: KitchenSink = serde_json::from_str("{}").unwrap();
let verify_encode = |decoded: &KitchenSink, expected: &str| {
assert_eq!(serde_json::to_string(&decoded).unwrap().as_str(), expected);
};
let verify_decode = |decoded: &KitchenSink, expected: &str| {
assert_eq!(decoded, &serde_json::from_str(expected).unwrap());
};
let verify = |decoded: &KitchenSink, expected: &str| {
verify_encode(decoded, expected);
verify_decode(decoded, expected);
};
verify(&decoded, "{}");
decoded.i32 = 24;
verify(&decoded, r#"{"i32":24}"#);
decoded.i32 = 0;
verify_decode(&decoded, "{}");
// Explicit optional fields can distinguish between no value and default value
decoded.optional_i32 = Some(2);
verify(&decoded, r#"{"optionalI32":2}"#);
decoded.optional_i32 = Some(0);
verify(&decoded, r#"{"optionalI32":0}"#);
// Can also decode from string
verify_decode(&decoded, r#"{"optionalI32":"0"}"#);
decoded.optional_i32 = None;
verify_decode(&decoded, "{}");
// 64-bit integers are encoded as strings
decoded.i64 = 123125;
verify(&decoded, r#"{"i64":"123125"}"#);
decoded.i64 = 0;
verify_decode(&decoded, "{}");
decoded.optional_i64 = Some(532);
verify(&decoded, r#"{"optionalI64":"532"}"#);
decoded.optional_i64 = Some(0);
verify(&decoded, r#"{"optionalI64":"0"}"#);
// Can also decode from non-string
verify_decode(&decoded, r#"{"optionalI64":0}"#);
decoded.optional_i64 = None;
verify_decode(&decoded, "{}");
decoded.u64 = 34346;
decoded.u32 = 567094456;
decoded.optional_u32 = Some(0);
decoded.optional_u64 = Some(3);
verify(
&decoded,
r#"{"u32":567094456,"optionalU32":0,"u64":"34346","optionalU64":"3"}"#,
);
decoded.u64 = 0;
decoded.u32 = 0;
decoded.optional_u32 = None;
decoded.optional_u64 = None;
verify_decode(&decoded, "{}");
decoded.repeated_i32 = vec![0, 23, 5, 6, 2, 34];
verify(&decoded, r#"{"repeatedI32":[0,23,5,6,2,34]}"#);
// Can also mix in some strings
verify_decode(&decoded, r#"{"repeatedI32":[0,"23",5,6,"2",34]}"#);
decoded.repeated_i32 = vec![];
verify_decode(&decoded, "{}");
decoded.repeated_u64 = vec![0, 532, 2];
verify(&decoded, r#"{"repeatedU64":["0","532","2"]}"#);
// Can also mix in some non-strings
verify_decode(&decoded, r#"{"repeatedU64":["0",532,"2"]}"#);
decoded.repeated_u64 = vec![];
verify_decode(&decoded, "{}");
// Enumerations should be encoded as strings
decoded.value = kitchen_sink::Value::A as i32;
verify(&decoded, r#"{"value":"VALUE_A"}"#);
// Can also use variant number
verify_decode(&decoded, r#"{"value":45}"#);
decoded.value = kitchen_sink::Value::Unknown as i32;
verify_decode(&decoded, "{}");
decoded.optional_value = Some(kitchen_sink::Value::Unknown as i32);
verify(&decoded, r#"{"optionalValue":"VALUE_UNKNOWN"}"#);
// Can also use variant number
verify_decode(&decoded, r#"{"optionalValue":0}"#);
decoded.optional_value = None;
verify_decode(&decoded, "{}");
decoded
.string_dict
.insert("foo".to_string(), "bar".to_string());
verify(&decoded, r#"{"stringDict":{"foo":"bar"}}"#);
decoded.string_dict = Default::default();
verify_decode(&decoded, "{}");
decoded
.int32_dict
.insert(343, kitchen_sink::Prefix::A as i32);
// Dictionary keys should always be strings
// Enum dictionary values should be encoded as strings
verify(&decoded, r#"{"int32Dict":{"343":"A"}}"#);
// Enum dictionary values can be decoded from integers
verify_decode(&decoded, r#"{"int32Dict":{"343":66}}"#);
decoded.int32_dict = Default::default();
verify_decode(&decoded, "{}");
// 64-bit dictionary values should be encoded as strings
decoded.integer_dict.insert(12, 13);
verify(&decoded, r#"{"integerDict":{"12":"13"}}"#);
// 64-bit dictionary values can be decoded from numeric types
verify_decode(&decoded, r#"{"integerDict":{"12":13}}"#);
decoded.integer_dict = Default::default();
verify_decode(&decoded, "{}");
decoded.one_of = Some(kitchen_sink::OneOf::OneOfI32(0));
verify(&decoded, r#"{"oneOfI32":0}"#);
// Can also specify string
verify_decode(&decoded, r#"{"oneOfI32":"0"}"#);
decoded.one_of = Some(kitchen_sink::OneOf::OneOfI32(12));
verify(&decoded, r#"{"oneOfI32":12}"#);
decoded.one_of = Some(kitchen_sink::OneOf::OneOfBool(false));
verify(&decoded, r#"{"oneOfBool":false}"#);
decoded.one_of = Some(kitchen_sink::OneOf::OneOfBool(true));
verify(&decoded, r#"{"oneOfBool":true}"#);
decoded.one_of = Some(kitchen_sink::OneOf::OneOfValue(
kitchen_sink::Value::B as i32,
));
verify(&decoded, r#"{"oneOfValue":"VALUE_B"}"#);
// Can also specify enum variant
verify_decode(&decoded, r#"{"oneOfValue":63}"#);
decoded.one_of = None;
verify_decode(&decoded, "{}");
decoded.repeated_value = vec![
kitchen_sink::Value::B as i32,
kitchen_sink::Value::B as i32,
kitchen_sink::Value::A as i32,
];
verify(
&decoded,
r#"{"repeatedValue":["VALUE_B","VALUE_B","VALUE_A"]}"#,
);
verify_decode(&decoded, r#"{"repeatedValue":[63,"VALUE_B","VALUE_A"]}"#);
decoded.repeated_value = Default::default();
verify_decode(&decoded, "{}");
decoded.bytes = prost::bytes::Bytes::from_static(b"kjkjkj");
verify(&decoded, r#"{"bytes":"a2pramtq"}"#);
decoded.bytes = Default::default();
verify_decode(&decoded, "{}");
decoded.optional_bytes = Some(prost::bytes::Bytes::from_static(b"kjkjkj"));
verify(&decoded, r#"{"optionalBytes":"a2pramtq"}"#);
decoded.optional_bytes = Some(Default::default());
verify(&decoded, r#"{"optionalBytes":""}"#);
decoded.optional_bytes = None;
verify_decode(&decoded, "{}");
decoded.repeated_bytes = vec![
prost::bytes::Bytes::from_static(b"sdfsd"),
prost::bytes::Bytes::from_static(b"fghfg"),
];
verify(&decoded, r#"{"repeatedBytes":["c2Rmc2Q=","ZmdoZmc="]}"#);
decoded.repeated_bytes = Default::default();
verify_decode(&decoded, "{}");
// decoded.bytes_dict.insert(
// "test".to_string(),
// prost::bytes::Bytes::from_static(b"asdf"),
// );
// verify(&decoded, r#"{"bytesDict":{"test":"YXNkZgo="}}"#);
//
// decoded.bytes_dict = Default::default();
// verify_decode(&decoded, "{}");
decoded.string = "test".to_string();
verify(&decoded, r#"{"string":"test"}"#);
decoded.string = Default::default();
verify_decode(&decoded, "{}");
decoded.optional_string = Some(String::new());
verify(&decoded, r#"{"optionalString":""}"#);
}
}

View File

@ -161,6 +161,51 @@ impl Predicate {
pub fn is_empty(&self) -> bool {
self == &EMPTY_PREDICATE
}
/// Add each range [start, stop] of the delete_predicates into the predicate in
/// the form "time < start OR time > stop" to eliminate that range from the query
pub fn add_delete_ranges<S>(&mut self, delete_predicates: &[S])
where
S: AsRef<Self>,
{
for pred in delete_predicates {
let pred = pred.as_ref();
if let Some(range) = pred.range {
let expr = col(TIME_COLUMN_NAME)
.lt(lit(range.start))
.or(col(TIME_COLUMN_NAME).gt(lit(range.end)));
self.exprs.push(expr);
}
}
}
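// Sketch with hypothetical values: a delete predicate whose range is [10, 20] pushes the
// expression `time < 10 OR time > 20` onto `self.exprs`, excluding the deleted time range
// from the query.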
/// Add a list of disjunctive negated expressions.
/// Example: given the following two deletes
/// . Delete_1: WHERE city != "Boston" AND temp = 70
/// . Delete_2: WHERE state = "NY" AND route != "I90"
/// The negated list will be "NOT(Delete_1)", "NOT(Delete_2)", which means
/// NOT(city != "Boston" AND temp = 70), NOT(state = "NY" AND route != "I90"), which means
/// [NOT(city != "Boston") OR NOT(temp = 70)], [NOT(state = "NY") OR NOT(route != "I90")]
pub fn add_delete_exprs<S>(&mut self, delete_predicates: &[S])
where
S: AsRef<Self>,
{
for pred in delete_predicates {
let pred = pred.as_ref();
let mut expr: Option<Expr> = None;
for exp in &pred.exprs {
match expr {
None => expr = Some(exp.clone().not()),
Some(e) => expr = Some(e.or(exp.clone().not())),
}
}
if let Some(e) = expr {
self.exprs.push(e);
}
}
}
}
impl fmt::Display for Predicate {

View File

@ -11,18 +11,19 @@ use datafusion::{
logical_plan::{LogicalPlan, UserDefinedLogicalNode},
physical_plan::{
coalesce_partitions::CoalescePartitionsExec,
collect, displayable,
displayable,
planner::{DefaultPhysicalPlanner, ExtensionPlanner},
ExecutionPlan, PhysicalPlanner, SendableRecordBatchStream,
},
prelude::*,
};
use futures::TryStreamExt;
use observability_deps::tracing::{debug, trace};
use trace::{ctx::SpanContext, span::SpanRecorder};
use crate::exec::{
fieldlist::{FieldList, IntoFieldList},
query_tracing::send_metrics_to_tracing,
query_tracing::TracedStream,
schema_pivot::{SchemaPivotExec, SchemaPivotNode},
seriesset::{SeriesSetConverter, SeriesSetItem},
split::StreamSplitExec,
@ -272,45 +273,63 @@ impl IOxExecutionContext {
/// Executes the logical plan using DataFusion on a separate
/// thread pool and produces RecordBatches
pub async fn collect(&self, physical_plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
let ctx = self.child_ctx("collect");
debug!(
"Running plan, physical:\n{}",
displayable(physical_plan.as_ref()).indent()
);
let ctx = self.child_ctx("collect");
let stream = ctx.execute_stream(physical_plan).await?;
let res = ctx.run(collect(Arc::clone(&physical_plan))).await;
// send metrics to tracing, even on error
ctx.save_metrics(physical_plan);
res
ctx.run(
stream
.err_into() // convert to DataFusionError
.try_collect(),
)
.await
}
/// Executes the physical plan and produces a RecordBatchStream to stream
/// over the result that iterates over the results.
pub async fn execute(
/// Executes the physical plan and produces a
/// `SendableRecordBatchStream` to stream over the results.
/// The creation of the stream is performed in a separate thread pool.
pub async fn execute_stream(
&self,
physical_plan: Arc<dyn ExecutionPlan>,
) -> Result<SendableRecordBatchStream> {
match physical_plan.output_partitioning().partition_count() {
0 => unreachable!(),
1 => self.execute_partition(physical_plan, 0).await,
1 => self.execute_stream_partitioned(physical_plan, 0).await,
_ => {
// Merge into a single partition
self.execute_partition(Arc::new(CoalescePartitionsExec::new(physical_plan)), 0)
.await
self.execute_stream_partitioned(
Arc::new(CoalescePartitionsExec::new(physical_plan)),
0,
)
.await
}
}
}
/// Executes a single partition of a physical plan and produces a RecordBatchStream to stream
/// over the result that iterates over the results.
pub async fn execute_partition(
/// Executes a single partition of a physical plan and produces a
/// `SendableRecordBatchStream` to stream over the results.
/// The creation of the stream is performed in a separate thread pool.
pub async fn execute_stream_partitioned(
&self,
physical_plan: Arc<dyn ExecutionPlan>,
partition: usize,
) -> Result<SendableRecordBatchStream> {
self.run(async move { physical_plan.execute(partition).await })
.await
let span = self
.recorder
.span()
.map(|span| span.child("execute_stream_partitioned"));
self.run(async move {
let stream = physical_plan.execute(partition).await?;
let stream = TracedStream::new(stream, span, physical_plan);
Ok(Box::pin(stream) as _)
})
.await
}
/// Executes the SeriesSetPlans on the query executor, in
@ -349,7 +368,7 @@ impl IOxExecutionContext {
let physical_plan = ctx.prepare_plan(&plan)?;
let it = ctx.execute(physical_plan).await?;
let it = ctx.execute_stream(physical_plan).await?;
SeriesSetConverter::default()
.convert(
@ -486,19 +505,4 @@ impl IOxExecutionContext {
recorder: self.recorder.child(name),
}
}
/// Saves any DataFusion metrics that are currently present in
/// `physical_plan` to the span recorder so they show up in
/// distributed traces (e.g. Jaeger)
///
/// This function should be invoked after `physical_plan` has
/// fully `collect`ed, meaning that `PhysicalPlan::execute()` has
/// been invoked and the resulting streams have been completely
/// consumed. Calling `save_metrics` metrics prior to this point
/// may result in saving incomplete information.
pub fn save_metrics(&self, physical_plan: Arc<dyn ExecutionPlan>) {
if let Some(span) = self.recorder.span() {
send_metrics_to_tracing(span, physical_plan.as_ref())
}
}
}

View File

@ -1,15 +1,67 @@
//! This module contains the code to map DataFusion metrics to `Span`s
//! for use in distributed tracing (e.g. Jaeger)
use std::{borrow::Cow, fmt};
use std::{borrow::Cow, fmt, sync::Arc};
use arrow::record_batch::RecordBatch;
use chrono::{DateTime, Utc};
use datafusion::physical_plan::{
metrics::{MetricValue, MetricsSet},
DisplayFormatType, ExecutionPlan,
DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream,
};
use futures::StreamExt;
use observability_deps::tracing::debug;
use trace::span::Span;
use trace::span::{Span, SpanRecorder};
/// Stream wrapper that records DataFusion `MetricSets` into IOx
/// [`Span`]s when it is dropped.
pub(crate) struct TracedStream {
inner: SendableRecordBatchStream,
span_recorder: SpanRecorder,
physical_plan: Arc<dyn ExecutionPlan>,
}
impl TracedStream {
/// Return a stream that records DataFusion `MetricSets` from
/// `physical_plan` into `span` when dropped.
pub(crate) fn new(
inner: SendableRecordBatchStream,
span: Option<trace::span::Span>,
physical_plan: Arc<dyn ExecutionPlan>,
) -> Self {
Self {
inner,
span_recorder: SpanRecorder::new(span),
physical_plan,
}
}
}
impl RecordBatchStream for TracedStream {
fn schema(&self) -> arrow::datatypes::SchemaRef {
self.inner.schema()
}
}
impl futures::Stream for TracedStream {
type Item = arrow::error::Result<RecordBatch>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> std::task::Poll<Option<Self::Item>> {
self.inner.poll_next_unpin(cx)
}
}
impl Drop for TracedStream {
fn drop(&mut self) {
if let Some(span) = self.span_recorder.span() {
let default_end_time = Utc::now();
send_metrics_to_tracing(default_end_time, span, self.physical_plan.as_ref());
}
}
}
/// This function translates data in DataFusion `MetricSets` into IOx
/// [`Span`]s. It records a snapshot of the current state of the
@ -26,15 +78,7 @@ use trace::span::Span;
/// 1. If the ExecutionPlan had no metrics
/// 2. The total number of rows produced by the ExecutionPlan (if available)
/// 3. The elapsed compute time taken by the ExecutionPlan
pub(crate) fn send_metrics_to_tracing(parent_span: &Span, physical_plan: &dyn ExecutionPlan) {
// The parent span may be open, but since the physical_plan is
// assumed to be fully collected, using `now()` is a conservative
// estimate of the end time
let default_end_time = Utc::now();
send_metrics_to_tracing_inner(default_end_time, parent_span, physical_plan)
}
fn send_metrics_to_tracing_inner(
fn send_metrics_to_tracing(
default_end_time: DateTime<Utc>,
parent_span: &Span,
physical_plan: &dyn ExecutionPlan,
@ -101,7 +145,7 @@ fn send_metrics_to_tracing_inner(
// recurse
for child in physical_plan.children() {
send_metrics_to_tracing_inner(span_end, &span, child.as_ref())
send_metrics_to_tracing(span_end, &span, child.as_ref())
}
span.export()
@ -173,17 +217,9 @@ mod tests {
Metric,
};
use std::{
num::{NonZeroU128, NonZeroU64},
sync::Arc,
time::Duration,
};
use std::{sync::Arc, time::Duration};
use trace::{
ctx::{SpanContext, SpanId, TraceId},
span::{MetaValue, SpanStatus},
RingBufferTraceCollector,
};
use trace::{ctx::SpanContext, span::MetaValue, RingBufferTraceCollector};
use super::*;
@ -193,7 +229,7 @@ mod tests {
let exec = TestExec::new(name, Default::default());
let traces = TraceBuilder::new();
send_metrics_to_tracing(&traces.make_span(), &exec);
send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec);
let spans = traces.spans();
assert_eq!(spans.len(), 1);
@ -224,7 +260,7 @@ mod tests {
exec.new_child("child4", make_time_metricset(None, None));
let traces = TraceBuilder::new();
send_metrics_to_tracing_inner(ts5, &traces.make_span(), &exec);
send_metrics_to_tracing(ts5, &traces.make_span(), &exec);
let spans = traces.spans();
println!("Spans: \n\n{:#?}", spans);
@ -250,7 +286,7 @@ mod tests {
exec.metrics = None;
let traces = TraceBuilder::new();
send_metrics_to_tracing(&traces.make_span(), &exec);
send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec);
let spans = traces.spans();
assert_eq!(spans.len(), 1);
@ -274,7 +310,7 @@ mod tests {
add_elapsed_compute(exec.metrics_mut(), 2000, 2);
let traces = TraceBuilder::new();
send_metrics_to_tracing(&traces.make_span(), &exec);
send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec);
// aggregated metrics should be reported
let spans = traces.spans();
@ -358,23 +394,7 @@ mod tests {
// create a new span connected to the collector
fn make_span(&self) -> Span {
let collector = Arc::clone(&self.collector);
// lifted from make_span in trace/src/span.rs
Span {
name: "foo".into(),
ctx: SpanContext {
trace_id: TraceId(NonZeroU128::new(23948923).unwrap()),
parent_span_id: None,
span_id: SpanId(NonZeroU64::new(3498394).unwrap()),
collector: Some(collector),
},
start: None,
end: None,
status: SpanStatus::Unknown,
metadata: Default::default(),
events: vec![],
}
SpanContext::new(Arc::clone(&self.collector) as _).child("foo")
}
/// return all collected spans

View File

@ -524,6 +524,8 @@ impl InfluxRpcPlanner {
/// The data is sorted on (tag_col1, tag_col2, ...) so that all
/// rows for a particular series (groups where all tags are the
/// same) occur together in the plan
// NGA todo: may need to add delete predicate here to eliminate deleted data at read time
// https://github.com/influxdata/influxdb_iox/issues/2548
pub fn read_filter<D>(&self, database: &D, predicate: Predicate) -> Result<SeriesSetPlans>
where
D: QueryDatabase + 'static,

View File

@ -18,7 +18,7 @@ use internal_types::{
schema::{sort::SortKey, Schema, TIME_COLUMN_NAME},
selection::Selection,
};
use observability_deps::tracing::trace;
use observability_deps::tracing::{debug, trace};
use predicate::predicate::{Predicate, PredicateMatch};
use hashbrown::HashMap;
@ -46,7 +46,7 @@ pub trait QueryChunkMeta: Sized {
fn schema(&self) -> Arc<Schema>;
// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> Arc<Vec<Predicate>>;
fn delete_predicates(&self) -> &[Arc<Predicate>];
}
/// A `Database` is the main trait implemented by the IOx subsystems
@ -137,6 +137,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync {
&self,
predicate: &Predicate,
selection: Selection<'_>,
delete_predicates: &[Arc<Predicate>],
) -> Result<SendableRecordBatchStream, Self::Error>;
/// Returns true if data of this chunk is sorted
@ -165,8 +166,10 @@ where
self.as_ref().schema()
}
fn delete_predicates(&self) -> Arc<Vec<Predicate>> {
self.as_ref().delete_predicates()
fn delete_predicates(&self) -> &[Arc<Predicate>] {
let pred = self.as_ref().delete_predicates();
debug!(?pred, "Delete predicate in QueryChunkMeta");
pred
}
}

View File

@ -116,14 +116,18 @@ impl<C: QueryChunk + 'static> ExecutionPlan for IOxReadFilterNode<C> {
let selection_cols = restrict_selection(selection_cols, &chunk_table_schema);
let selection = Selection::Some(&selection_cols);
let stream = chunk.read_filter(&self.predicate, selection).map_err(|e| {
DataFusionError::Execution(format!(
"Error creating scan for table {} chunk {}: {}",
self.table_name,
chunk.id(),
e
))
})?;
let del_preds = chunk.delete_predicates();
let stream = chunk
.read_filter(&self.predicate, selection, del_preds)
.map_err(|e| {
DataFusionError::Execution(format!(
"Error creating scan for table {} chunk {}: {}",
self.table_name,
chunk.id(),
e
))
})?;
// all CPU time is now done, pass in baseline metrics to adapter
timer.done();

View File

@ -25,6 +25,7 @@ use internal_types::{
schema::{builder::SchemaBuilder, merge::SchemaMerger, InfluxColumnType, Schema},
selection::Selection,
};
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use snafu::Snafu;
use std::num::NonZeroU64;
@ -173,6 +174,9 @@ pub struct TestChunk {
/// Return value for apply_predicate, if desired
predicate_match: Option<PredicateMatch>,
/// Copy of delete predicates passed
delete_predicates: Vec<Arc<Predicate>>,
/// Order of this chunk relative to other overlapping chunks.
order: ChunkOrder,
}
@ -248,6 +252,7 @@ impl TestChunk {
table_data: Default::default(),
saved_error: Default::default(),
predicate_match: Default::default(),
delete_predicates: Default::default(),
order: ChunkOrder::new(0),
}
}
@ -819,6 +824,7 @@ impl QueryChunk for TestChunk {
&self,
predicate: &Predicate,
_selection: Selection<'_>,
_delete_predicates: &[Arc<Predicate>],
) -> Result<SendableRecordBatchStream, Self::Error> {
self.check_error()?;
@ -908,12 +914,11 @@ impl QueryChunkMeta for TestChunk {
}
// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> Arc<Vec<Predicate>> {
// Since this is the test chunk and its focus is not (yet) on
// deletion, return an empty delete predicate now which means nothing is deleted
// from this test chunk
let pred: Vec<Predicate> = vec![];
Arc::new(pred)
fn delete_predicates(&self) -> &[Arc<Predicate>] {
let pred = &self.delete_predicates;
debug!(?pred, "Delete predicate in Test Chunk");
pred
}
}
@ -923,8 +928,9 @@ pub async fn raw_data(chunks: &[Arc<TestChunk>]) -> Vec<RecordBatch> {
for c in chunks {
let pred = Predicate::default();
let selection = Selection::All;
let delete_predicates: Vec<Arc<Predicate>> = vec![];
let mut stream = c
.read_filter(&pred, selection)
.read_filter(&pred, selection, &delete_predicates)
.expect("Error in read_filter");
while let Some(b) = stream.next().await {
let b = b.expect("Error in stream");

View File

@ -10,19 +10,19 @@ description = "Tests of the query engine against different database configuratio
[dependencies]
async-trait = "0.1"
data_types = { path = "../data_types" }
datafusion = { path = "../datafusion" }
once_cell = { version = "1.4.0", features = ["parking_lot"] }
predicate = { path = "../predicate" }
query = { path = "../query" }
server = { path = "../server" }
[dev-dependencies]
arrow = { version = "5.0", features = ["prettyprint"] }
arrow_util = { path = "../arrow_util" }
datafusion = { path = "../datafusion" }
data_types = { path = "../data_types" }
internal_types = { path = "../internal_types" }
metric = { path = "../metric" }
object_store = { path = "../object_store" }
predicate = { path = "../predicate" }
snafu = "0.6.3"
tempfile = "3.1.0"
test_helpers = { path = "../test_helpers" }

View File

@ -1,12 +1,13 @@
//! This module contains testing scenarios for Db
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use datafusion::logical_plan::{col, lit};
use once_cell::sync::OnceCell;
#[allow(unused_imports, dead_code, unused_macros)]
use predicate::predicate::{Predicate, PredicateBuilder};
use query::QueryChunk;
use async_trait::async_trait;
@ -1198,3 +1199,650 @@ impl DbSetup for ChunkOrder {
vec![scenario]
}
}
#[derive(Debug)]
/// Setup for delete query test with one table and one chunk moved from MUB to RUB to OS
pub struct DeleteFromMubOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteFromMubOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the delete predicate is added in MUB and
// is carried along as the chunk moves
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"];
// delete predicate
let i: f64 = 1.0;
let expr = col("bar").eq(lit(i));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 15)
.add_expr(expr)
.build();
// delete happens when data in MUB
let _scenario_mub = make_delete_mub(lp_lines.clone(), pred.clone()).await;
// delete happens when data in MUB then moved to RUB
let scenario_rub =
make_delete_mub_to_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens when data in MUB then moved to RUB and then persisted
let scenario_rub_os = make_delete_mub_to_rub_and_os(
lp_lines.clone(),
pred.clone(),
table_name,
partition_key,
)
.await;
// delete happens when data in MUB then moved to RUB, then persisted, and then RUB is unloaded
let scenario_os =
make_delete_mub_to_os(lp_lines.clone(), pred, table_name, partition_key).await;
// return scenarios to run queries
// NGA todo: add scenario_mub to this list once deleted data is removed in the scan;
// right now MUB does not push the predicate down so the result is not correct yet
vec![scenario_rub, scenario_rub_os, scenario_os]
}
}
#[derive(Debug)]
/// Setup for delete query test with one table and one chunk moved from RUB to OS
pub struct DeleteFromRubOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteFromRubOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the delete predicate is added in RUB
// and is carried along as the chunk moves
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"];
// delete predicate
let i: f64 = 1.0;
let expr = col("bar").eq(lit(i));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 15)
.add_expr(expr)
.build();
// delete happens to data in RUB
let scenario_rub =
make_delete_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens to data in RUB then persisted
let scenario_rub_os =
make_delete_rub_to_os(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens to data in RUB then persisted then RUB unloaded
let scenario_os =
make_delete_rub_to_os_and_unload_rub(lp_lines.clone(), pred, table_name, partition_key)
.await;
// return scenarios to run queries
vec![scenario_rub, scenario_rub_os, scenario_os]
}
}
#[derive(Debug)]
/// Setup for delete query test with one table and one chunk in both RUB and OS
pub struct DeleteFromOsOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteFromOsOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the delete predicate is added to persisted chunks
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec!["cpu bar=1 10", "cpu bar=2 20"];
// delete predicate
let i: f64 = 1.0;
let expr = col("bar").eq(lit(i));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 15)
.add_expr(expr)
.build();
// delete happens after data is persisted but still in RUB
let scenario_rub_os =
make_delete_os_with_rub(lp_lines.clone(), pred.clone(), table_name, partition_key)
.await;
// delete happens after data is persisted but still in RUB and then unload RUB
let _scenario_rub_os_unload_rub = make_delete_os_with_rub_then_unload_rub(
lp_lines.clone(),
pred.clone(),
table_name,
partition_key,
)
.await;
// delete happens after data is persisted and RUB is unloaded
let _scenario_os = make_delete_os(lp_lines.clone(), pred, table_name, partition_key).await;
// return scenarios to run queries
//vec![scenario_rub_os, scenario_rub_os_unload_rub, scenario_os]
// NGA todo: turn the last 2 scenarios on when #2518 and #2550 are done
vec![scenario_rub_os]
}
}
#[derive(Debug)]
/// Setup for multi-expression delete query test with one table and one chunk moved from MUB to RUB to OS
pub struct DeleteMultiExprsFromMubOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteMultiExprsFromMubOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the multi-expression delete predicate is added in MUB
// and is carried along as the chunk moves
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec![
"cpu,foo=me bar=1 10",
"cpu,foo=you bar=2 20",
"cpu,foo=me bar=1 30",
"cpu,foo=me bar=1 40",
];
// delete predicate
let i: f64 = 1.0;
let expr1 = col("bar").eq(lit(i));
let expr2 = col("foo").eq(lit("me"));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 32)
.add_expr(expr1)
.add_expr(expr2)
.build();
// delete happens when data in MUB
let _scenario_mub = make_delete_mub(lp_lines.clone(), pred.clone()).await;
// delete happens when data in MUB then moved to RUB
let scenario_rub =
make_delete_mub_to_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens when data in MUB then moved to RUB and then persisted
let scenario_rub_os = make_delete_mub_to_rub_and_os(
lp_lines.clone(),
pred.clone(),
table_name,
partition_key,
)
.await;
// delete happens when data in MUB then moved to RUB, then persisted, and then RUB is unloaded
let scenario_os =
make_delete_mub_to_os(lp_lines.clone(), pred, table_name, partition_key).await;
// return scenarios to run queries
// NGA todo: add scenario_mub in this after the deleted data is removed in the scan
// right now MUB does not push predicate down so the result is not correct yet
vec![scenario_rub, scenario_rub_os, scenario_os]
}
}
#[derive(Debug)]
/// Setup for multi-expression delete query test with one table and one chunk moved from RUB to OS
pub struct DeleteMultiExprsFromRubOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteMultiExprsFromRubOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the multi-expression delete predicate is added in RUB
// and is carried along as the chunk moves
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec![
"cpu,foo=me bar=1 10",
"cpu,foo=you bar=2 20",
"cpu,foo=me bar=1 30",
"cpu,foo=me bar=1 40",
];
// delete predicate
let i: f64 = 1.0;
let expr1 = col("bar").eq(lit(i));
let expr2 = col("foo").eq(lit("me"));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 32)
.add_expr(expr1)
.add_expr(expr2)
.build();
// delete happens to data in RUB
let scenario_rub =
make_delete_rub(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens to data in RUB then persisted
let scenario_rub_os =
make_delete_rub_to_os(lp_lines.clone(), pred.clone(), table_name, partition_key).await;
// delete happens to data in RUB then persisted then RUB unloaded
let scenario_os =
make_delete_rub_to_os_and_unload_rub(lp_lines.clone(), pred, table_name, partition_key)
.await;
// return scenarios to run queries
vec![scenario_rub, scenario_rub_os, scenario_os]
}
}
#[derive(Debug)]
/// Setup for multi-expression delete query test with one table and one chunk in both RUB and OS
pub struct DeleteMultiExprsFromOsOneMeasurementOneChunk {}
#[async_trait]
impl DbSetup for DeleteMultiExprsFromOsOneMeasurementOneChunk {
async fn make(&self) -> Vec<DbScenario> {
// The main purpose of these scenarios is that the multi-expression delete predicate is added to
// persisted chunks
// General setup for all scenarios
let partition_key = "1970-01-01T00";
let table_name = "cpu";
// chunk data
let lp_lines = vec![
"cpu,foo=me bar=1 10",
"cpu,foo=you bar=2 20",
"cpu,foo=me bar=1 30",
"cpu,foo=me bar=1 40",
];
// delete predicate
let i: f64 = 1.0;
let expr1 = col("bar").eq(lit(i));
let expr2 = col("foo").eq(lit("me"));
let pred = PredicateBuilder::new()
.table("cpu")
.timestamp_range(0, 32)
.add_expr(expr1)
.add_expr(expr2)
.build();
// delete happens after data is persisted but still in RUB
let scenario_rub_os =
make_delete_os_with_rub(lp_lines.clone(), pred.clone(), table_name, partition_key)
.await;
// delete happens after data is persisted but still in RUB and then unload RUB
let _scenario_rub_os_unload_rub = make_delete_os_with_rub_then_unload_rub(
lp_lines.clone(),
pred.clone(),
table_name,
partition_key,
)
.await;
// delete happens after data is persisted and RUB is unloaded
let _scenario_os = make_delete_os(lp_lines.clone(), pred, table_name, partition_key).await;
// return scenarios to run queries
//vec![scenario_rub_os, scenario_rub_os_unload_rub, scenario_os]
// NGA todo: turn the last 2 scenarios on when #2518 and #2550 are done
vec![scenario_rub_os]
}
}
// NGA todo: Add these scenarios after deleted data is eliminated from scan
// 1. Many deletes, each with one or multiple expressions
// 2. Many different-type chunks when a delete happens
// 3. Combination of above
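// Each helper below drives the same chunk lifecycle by hand, stopping at a
// different stage; a minimal sketch of the calls involved (all taken from the
// helper bodies that follow):
//
//   write_lp(&db, &lp_lines.join("\n")).await;                 // open MUB
//   db.delete("cpu", Arc::new(pred)).await.unwrap();           // attach delete predicate
//   db.rollover_partition(table_name, partition_key).await.unwrap();           // close MUB
//   db.move_chunk_to_read_buffer(table_name, partition_key, 0).await.unwrap(); // MUB -> RUB
//   db.persist_partition(table_name, partition_key, Instant::now() + Duration::from_secs(1))
//       .await
//       .unwrap();                                             // RUB -> OS
//   db.unload_read_buffer(table_name, partition_key, 1).unwrap();              // drop RUB
//
// The expected chunk counts at each stage are asserted with
// count_mutable_buffer_chunks / count_read_buffer_chunks / count_object_store_chunks.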
async fn make_delete_mub(lp_lines: Vec<&str>, pred: Predicate) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// One open MUB, no RUB, no OS
assert_eq!(count_mutable_buffer_chunks(&db), 1);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 0);
db.delete("cpu", Arc::new(pred)).await.unwrap();
// Still one but frozen MUB, no RUB, no OS
assert_eq!(count_mutable_buffer_chunks(&db), 1);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 0);
DbScenario {
scenario_name: "Deleted data in MUB".into(),
db,
}
}
async fn make_delete_mub_to_rub(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// delete data in MUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// move MUB to RUB and the delete predicate will be automatically included in RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// No MUB, one RUB, no OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 1);
assert_eq!(count_object_store_chunks(&db), 0);
DbScenario {
scenario_name: "Deleted data in RUB moved from MUB".into(),
db,
}
}
async fn make_delete_mub_to_rub_and_os(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// delete data in MUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// move MUB to RUB and the delete predicate will be automatically included in RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// No MUB, one RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 1);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in RUB and OS".into(),
db,
}
}
async fn make_delete_mub_to_os(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// delete data in MUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// move MUB to RUB and the delete predicate will be automatically included in RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// remove RUB
db.unload_read_buffer(table_name, partition_key, 1).unwrap();
// No MUB, no RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in OS".into(),
db,
}
}
async fn make_delete_rub(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// delete data in RUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// No MUB, one RUB, no OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 1);
assert_eq!(count_object_store_chunks(&db), 0);
DbScenario {
scenario_name: "Deleted data in RUB".into(),
db,
}
}
async fn make_delete_rub_to_os(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// delete data in RUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// No MUB, one RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 1);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in RUB and then persisted to OS".into(),
db,
}
}
async fn make_delete_rub_to_os_and_unload_rub(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// delete data in RUB
db.delete("cpu", Arc::new(pred)).await.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// remove RUB
db.unload_read_buffer(table_name, partition_key, 1).unwrap();
// No MUB, no RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in RUB then persisted to OS then RUB unloaded".into(),
db,
}
}
async fn make_delete_os_with_rub(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// delete data after it is persisted but RUB is still available
db.delete("cpu", Arc::new(pred)).await.unwrap();
// No MUB, one RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 1);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in OS with RUB".into(),
db,
}
}
async fn make_delete_os_with_rub_then_unload_rub(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// delete data after it is persisted but RUB is still available
db.delete("cpu", Arc::new(pred)).await.unwrap();
// remove RUB
db.unload_read_buffer(table_name, partition_key, 1).unwrap();
// No MUB, no RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in OS only but the delete happens before RUB is unloaded"
.into(),
db,
}
}
async fn make_delete_os(
lp_lines: Vec<&str>,
pred: Predicate,
table_name: &str,
partition_key: &str,
) -> DbScenario {
let db = make_db().await.db;
// create an open MUB
write_lp(&db, &lp_lines.join("\n")).await;
// move MUB to RUB
db.rollover_partition(table_name, partition_key)
.await
.unwrap();
db.move_chunk_to_read_buffer(table_name, partition_key, 0)
.await
.unwrap();
// persist RUB and the delete predicate will be automatically included in the OS chunk
db.persist_partition(
table_name,
partition_key,
Instant::now() + Duration::from_secs(1),
)
.await
.unwrap();
// remove RUB
db.unload_read_buffer(table_name, partition_key, 1).unwrap();
// delete data after it is persisted and RUB is unloaded
db.delete("cpu", Arc::new(pred)).await.unwrap();
// No MUB, no RUB, one OS
assert_eq!(count_mutable_buffer_chunks(&db), 0);
assert_eq!(count_read_buffer_chunks(&db), 0);
assert_eq!(count_object_store_chunks(&db), 1);
DbScenario {
scenario_name: "Deleted data in OS and the delete happens after RUB is unloaded".into(),
db,
}
}

View File

@ -808,3 +808,68 @@ async fn sql_select_all_different_tags_chunks() {
&expected
);
}
#[tokio::test]
async fn sql_select_with_deleted_data_from_one_expr() {
let expected = vec![
"+-----+--------------------------------+",
"| bar | time |",
"+-----+--------------------------------+",
"| 2 | 1970-01-01T00:00:00.000000020Z |",
"+-----+--------------------------------+",
];
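// The delete predicate used by all three scenarios is `bar = 1.0` with a time
// range of 0..15, so the "cpu bar=1 10" row is removed and only the
// "cpu bar=2 20" row remains.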
// Data deleted when it is in MUB, and then moved to RUB and OS
run_sql_test_case!(
DeleteFromMubOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
// Data deleted when it is in RUB, and then moved OS
run_sql_test_case!(
DeleteFromRubOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
// Data deleted when it is in OS
run_sql_test_case!(
DeleteFromOsOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
}
#[tokio::test]
async fn sql_select_with_deleted_data_from_multi_exprs() {
let expected = vec![
"+-----+-----+--------------------------------+",
"| bar | foo | time |",
"+-----+-----+--------------------------------+",
"| 1 | me | 1970-01-01T00:00:00.000000040Z |",
"| 2 | you | 1970-01-01T00:00:00.000000020Z |",
"+-----+-----+--------------------------------+",
];
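// The delete predicate here is `bar = 1.0 AND foo = 'me'` with a time range of
// 0..32, which removes the rows at timestamps 10 and 30; the row at 40 falls
// outside the time range and the `foo=you` row does not match the expressions.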
// Data deleted when it is in MUB, and then moved to RUB and OS
run_sql_test_case!(
DeleteMultiExprsFromMubOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
// Data deleted when it is in RUB, and then moved OS
run_sql_test_case!(
DeleteMultiExprsFromRubOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
// Data deleted when it is in OS
run_sql_test_case!(
DeleteMultiExprsFromOsOneMeasurementOneChunk {},
"SELECT * from cpu",
&expected
);
}

View File

@ -2067,6 +2067,40 @@ mod test {
assert!(matches!(row_ids, RowIDsOption::All(_)));
}
#[test]
fn row_ids_filter_float_trimmed() {
let input = &[100.0, 200.0, 300.0, 2.0, 22.0, 30.0];
let col = Column::from(&input[..]);
let mut row_ids = col.row_ids_filter(
&cmp::Operator::Equal,
&Value::from(200.0),
RowIDs::new_bitmap(),
);
assert_eq!(row_ids.unwrap().to_vec(), vec![1]);
row_ids = col.row_ids_filter(
&cmp::Operator::LT,
&Value::from(64000.0),
RowIDs::new_bitmap(),
);
assert!(matches!(row_ids, RowIDsOption::All(_)));
row_ids = col.row_ids_filter(
&cmp::Operator::GTE,
&Value::from(-1_000_000.0),
RowIDs::new_bitmap(),
);
assert!(matches!(row_ids, RowIDsOption::All(_)));
row_ids = col.row_ids_filter(
&cmp::Operator::NotEqual,
&Value::from(1_000_000.3),
RowIDs::new_bitmap(),
);
assert!(matches!(row_ids, RowIDsOption::All(_)));
}
#[test]
fn row_ids_range() {
let input = &[100_i64, 200, 300, 2, 200, 22, 30];

View File

@ -7,6 +7,7 @@
//! allow results to be emitted as some logical type `L` via a transformation
//! `T`.
use either::Either;
use observability_deps::tracing::debug;
use std::cmp::Ordering;
use std::fmt::Debug;
use std::marker::PhantomData;
@ -373,14 +374,36 @@ where
Some(self.transcoder.decode(max))
}
fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
let value = self.transcoder.encode(value);
fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter");
let (value, op) = match self.transcoder.encode_comparable(value, *op) {
Some((value, op)) => (value, op),
None => {
// The value is not encodable. This can happen with the == or !=
// operator. In the case of ==, no values in the encoding could
// possibly satisfy the expression. In the case of !=, all
// values would satisfy the expression.
dst.clear();
return match op {
cmp::Operator::Equal => dst,
cmp::Operator::NotEqual => {
dst.add_range(0, self.num_rows());
dst
}
op => panic!("operator {:?} not expected", op),
};
}
};
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr");
// N.B, the transcoder may have changed the operator depending on the
// value provided.
match op {
cmp::Operator::GT => self.row_ids_cmp_order(&value, PartialOrd::gt, dst),
cmp::Operator::GTE => self.row_ids_cmp_order(&value, PartialOrd::ge, dst),
cmp::Operator::LT => self.row_ids_cmp_order(&value, PartialOrd::lt, dst),
cmp::Operator::LTE => self.row_ids_cmp_order(&value, PartialOrd::le, dst),
_ => self.row_ids_equal(&value, op, dst),
_ => self.row_ids_equal(&value, &op, dst),
}
}
@ -390,8 +413,16 @@ where
right: (L, &cmp::Operator),
dst: RowIDs,
) -> RowIDs {
let left = (self.transcoder.encode(left.0), left.1);
let right = (self.transcoder.encode(right.0), right.1);
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range");
let left = self
.transcoder
.encode_comparable(left.0, *left.1)
.expect("transcoder must return Some variant");
let right = self
.transcoder
.encode_comparable(right.0, *right.1)
.expect("transcoder must return Some variant");
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr");
match (&left.1, &right.1) {
(cmp::Operator::GT, cmp::Operator::LT)
@ -402,8 +433,8 @@ where
| (cmp::Operator::LT, cmp::Operator::GTE)
| (cmp::Operator::LTE, cmp::Operator::GT)
| (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order(
(&left.0, Self::ord_from_op(left.1)),
(&right.0, Self::ord_from_op(right.1)),
(&left.0, Self::ord_from_op(&left.1)),
(&right.0, Self::ord_from_op(&right.1)),
dst,
),

View File

@ -3,6 +3,7 @@
//! This encoding stores a column of fixed-width numerical values backed by an
//! an Arrow array, allowing for storage of NULL values.
use either::Either;
use observability_deps::tracing::debug;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::mem::size_of;
@ -227,6 +228,43 @@ where
}
dst
}
// Identify all row IDs that contain a non-null value.
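// For example, given rows [Some(1), None, Some(2), Some(3)] the ranges
// [0, 1) and [2, 4) are added to `dst`, covering exactly the non-null rows.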
fn all_non_null_row_ids(&self, mut dst: RowIDs) -> RowIDs {
dst.clear();
if self.null_count() == 0 {
dst.add_range(0, self.num_rows());
return dst;
}
let mut found = false;
let mut count = 0;
for i in 0..self.num_rows() as usize {
if self.arr.is_null(i) {
if found {
// add the non-null range
let (min, max) = (i as u32 - count, i as u32);
dst.add_range(min, max);
found = false;
count = 0;
}
continue;
}
if !found {
found = true;
}
count += 1;
}
// add any remaining range.
if found {
let (min, max) = (self.num_rows() - count, self.num_rows());
dst.add_range(min, max);
}
dst
}
}
impl<P, L, T> ScalarEncoding<L> for FixedNull<P, L, T>
@ -411,14 +449,34 @@ where
max.map(|v| self.transcoder.decode(v))
}
fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
let value = self.transcoder.encode(value);
fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter");
let (value, op) = match self.transcoder.encode_comparable(value, *op) {
Some((value, op)) => (value, op),
None => {
// The value is not encodable. This can happen with the == or !=
// operator. In the case of ==, no values in the encoding could
// possibly satisfy the expression. In the case of !=, all
// non-null values would satisfy the expression.
dst.clear();
return match op {
cmp::Operator::Equal => dst,
cmp::Operator::NotEqual => {
dst = self.all_non_null_row_ids(dst);
dst
}
op => panic!("operator {:?} not expected", op),
};
}
};
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr");
match op {
cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst),
cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst),
cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst),
cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(op), dst),
_ => self.row_ids_equal(value, op, dst),
cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
_ => self.row_ids_equal(value, &op, dst),
}
}
@ -428,8 +486,16 @@ where
right: (L, &cmp::Operator),
dst: RowIDs,
) -> RowIDs {
let left = (self.transcoder.encode(left.0), left.1);
let right = (self.transcoder.encode(right.0), right.1);
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range");
let left = self
.transcoder
.encode_comparable(left.0, *left.1)
.expect("transcoder must return Some variant");
let right = self
.transcoder
.encode_comparable(right.0, *right.1)
.expect("transcoder must return Some variant");
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr");
match (left.1, right.1) {
(cmp::Operator::GT, cmp::Operator::LT)
@ -440,8 +506,8 @@ where
| (cmp::Operator::LT, cmp::Operator::GTE)
| (cmp::Operator::LTE, cmp::Operator::GT)
| (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order(
(left.0, Self::ord_from_op(left.1)),
(right.0, Self::ord_from_op(right.1)),
(left.0, Self::ord_from_op(&left.1)),
(right.0, Self::ord_from_op(&right.1)),
dst,
),
@ -789,6 +855,38 @@ mod test {
assert_eq!(row_ids.to_vec(), vec![1, 2, 4]);
}
#[test]
fn row_ids_filter_range_all_non_null() {
let cases = vec![
(vec![None], vec![]),
(vec![None, None, None], vec![]),
(vec![Some(22)], vec![0_u32]),
(vec![Some(22), Some(3), Some(3)], vec![0, 1, 2]),
(vec![Some(22), None], vec![0]),
(
vec![Some(22), None, Some(1), None, Some(3), None],
vec![0, 2, 4],
),
(vec![Some(22), None, None, Some(33)], vec![0, 3]),
(vec![None, None, Some(33)], vec![2]),
(
vec![None, None, Some(33), None, None, Some(3), Some(3), Some(1)],
vec![2, 5, 6, 7],
),
];
for (i, (data, exp)) in cases.into_iter().enumerate() {
let (v, _) = new_encoding(data);
let dst = RowIDs::new_vector();
assert_eq!(
v.all_non_null_row_ids(dst).unwrap_vector(),
&exp,
"example {:?} failed",
i
);
}
}
#[test]
fn has_non_null_value() {
let (v, _) = new_encoding(vec![None, None]);

View File

@ -1,4 +1,5 @@
use either::Either;
use observability_deps::tracing::debug;
use crate::column::cmp;
use crate::column::RowIDs;
@ -288,18 +289,35 @@ where
let left_cmp_result = next.partial_cmp(left.0);
let right_cmp_result = next.partial_cmp(right.0);
// TODO(edd): eurgh I still don't understand how I got this to
// be correct. Need to revisit to make it simpler.
let left_result_ok =
!(left_cmp_result == Some(left_op.0) || left_cmp_result == Some(left_op.1));
left_cmp_result == Some(left_op.0) || left_cmp_result == Some(left_op.1);
let right_result_ok =
!(right_cmp_result == Some(right_op.0) || right_cmp_result == Some(right_op.1));
right_cmp_result == Some(right_op.0) || right_cmp_result == Some(right_op.1);
if !(left_result_ok || right_result_ok) {
if left_result_ok && right_result_ok {
dst.add_range(curr_logical_row_id, curr_logical_row_id + rl);
}
curr_logical_row_id += rl;
}
curr_logical_row_id += rl;
}
dst
}
fn all_non_null_row_ids(&self, mut dst: RowIDs) -> RowIDs {
dst.clear();
if self.null_count() == 0 {
dst.add_range(0, self.num_rows());
return dst;
}
let mut curr_logical_row_id = 0;
for (rl, next) in &self.run_lengths {
if next.is_some() {
dst.add_range(curr_logical_row_id, curr_logical_row_id + rl);
}
curr_logical_row_id += rl;
}
dst
@ -375,15 +393,34 @@ where
self.num_rows
}
fn row_ids_filter(&self, value: L, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
let value = self.transcoder.encode(value);
fn row_ids_filter(&self, value: L, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter");
let (value, op) = match self.transcoder.encode_comparable(value, *op) {
Some((value, op)) => (value, op),
None => {
// The value is not encodable. This can happen with the == or !=
// operator. In the case of ==, no values in the encoding could
// possibly satisfy the expression. In the case of !=, all
// non-null values would satisfy the expression.
dst.clear();
return match op {
cmp::Operator::Equal => dst,
cmp::Operator::NotEqual => {
dst = self.all_non_null_row_ids(dst);
dst
}
op => panic!("operator {:?} not expected", op),
};
}
};
debug!(value=?value, operator=?op, encoding=?ENCODING_NAME, "row_ids_filter encoded expr");
match op {
cmp::Operator::Equal | cmp::Operator::NotEqual => {
self.row_ids_cmp_equal(value, op, dst)
}
cmp::Operator::LT | cmp::Operator::LTE | cmp::Operator::GT | cmp::Operator::GTE => {
self.row_ids_cmp(value, op, dst)
}
cmp::Operator::GT => self.row_ids_cmp(value, &op, dst),
cmp::Operator::GTE => self.row_ids_cmp(value, &op, dst),
cmp::Operator::LT => self.row_ids_cmp(value, &op, dst),
cmp::Operator::LTE => self.row_ids_cmp(value, &op, dst),
_ => self.row_ids_cmp_equal(value, &op, dst),
}
}
@ -393,8 +430,16 @@ where
right: (L, &cmp::Operator),
dst: RowIDs,
) -> RowIDs {
let left = (self.transcoder.encode(left.0), left.1);
let right = (self.transcoder.encode(right.0), right.1);
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range");
let left = self
.transcoder
.encode_comparable(left.0, *left.1)
.expect("transcoder must return Some variant");
let right = self
.transcoder
.encode_comparable(right.0, *right.1)
.expect("transcoder must return Some variant");
debug!(left=?left, right=?right, encoding=?ENCODING_NAME, "row_ids_filter_range encoded expr");
match (&left.1, &right.1) {
(cmp::Operator::GT, cmp::Operator::LT)
@ -405,8 +450,8 @@ where
| (cmp::Operator::LT, cmp::Operator::GTE)
| (cmp::Operator::LTE, cmp::Operator::GT)
| (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range(
(&left.0, Self::ord_from_op(left.1)),
(&right.0, Self::ord_from_op(right.1)),
(&left.0, Self::ord_from_op(&left.1)),
(&right.0, Self::ord_from_op(&right.1)),
dst,
),
@ -622,6 +667,16 @@ mod test {
)
}
fn new_encoding_opt(
values: Vec<Option<i64>>,
) -> (RLE<i64, i64, Arc<MockTranscoder>>, Arc<MockTranscoder>) {
let mock = Arc::new(MockTranscoder::default());
(
RLE::new_from_iter_opt(values.into_iter(), Arc::clone(&mock)),
mock,
)
}
#[test]
fn new_from_iter() {
let cases = vec![
@ -981,6 +1036,38 @@ mod test {
}
}
#[test]
fn row_ids_filter_range_all_non_null() {
let cases = vec![
(vec![None], vec![]),
(vec![None, None, None], vec![]),
(vec![Some(22)], vec![0_u32]),
(vec![Some(22), Some(3), Some(3)], vec![0, 1, 2]),
(vec![Some(22), None], vec![0]),
(
vec![Some(22), None, Some(1), None, Some(3), None],
vec![0, 2, 4],
),
(vec![Some(22), None, None, Some(33)], vec![0, 3]),
(vec![None, None, Some(33)], vec![2]),
(
vec![None, None, Some(33), None, None, Some(3), Some(3), Some(1)],
vec![2, 5, 6, 7],
),
];
for (i, (data, exp)) in cases.into_iter().enumerate() {
let (v, _) = new_encoding_opt(data);
let dst = RowIDs::new_vector();
assert_eq!(
v.all_non_null_row_ids(dst).unwrap_vector(),
&exp,
"example {:?} failed",
i
);
}
}
#[test]
fn row_ids_filter_range() {
let (enc, transcoder) = new_encoding(vec![
@ -1016,6 +1103,37 @@ mod test {
assert_eq!(transcoder.encodings(), calls * 2);
}
#[test]
fn row_ids_filter_range_nulls() {
let (enc, transcoder) = new_encoding_opt(vec![
Some(100),
None,
None,
None,
Some(100),
Some(101),
Some(101),
]);
let cases = vec![
(
(100, &Operator::GTE),
(240, &Operator::LT),
vec![0, 4, 5, 6],
),
((100, &Operator::GT), (240, &Operator::LT), vec![5, 6]),
((10, &Operator::LT), (-100, &Operator::GT), vec![]),
((21, &Operator::GTE), (100, &Operator::LTE), vec![0, 4]),
];
let calls = cases.len();
for (left, right, exp) in cases {
let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector());
assert_eq!(dst.unwrap_vector(), &exp);
}
assert_eq!(transcoder.encodings(), calls * 2);
}
#[test]
fn estimate_rle_size() {
let cases = vec![

View File

@ -1,3 +1,4 @@
use crate::column::cmp::Operator;
use std::{
convert::TryFrom,
fmt::{Debug, Display},
@ -13,7 +14,17 @@ use std::{
// `P` is a physical type that is stored directly within an encoding, `L` is
// a logical type callers expect to be returned.
pub trait Transcoder<P, L>: Debug + Display {
/// A function that encodes a logical value into a physical representation.
fn encode(&self, _: L) -> P;
/// A function that attempts to encode a logical value, within the context
/// of a comparison operator, into a physical representation.
///
/// Implementations should return a suitable operator for the physical
/// representation, which may differ from the provided operator.
fn encode_comparable(&self, _: L, _: Operator) -> Option<(P, Operator)>;
/// A function to decode a physical representation back into a logical value.
fn decode(&self, _: P) -> L;
}
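// For example, a float-trimming transcoder with a u16 physical type (see
// `FloatByteTrimmer` further down in this file) turns the logical expression
// `<= 23.2` into the physical expression `< 24`; an illustrative sketch of the
// contract:
//
//   let trimmer = FloatByteTrimmer {};
//   let encoded: Option<(u16, Operator)> = trimmer.encode_comparable(23.2, Operator::LTE);
//   // encoded is Some((24, Operator::LT)) because no u16 value lies in (23.2, 24.0)
//
// Callers such as `row_ids_filter` then evaluate the returned (value, operator)
// pair against the physical data.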
@ -29,6 +40,10 @@ impl<T> Transcoder<T, T> for NoOpTranscoder {
v
}
fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> {
Some((v, op))
}
fn decode(&self, v: T) -> T {
v
}
@ -56,13 +71,17 @@ pub struct ByteTrimmer {}
impl<P, L> Transcoder<P, L> for ByteTrimmer
where
L: From<P>,
P: TryFrom<L>,
P: TryFrom<L> + PartialEq + PartialOrd,
<P as TryFrom<L>>::Error: std::fmt::Debug,
{
fn encode(&self, v: L) -> P {
P::try_from(v).unwrap()
}
fn encode_comparable(&self, v: L, op: Operator) -> Option<(P, Operator)> {
P::try_from(v).ok().map(|p| (p, op))
}
fn decode(&self, v: P) -> L {
L::from(v)
}
@ -91,6 +110,54 @@ macro_rules! make_float_trimmer {
v as $type
}
fn encode_comparable(&self, v: f64, op: Operator) -> Option<($type, Operator)> {
assert!(v <= <$type>::MAX as f64);
if v == ((v as $type) as f64) {
return Some((v as $type, op));
}
match op {
Operator::Equal => {
None // no encoded values will == v
}
Operator::NotEqual => {
None // all encoded values will != v
}
Operator::LT => {
// convert to next highest encodable value. For example
// given `< 23.2` return 24.0 encoded as the physical
// type. < 23.2 is logically equivalent to < 24.0 since
// there are no valid values in the domain (23.2, 24.0).
Some((v.ceil() as $type, op))
}
Operator::LTE => {
// convert to next highest encodable value and change
// operator to <.
// For example given `<= 23.2` return 24.0 encoded as
// the physical type. <= 23.2 is logically equivalent
// to < 24.0 since there are no valid values in the
// domain [23.2, 24.0).
Some((v.ceil() as $type, Operator::LT))
}
Operator::GT => {
// convert to next lowest encodable value. For example
// given `> 23.2` return 23.0 encoded as the physical
// type. > 23.2 is logically equivalent to > 23.0 since
// there are no valid values in the domain (23.0, 23.2].
Some((v.floor() as $type, op))
}
Operator::GTE => {
// convert to next lowest encodable value and change
// operator to >.
// For example given `>= 23.2` return 23.0 encoded as
// the physical type. >= 23.2 is logically equivalent
// to > 23.0 since there are no valid values in the
// domain (23.0, 23.2).
Some((v.floor() as $type, Operator::GT))
}
}
}
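// In summary, for a float value v that has no exact physical representation:
//   ==  -> None (no encoded value can match)
//   !=  -> None (every encoded value matches; callers handle this case)
//   <   -> (v.ceil(),  <)     <=  -> (v.ceil(),  <)
//   >   -> (v.floor(), >)     >=  -> (v.floor(), >)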
fn decode(&self, v: $type) -> f64 {
v.into()
}
@ -119,7 +186,7 @@ impl Display for FloatByteTrimmer {
// result.
#[cfg(test)]
use std::{sync::atomic::AtomicUsize, sync::atomic::Ordering, sync::Arc};
use std::{sync::atomic, sync::atomic::AtomicUsize, sync::Arc};
#[cfg(test)]
/// A mock implementation of Transcoder that tracks calls to encode and decode.
/// This is useful for testing encoder implementations.
@ -127,6 +194,7 @@ use std::{sync::atomic::AtomicUsize, sync::atomic::Ordering, sync::Arc};
pub struct MockTranscoder {
encoding_calls: AtomicUsize,
decoding_calls: AtomicUsize,
partial_cmp_calls: AtomicUsize,
}
#[cfg(test)]
@ -135,6 +203,7 @@ impl Default for MockTranscoder {
Self {
encoding_calls: AtomicUsize::default(),
decoding_calls: AtomicUsize::default(),
partial_cmp_calls: AtomicUsize::default(),
}
}
}
@ -142,23 +211,28 @@ impl Default for MockTranscoder {
#[cfg(test)]
impl MockTranscoder {
pub fn encodings(&self) -> usize {
self.encoding_calls.load(Ordering::Relaxed)
self.encoding_calls.load(atomic::Ordering::Relaxed)
}
pub fn decodings(&self) -> usize {
self.decoding_calls.load(Ordering::Relaxed)
self.decoding_calls.load(atomic::Ordering::Relaxed)
}
}
#[cfg(test)]
impl<T> Transcoder<T, T> for MockTranscoder {
fn encode(&self, v: T) -> T {
self.encoding_calls.fetch_add(1, Ordering::Relaxed);
self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
v
}
fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> {
self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
Some((v, op))
}
fn decode(&self, v: T) -> T {
self.decoding_calls.fetch_add(1, Ordering::Relaxed);
self.decoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
v
}
}
@ -166,12 +240,17 @@ impl<T> Transcoder<T, T> for MockTranscoder {
#[cfg(test)]
impl<T> Transcoder<T, T> for Arc<MockTranscoder> {
fn encode(&self, v: T) -> T {
self.encoding_calls.fetch_add(1, Ordering::Relaxed);
self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
v
}
fn encode_comparable(&self, v: T, op: Operator) -> Option<(T, Operator)> {
self.encoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
Some((v, op))
}
fn decode(&self, v: T) -> T {
self.decoding_calls.fetch_add(1, Ordering::Relaxed);
self.decoding_calls.fetch_add(1, atomic::Ordering::Relaxed);
v
}
}

View File

@ -754,4 +754,303 @@ mod test {
//assert_eq!(dst.unwrap_vector(), &exp, "example '{} {:?}' failed", op, v);
}
}
#[test]
fn row_ids_filter_float_trimmer() {
let data = vec![100.0, 200.0, 100.0, 300.0, 400.0];
let float_trimmer = FloatByteTrimmer {};
let data_float_trimmed = data
.iter()
.cloned()
.map::<u16, _>(|x| float_trimmer.encode(x))
.collect::<Vec<u16>>();
let cases: Vec<Box<dyn ScalarEncoding<f64>>> = vec![
Box::new(RLE::<u16, f64, _>::new_from_iter(
data_float_trimmed.iter().cloned(),
float_trimmer,
)),
Box::new(Fixed::<u16, f64, _>::new(
data_float_trimmed.clone(),
FloatByteTrimmer {},
)),
Box::new(FixedNull::<UInt16Type, f64, _>::new(
PrimitiveArray::from(data_float_trimmed),
FloatByteTrimmer {},
)),
];
for enc in cases {
_row_ids_filter_float_trimmer(enc)
}
}
fn _row_ids_filter_float_trimmer(enc: Box<dyn ScalarEncoding<f64>>) {
// [100.0, 200.0, 100.0, 300.0, 400.0]
let cases = vec![
(100.0, Operator::Equal, vec![0, 2]), // 100.0, 100.0
(100.0, Operator::NotEqual, vec![1, 3, 4]), // 200.0, 300.0, 400.0
(100.0, Operator::LT, vec![]), //
(100.0, Operator::LTE, vec![0, 2]), // 100.0, 100.0
(100.0, Operator::GT, vec![1, 3, 4]), // 200.0, 300.0, 400.0
(100.0, Operator::GTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(200.0, Operator::Equal, vec![1]), // 200.0
(200.0, Operator::NotEqual, vec![0, 2, 3, 4]), // 100.0, 100.0, 300.0, 400.0
(200.0, Operator::LT, vec![0, 2]), // 100.0, 100.0
(200.0, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0
(200.0, Operator::GT, vec![3, 4]), // 300.0, 400.0
(200.0, Operator::GTE, vec![1, 3, 4]), // 200.0, 300.0, 400.0
(400.0, Operator::Equal, vec![4]), // 400.0
(400.0, Operator::NotEqual, vec![0, 1, 2, 3]), // 100.0, 200.0, 100.0, 300.0
(400.0, Operator::LT, vec![0, 1, 2, 3]), // 100.0, 200.0, 100.0, 300.0
(400.0, Operator::LTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(400.0, Operator::GT, vec![]), //
(400.0, Operator::GTE, vec![4]), // 400.0
// Values not present in the column
(99.0, Operator::Equal, vec![]), //
(99.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(99.0, Operator::LT, vec![]), //
(99.0, Operator::LTE, vec![]), //
(99.0, Operator::GT, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(99.0, Operator::GTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(200.4, Operator::Equal, vec![]), //
(200.4, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(200.4, Operator::LT, vec![0, 1, 2]), // 100.0, 200.0, 100.0
(200.4, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0
(200.4, Operator::GT, vec![3, 4]), // 300.0, 400.0
(200.4, Operator::GTE, vec![3, 4]), // 300.0, 400.0
(201.0, Operator::Equal, vec![]), //
(201.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(201.0, Operator::LT, vec![0, 1, 2]), // 100.0, 200.0, 100.0
(201.0, Operator::LTE, vec![0, 1, 2]), // 100.0, 200.0, 100.0
(201.0, Operator::GT, vec![3, 4]), // 300.0, 400.0
(201.0, Operator::GTE, vec![3, 4]), // 300.0, 400.0
(401.0, Operator::Equal, vec![]), //
(401.0, Operator::NotEqual, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(401.0, Operator::LT, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(401.0, Operator::LTE, vec![0, 1, 2, 3, 4]), // 100.0, 200.0, 100.0, 300.0, 400.0
(401.0, Operator::GT, vec![]), //
(401.0, Operator::GTE, vec![]), //
];
for (v, op, exp) in cases {
let dst = enc.row_ids_filter(v, &op, RowIDs::new_vector());
assert_eq!(
dst.unwrap_vector(),
&exp,
"example '{} {:?}' failed for {:?}",
op,
v,
enc.name()
);
}
}
#[test]
fn row_ids_filter_float_trimmer_with_nulls() {
let data = vec![Some(100.0), None, None, Some(200.0), None];
let float_trimmer = FloatByteTrimmer {};
let cases: Vec<Box<dyn ScalarEncoding<f64>>> = vec![
Box::new(RLE::<u16, f64, _>::new_from_iter_opt(
data.iter()
.cloned()
.map(|x| x.map(|v| float_trimmer.encode(v))),
FloatByteTrimmer {},
)),
Box::new(FixedNull::<UInt16Type, f64, _>::new(
data.iter()
.cloned()
.map(|v| v.map(|v| float_trimmer.encode(v)))
.collect(),
FloatByteTrimmer {},
)),
];
for enc in cases {
_row_ids_filter_float_trimmer_with_nulls(enc)
}
}
fn _row_ids_filter_float_trimmer_with_nulls(enc: Box<dyn ScalarEncoding<f64>>) {
// [100.0, NULL, NULL, 200.0, NULL]
let cases = vec![
(100.0, Operator::Equal, vec![0]), // 100.0
(100.0, Operator::NotEqual, vec![3]), // 200.0
(100.0, Operator::LT, vec![]), //
(100.0, Operator::LTE, vec![0]), // 100.0
(100.0, Operator::GT, vec![3]), // 200.0
(100.0, Operator::GTE, vec![0, 3]), // 100.0, 200.0
(200.0, Operator::Equal, vec![3]), // 200.0
(200.0, Operator::NotEqual, vec![0]), // 100.0
(200.0, Operator::LT, vec![0]), // 100.0
(200.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0
(200.0, Operator::GT, vec![]), //
(200.0, Operator::GTE, vec![3]), // 200.0
// Values not present in the column
(99.0, Operator::Equal, vec![]), //
(99.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0
(99.0, Operator::LT, vec![]), //
(99.0, Operator::LTE, vec![]), //
(99.0, Operator::GT, vec![0, 3]), // 100.0, 200.0
(99.0, Operator::GTE, vec![0, 3]), // 100.0, 200.0
(200.4, Operator::Equal, vec![]), //
(200.4, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0
(200.4, Operator::LT, vec![0, 3]), // 100.0,200.0
(200.4, Operator::LTE, vec![0, 3]), // 100.0, 200.0
(200.4, Operator::GT, vec![]), //
(200.4, Operator::GTE, vec![]), //
(201.0, Operator::Equal, vec![]), //
(201.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0
(201.0, Operator::LT, vec![0, 3]), // 100.0, 200.0
(201.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0
(201.0, Operator::GT, vec![]), //
(201.0, Operator::GTE, vec![]), //
(401.0, Operator::Equal, vec![]), //
(401.0, Operator::NotEqual, vec![0, 3]), // 100.0, 200.0
(401.0, Operator::LT, vec![0, 3]), // 100.0, 200.0
(401.0, Operator::LTE, vec![0, 3]), // 100.0, 200.0
(401.0, Operator::GT, vec![]), //
(401.0, Operator::GTE, vec![]), //
];
for (v, op, exp) in cases {
let dst = enc.row_ids_filter(v, &op, RowIDs::new_vector());
assert_eq!(
dst.unwrap_vector(),
&exp,
"example '{} {:?}' failed for {:?}",
op,
v,
enc.name()
);
}
}
#[test]
fn row_ids_filter_range_float_trimmer() {
let data = vec![100.0, 200.0, 100.0, 300.0, 400.0];
let float_trimmer = FloatByteTrimmer {};
let data_float_trimmed = data
.iter()
.cloned()
.map::<u16, _>(|x| float_trimmer.encode(x))
.collect::<Vec<u16>>();
let cases: Vec<Box<dyn ScalarEncoding<f64>>> = vec![
Box::new(RLE::<u16, f64, _>::new_from_iter(
data_float_trimmed.iter().cloned(),
float_trimmer,
)),
Box::new(Fixed::<u16, f64, _>::new(
data_float_trimmed.clone(),
FloatByteTrimmer {},
)),
Box::new(FixedNull::<UInt16Type, f64, _>::new(
PrimitiveArray::from(data_float_trimmed),
FloatByteTrimmer {},
)),
];
for enc in cases {
_row_ids_filter_range_float_trimmer(enc)
}
}
fn _row_ids_filter_range_float_trimmer(enc: Box<dyn ScalarEncoding<f64>>) {
// [100.0, 200.0, 100.0, 300.0, 400.0]
let cases = vec![
((100.0, &Operator::LT), (99.0, &Operator::GT), vec![]), //
((100.0, &Operator::LTE), (100.0, &Operator::GTE), vec![0, 2]), // 100.0, 100.0
(
(100.0, &Operator::GT),
(400.0, &Operator::LTE),
vec![1, 3, 4],
), // 200.0, 300.0, 400.0
(
(100.0, &Operator::GTE),
(401.0, &Operator::LTE),
vec![0, 1, 2, 3, 4],
), // 100.0, 200.0, 100.0, 300.0, 400.0
((200.0, &Operator::LT), (99.6, &Operator::GT), vec![0, 2]), // 100.0, 100.0
((200.0, &Operator::GT), (401.2, &Operator::LTE), vec![3, 4]), // 300.0, 400.0
(
(200.0, &Operator::GTE),
(400.9, &Operator::LT),
vec![1, 3, 4],
), // 200.0, 300.0, 400.0
(
(99.8, &Operator::GT),
(500.87, &Operator::LT),
vec![0, 1, 2, 3, 4],
), // 100.0, 200.0, 100.0, 300.0, 400.0
];
for (left, right, exp) in cases {
let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector());
assert_eq!(
dst.unwrap_vector(),
&exp,
"example '{:?} {:?}' failed for {:?}",
left,
right,
enc.name(),
);
}
}
#[test]
fn row_ids_filter_range_float_trimmer_with_nulls() {
let data = vec![Some(100.0), None, None, Some(200.0), None];
let float_trimmer = FloatByteTrimmer {};
let cases: Vec<Box<dyn ScalarEncoding<f64>>> = vec![
Box::new(RLE::<u16, f64, _>::new_from_iter_opt(
data.iter()
.cloned()
.map(|x| x.map(|v| float_trimmer.encode(v))),
FloatByteTrimmer {},
)),
Box::new(FixedNull::<UInt16Type, f64, _>::new(
data.iter()
.cloned()
.map(|v| v.map(|v| float_trimmer.encode(v)))
.collect(),
FloatByteTrimmer {},
)),
];
for enc in cases {
_row_ids_filter_range_float_trimmer_with_nulls(enc)
}
}
fn _row_ids_filter_range_float_trimmer_with_nulls(enc: Box<dyn ScalarEncoding<f64>>) {
// [100.0, NULL, NULL, 200.0, NULL]
let cases = vec![
((100.0, &Operator::LT), (99.0, &Operator::GT), vec![]), //
((100.0, &Operator::LTE), (100.0, &Operator::GTE), vec![0]), // 100.0
((100.0, &Operator::GT), (400.0, &Operator::LTE), vec![3]), // 200.0
((100.0, &Operator::GTE), (401.0, &Operator::LTE), vec![0, 3]), // 100.0, 200.0
((200.0, &Operator::LT), (99.6, &Operator::GT), vec![0]), // 100.0
((200.0, &Operator::GT), (401.2, &Operator::LTE), vec![]), //
((99.8, &Operator::GT), (500.87, &Operator::LT), vec![0, 3]), // 100.0, 200.0
];
for (left, right, exp) in cases {
let dst = enc.row_ids_filter_range(left, right, RowIDs::new_vector());
assert_eq!(
dst.unwrap_vector(),
&exp,
"example '{:?} {:?}' failed for {:?}",
left,
right,
enc.name(),
);
}
}
}

View File

@ -21,7 +21,7 @@ use internal_types::freezable::Freezable;
use iox_object_store::IoxObjectStore;
use observability_deps::tracing::{error, info, warn};
use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
use parquet_file::catalog::api::PreservedCatalog;
use parquet_file::catalog::core::PreservedCatalog;
use persistence_windows::checkpoint::ReplayPlan;
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{future::Future, sync::Arc, time::Duration};
@ -63,7 +63,7 @@ pub enum Error {
))]
WipePreservedCatalog {
db_name: String,
source: Box<parquet_file::catalog::api::Error>,
source: Box<parquet_file::catalog::core::Error>,
},
#[snafu(display("failed to skip replay for database ({}): {}", db_name, source))]

View File

@ -32,8 +32,9 @@ use iox_object_store::IoxObjectStore;
use mutable_buffer::chunk::{ChunkMetrics as MutableBufferChunkMetrics, MBChunk};
use observability_deps::tracing::{debug, error, info};
use parquet_file::catalog::{
api::{CatalogParquetInfo, CheckpointData, PreservedCatalog},
cleanup::{delete_files as delete_parquet_files, get_unreferenced_parquet_files},
core::PreservedCatalog,
interface::{CatalogParquetInfo, CheckpointData},
prune::prune_history as prune_catalog_transaction_history,
};
use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::PersistenceWindows};
@ -518,7 +519,7 @@ impl Db {
pub async fn delete(
self: &Arc<Self>,
table_name: &str,
delete_predicate: &Predicate,
delete_predicate: Arc<Predicate>,
) -> Result<()> {
// get all partitions of this table
let table = self
@ -533,7 +534,7 @@ impl Db {
// save the delete predicate in the chunk
let mut chunk = chunk.write();
chunk
.add_delete_predicate(delete_predicate)
.add_delete_predicate(Arc::clone(&delete_predicate))
.context(AddDeletePredicateError)?;
}
}
@ -1331,7 +1332,7 @@ mod tests {
assert_store_sequenced_entry_failures,
db::{
catalog::chunk::ChunkStage,
test_helpers::{run_query, try_write_lp, write_lp},
test_helpers::{run_query, try_write_lp, write_lp, write_lp_with_time},
},
utils::{make_db, TestDb},
};
@ -1875,9 +1876,8 @@ mod tests {
let test_db = make_db().await;
let db = Arc::new(test_db.db);
let time0 = Utc::now();
write_lp(db.as_ref(), "cpu bar=1 10").await;
let time1 = Utc::now();
let t_write1 = Utc::now();
write_lp_with_time(db.as_ref(), "cpu bar=1 10", t_write1).await;
let partition_key = "1970-01-01T00";
let mb_chunk = db
@ -1892,14 +1892,12 @@ mod tests {
let first_old_rb_write = old_rb_chunk.time_of_first_write();
let last_old_rb_write = old_rb_chunk.time_of_last_write();
assert!(time0 < first_old_rb_write);
assert_eq!(first_old_rb_write, last_old_rb_write);
assert!(first_old_rb_write < time1);
assert_eq!(first_old_rb_write, t_write1);
// Put new data into the mutable buffer
let time2 = Utc::now();
write_lp(db.as_ref(), "cpu bar=2 20").await;
let time3 = Utc::now();
let t_write2 = Utc::now();
write_lp_with_time(db.as_ref(), "cpu bar=2 20", t_write2).await;
// now, compact it
let compacted_rb_chunk = db.compact_partition("cpu", partition_key).await.unwrap();
@ -1917,8 +1915,7 @@ mod tests {
let last_compacted_write = compacted_rb_chunk.time_of_last_write();
assert_eq!(first_old_rb_write, first_compacted_write);
assert_ne!(last_old_rb_write, last_compacted_write);
assert!(time2 < last_compacted_write);
assert!(last_compacted_write < time3);
assert_eq!(last_compacted_write, t_write2);
// data should be readable
let expected = vec![
@ -1935,7 +1932,7 @@ mod tests {
async fn collect_read_filter(chunk: &DbChunk) -> Vec<RecordBatch> {
chunk
.read_filter(&Default::default(), Selection::All)
.read_filter(&Default::default(), Selection::All, &[])
.unwrap()
.collect::<Vec<_>>()
.await
@ -2483,12 +2480,16 @@ mod tests {
#[tokio::test]
async fn partition_chunk_summaries_timestamp() {
let db = Arc::new(make_db().await.db);
let start = Utc::now();
write_lp(&db, "cpu bar=1 1").await;
let after_first_write = Utc::now();
write_lp(&db, "cpu bar=2 2").await;
let t_first_write = Utc::now();
write_lp_with_time(&db, "cpu bar=1 1", t_first_write).await;
let t_second_write = Utc::now();
write_lp_with_time(&db, "cpu bar=2 2", t_second_write).await;
let t_close_before = Utc::now();
db.rollover_partition("cpu", "1970-01-01T00").await.unwrap();
let after_close = Utc::now();
let t_close_after = Utc::now();
let mut chunk_summaries = db.chunk_summaries().unwrap();
@ -2496,59 +2497,18 @@ mod tests {
let summary = &chunk_summaries[0];
assert_eq!(summary.id, 0, "summary; {:#?}", summary);
assert!(
summary.time_of_first_write > start,
"summary; {:#?}",
summary
);
assert!(
summary.time_of_first_write < after_close,
"summary; {:#?}",
summary
);
assert!(
summary.time_of_last_write > after_first_write,
"summary; {:#?}",
summary
);
assert!(
summary.time_of_last_write < after_close,
"summary; {:#?}",
summary
);
assert!(
summary.time_closed.unwrap() > after_first_write,
"summary; {:#?}",
summary
);
assert!(
summary.time_closed.unwrap() < after_close,
"summary; {:#?}",
summary
);
assert_eq!(summary.time_of_first_write, t_first_write);
assert_eq!(summary.time_of_last_write, t_second_write);
assert!(t_close_before <= summary.time_closed.unwrap());
assert!(summary.time_closed.unwrap() <= t_close_after);
}
fn assert_first_last_times_eq(chunk_summary: &ChunkSummary) {
fn assert_first_last_times_eq(chunk_summary: &ChunkSummary, expected: DateTime<Utc>) {
let first_write = chunk_summary.time_of_first_write;
let last_write = chunk_summary.time_of_last_write;
assert_eq!(first_write, last_write);
}
fn assert_first_last_times_between(
chunk_summary: &ChunkSummary,
before: DateTime<Utc>,
after: DateTime<Utc>,
) {
let first_write = chunk_summary.time_of_first_write;
let last_write = chunk_summary.time_of_last_write;
assert!(before < first_write);
assert!(before < last_write);
assert!(first_write < after);
assert!(last_write < after);
assert_eq!(first_write, expected);
}
fn assert_chunks_times_ordered(before: &ChunkSummary, after: &ChunkSummary) {
@ -2581,21 +2541,17 @@ mod tests {
let db = make_db().await.db;
// get three chunks: one open, one closed in mb and one close in rb
// TIME 0 ---------------------------------------------------------------------------------
let time0 = Utc::now();
// In open chunk, will end up in rb/os
write_lp(&db, "cpu bar=1 1").await;
// TIME 1 ---------------------------------------------------------------------------------
let time1 = Utc::now();
let t_write1 = Utc::now();
write_lp_with_time(&db, "cpu bar=1 1", t_write1).await;
// Move open chunk to closed
db.rollover_partition("cpu", "1970-01-01T00").await.unwrap();
// TIME 2 ---------------------------------------------------------------------------------
let time2 = Utc::now();
// New open chunk in mb
// This point will end up in rb/os
write_lp(&db, "cpu bar=1,baz=2 2").await;
// TIME 3 ---------------------------------------------------------------------------------
let time3 = Utc::now();
let t_write2 = Utc::now();
write_lp_with_time(&db, "cpu bar=1,baz=2 2", t_write2).await;
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2604,19 +2560,15 @@ mod tests {
// Each chunk has one write, so both chunks should have first write == last write
let closed_mb_t3 = chunk_summaries[0].clone();
assert_eq!(closed_mb_t3.storage, ChunkStorage::ClosedMutableBuffer);
assert_first_last_times_eq(&closed_mb_t3);
assert_first_last_times_between(&closed_mb_t3, time0, time1);
assert_first_last_times_eq(&closed_mb_t3, t_write1);
let open_mb_t3 = chunk_summaries[1].clone();
assert_eq!(open_mb_t3.storage, ChunkStorage::OpenMutableBuffer);
assert_first_last_times_eq(&open_mb_t3);
assert_first_last_times_between(&open_mb_t3, time2, time3);
assert_first_last_times_eq(&open_mb_t3, t_write2);
assert_chunks_times_ordered(&closed_mb_t3, &open_mb_t3);
// This point makes a new open mb chunk and will end up in the closed mb chunk
write_lp(&db, "cpu bar=1,baz=2,frob=3 400000000000000").await;
// TIME 4 ---------------------------------------------------------------------------------
// we don't need to check this value with anything because no timestamps
// should be between time3 and time4
let t_write3 = Utc::now();
write_lp_with_time(&db, "cpu bar=1,baz=2,frob=3 400000000000000", t_write3).await;
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2639,9 +2591,6 @@ mod tests {
db.move_chunk_to_read_buffer("cpu", "1970-01-01T00", 0)
.await
.unwrap();
// TIME 5 ---------------------------------------------------------------------------------
// we don't need to check this value with anything because no timestamps
// should be between time4 and time5
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2668,9 +2617,6 @@ mod tests {
)
.await
.unwrap();
// TIME 6 ---------------------------------------------------------------------------------
// we don't need to check this value with anything because no timestamps
// should be between time5 and time6
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2692,8 +2638,6 @@ mod tests {
// Move open chunk to closed
db.rollover_partition("cpu", "1970-01-05T15").await.unwrap();
// TIME 7 ---------------------------------------------------------------------------------
let time7 = Utc::now();
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2710,9 +2654,8 @@ mod tests {
// New open chunk in mb
// This point will stay in this open mb chunk
write_lp(&db, "cpu bar=1,baz=3,blargh=3 400000000000000").await;
// TIME 8 ---------------------------------------------------------------------------------
let time8 = Utc::now();
let t_write4 = Utc::now();
write_lp_with_time(&db, "cpu bar=1,baz=3,blargh=3 400000000000000", t_write4).await;
// Check first/last write times on the chunks at this point
let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
@ -2730,8 +2673,7 @@ mod tests {
// times should be the same
let open_mb_t8 = chunk_summaries[2].clone();
assert_eq!(open_mb_t8.storage, ChunkStorage::OpenMutableBuffer);
assert_first_last_times_eq(&open_mb_t8);
assert_first_last_times_between(&open_mb_t8, time7, time8);
assert_first_last_times_eq(&open_mb_t8, t_write4);
let lifecycle_action = None;

View File

@ -16,7 +16,7 @@ use internal_types::{access::AccessRecorder, schema::Schema};
use mutable_buffer::chunk::{snapshot::ChunkSnapshot as MBChunkSnapshot, MBChunk};
use observability_deps::tracing::debug;
use parquet_file::chunk::ParquetChunk;
use predicate::predicate::{Predicate, PredicateBuilder};
use predicate::predicate::Predicate;
use read_buffer::RBChunk;
use tracker::{TaskRegistration, TaskTracker};
@ -80,7 +80,7 @@ pub struct ChunkMetadata {
pub schema: Arc<Schema>,
/// Delete predicates of this chunk
pub delete_predicates: Arc<Vec<Predicate>>,
pub delete_predicates: Vec<Arc<Predicate>>,
}
/// Different memory representations of a frozen chunk.
@ -307,14 +307,14 @@ impl CatalogChunk {
time_of_last_write: DateTime<Utc>,
schema: Arc<Schema>,
metrics: ChunkMetrics,
delete_predicates: Arc<Vec<Predicate>>,
delete_predicates: Vec<Arc<Predicate>>,
order: ChunkOrder,
) -> Self {
let stage = ChunkStage::Frozen {
meta: Arc::new(ChunkMetadata {
table_summary: Arc::new(chunk.table_summary()),
schema,
delete_predicates: Arc::clone(&delete_predicates),
delete_predicates,
}),
representation: ChunkStageFrozenRepr::ReadBuffer(Arc::new(chunk)),
};
@ -342,7 +342,7 @@ impl CatalogChunk {
time_of_first_write: DateTime<Utc>,
time_of_last_write: DateTime<Utc>,
metrics: ChunkMetrics,
delete_predicates: Arc<Vec<Predicate>>,
delete_predicates: Vec<Arc<Predicate>>,
order: ChunkOrder,
) -> Self {
assert_eq!(chunk.table_name(), addr.table_name.as_ref());
@ -469,30 +469,24 @@ impl CatalogChunk {
}
}
pub fn add_delete_predicate(&mut self, delete_predicate: &Predicate) -> Result<()> {
pub fn add_delete_predicate(&mut self, delete_predicate: Arc<Predicate>) -> Result<()> {
debug!(
?delete_predicate,
"Input delete predicate to CatalogChunk add_delete_predicate"
);
match &mut self.stage {
ChunkStage::Open { mb_chunk: _ } => {
// Freeze/close this chunk and add the delete_predicate to the resulting frozen chunk
self.freeze_with_predicate(delete_predicate)?;
}
ChunkStage::Frozen { meta, .. } => {
ChunkStage::Frozen { meta, .. } | ChunkStage::Persisted { meta, .. } => {
// Add the delete_predicate into the chunk's metadata
let mut del_preds: Vec<Predicate> = (*meta.delete_predicates).clone();
del_preds.push(delete_predicate.clone());
let mut del_preds = meta.delete_predicates.clone();
del_preds.push(delete_predicate);
*meta = Arc::new(ChunkMetadata {
table_summary: Arc::clone(&meta.table_summary),
schema: Arc::clone(&meta.schema),
delete_predicates: Arc::new(del_preds),
});
}
ChunkStage::Persisted { meta, .. } => {
// Add the delete_predicate into the chunk's metadata
let mut del_preds: Vec<Predicate> = (*meta.delete_predicates).clone();
del_preds.push(delete_predicate.clone());
*meta = Arc::new(ChunkMetadata {
table_summary: Arc::clone(&meta.table_summary),
schema: Arc::clone(&meta.schema),
delete_predicates: Arc::new(del_preds),
delete_predicates: del_preds,
});
}
}
@ -500,14 +494,23 @@ impl CatalogChunk {
Ok(())
}
pub fn delete_predicates(&mut self) -> Arc<Vec<Predicate>> {
pub fn delete_predicates(&mut self) -> &[Arc<Predicate>] {
match &self.stage {
ChunkStage::Open { mb_chunk: _ } => {
// no delete predicate for open chunk
Arc::new(vec![])
debug!("delete_predicates of Open chunk is empty");
&[]
}
ChunkStage::Frozen { meta, .. } => {
let preds = &meta.delete_predicates;
debug!(?preds, "delete_predicates of Frozen chunk");
preds
}
ChunkStage::Persisted { meta, .. } => {
let preds = &meta.delete_predicates;
debug!(?preds, "delete_predicates of Persisted chunk");
preds
}
ChunkStage::Frozen { meta, .. } => Arc::clone(&meta.delete_predicates),
ChunkStage::Persisted { meta, .. } => Arc::clone(&meta.delete_predicates),
}
}
@ -679,7 +682,14 @@ impl CatalogChunk {
///
/// This only works for chunks in the _open_ stage (chunk is converted) and the _frozen_ stage
/// (no-op) and will fail for other stages.
pub fn freeze_with_predicate(&mut self, delete_predicate: &Predicate) -> Result<()> {
pub fn freeze_with_predicate(&mut self, delete_predicate: Arc<Predicate>) -> Result<()> {
self.freeze_with_delete_predicates(vec![delete_predicate])
}
fn freeze_with_delete_predicates(
&mut self,
delete_predicates: Vec<Arc<Predicate>>,
) -> Result<()> {
match &self.stage {
ChunkStage::Open { mb_chunk, .. } => {
debug!(%self.addr, row_count=mb_chunk.rows(), "freezing chunk");
@ -692,7 +702,7 @@ impl CatalogChunk {
let metadata = ChunkMetadata {
table_summary: Arc::new(mb_chunk.table_summary()),
schema: s.full_schema(),
delete_predicates: Arc::new(vec![delete_predicate.clone()]),
delete_predicates,
};
self.stage = ChunkStage::Frozen {
@ -714,7 +724,7 @@ impl CatalogChunk {
}
pub fn freeze(&mut self) -> Result<()> {
self.freeze_with_predicate(&PredicateBuilder::default().build())
self.freeze_with_delete_predicates(vec![])
}
/// Set the chunk to the Moving state, returning a handle to the underlying storage
@ -776,7 +786,7 @@ impl CatalogChunk {
*meta = Arc::new(ChunkMetadata {
table_summary: Arc::clone(&meta.table_summary),
schema,
delete_predicates: Arc::clone(&meta.delete_predicates),
delete_predicates: meta.delete_predicates.clone(),
});
match &representation {
@ -1151,7 +1161,7 @@ mod tests {
expected_exprs1.push(e);
// Add a delete predicate into the open chunk = delete simulation for an open chunk
chunk.add_delete_predicate(&del_pred1).unwrap();
chunk.add_delete_predicate(Arc::new(del_pred1)).unwrap();
// chunk must be in frozen stage now
assert_eq!(chunk.stage().name(), "Frozen");
// chunk must have a delete predicate
@ -1182,7 +1192,7 @@ mod tests {
let mut expected_exprs2 = vec![];
let e = col("cost").not_eq(lit(15));
expected_exprs2.push(e);
chunk.add_delete_predicate(&del_pred2).unwrap();
chunk.add_delete_predicate(Arc::new(del_pred2)).unwrap();
// chunk still must be in frozen stage now
assert_eq!(chunk.stage().name(), "Frozen");
// chunk must have 2 delete predicates
@ -1248,7 +1258,7 @@ mod tests {
now,
now,
ChunkMetrics::new_unregistered(),
Arc::new(vec![] as Vec<Predicate>),
vec![],
ChunkOrder::new(6),
)
}
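The change of delete_predicates from Arc<Vec<Predicate>> to Vec<Arc<Predicate>> that runs through this file lets every chunk keep an independently growable list while still sharing individual predicates cheaply. A minimal, self-contained sketch of the pattern — the Predicate struct below is a simplified stand-in, not the real predicate::predicate::Predicate:

use std::sync::Arc;

// Simplified stand-in for the real Predicate type.
#[derive(Debug)]
struct Predicate {
    expr: String,
}

fn main() {
    let delete_cpu = Arc::new(Predicate { expr: "table = 'cpu'".to_string() });

    // Two chunks reference the same predicate without copying it...
    let mut chunk_a: Vec<Arc<Predicate>> = vec![Arc::clone(&delete_cpu)];
    let chunk_b: Vec<Arc<Predicate>> = vec![Arc::clone(&delete_cpu)];

    // ...and each list can still grow on its own, which Arc<Vec<_>> only
    // allowed after cloning the whole vector.
    chunk_a.push(Arc::new(Predicate { expr: "bar > 1".to_string() }));

    assert_eq!(chunk_a.len(), 2);
    assert_eq!(chunk_b.len(), 1);
    assert_eq!(Arc::strong_count(&delete_cpu), 3); // original binding + two list entries
}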

View File

@ -176,7 +176,7 @@ impl Partition {
time_of_first_write: DateTime<Utc>,
time_of_last_write: DateTime<Utc>,
schema: Arc<Schema>,
delete_predicates: Arc<Vec<Predicate>>,
delete_predicates: Vec<Arc<Predicate>>,
chunk_order: ChunkOrder,
) -> (u32, Arc<RwLock<CatalogChunk>>) {
let chunk_id = Self::pick_next(&mut self.next_chunk_id, "Chunk ID Overflow");
@ -231,7 +231,7 @@ impl Partition {
chunk: Arc<parquet_file::chunk::ParquetChunk>,
time_of_first_write: DateTime<Utc>,
time_of_last_write: DateTime<Utc>,
delete_predicates: Arc<Vec<Predicate>>,
delete_predicates: Vec<Arc<Predicate>>,
chunk_order: ChunkOrder,
) -> Arc<RwLock<CatalogChunk>> {
assert_eq!(chunk.table_name(), self.table_name());
@ -246,7 +246,7 @@ impl Partition {
time_of_first_write,
time_of_last_write,
self.metrics.new_chunk_metrics(),
Arc::clone(&delete_predicates),
delete_predicates,
chunk_order,
)),
);

View File

@ -121,7 +121,7 @@ impl DbChunk {
let meta = ChunkMetadata {
table_summary: Arc::new(mb_chunk.table_summary()),
schema: snapshot.full_schema(),
delete_predicates: Arc::new(vec![]), //todo: consider to use the one of the given chunk if appropriate
delete_predicates: vec![], // open chunk does not have delete predicate
};
(state, Arc::new(meta))
}
@ -224,6 +224,19 @@ impl DbChunk {
pub fn time_of_last_write(&self) -> DateTime<Utc> {
self.time_of_last_write
}
pub fn to_rub_negated_predicates(
delete_predicates: &[Arc<Predicate>],
) -> Result<Vec<read_buffer::Predicate>> {
let mut rub_preds: Vec<read_buffer::Predicate> = vec![];
for pred in delete_predicates {
let rub_pred = to_read_buffer_predicate(pred).context(PredicateConversion)?;
rub_preds.push(rub_pred);
}
debug!(?rub_preds, "RUB delete predicates");
Ok(rub_preds)
}
}
impl QueryChunk for DbChunk {
@ -314,11 +327,11 @@ impl QueryChunk for DbChunk {
Ok(pred_result)
}
// NGA todo: add delete predicate here to eliminate data at query time
fn read_filter(
&self,
predicate: &Predicate,
selection: Selection<'_>,
delete_predicates: &[Arc<Predicate>],
) -> Result<SendableRecordBatchStream, Self::Error> {
// The predicate is not required to be applied for correctness. We only push it down
// when possible, for performance gain
@ -326,6 +339,24 @@ impl QueryChunk for DbChunk {
debug!(?predicate, "Input Predicate to read_filter");
self.access_recorder.record_access_now();
debug!(?delete_predicates, "Input Delete Predicates to read_filter");
// add negated deleted ranges to the predicate
let mut pred_with_deleted_ranges = predicate.clone();
pred_with_deleted_ranges.add_delete_ranges(delete_predicates);
debug!(
?pred_with_deleted_ranges,
"Input Predicate plus deleted ranges"
);
// add negated deleted predicates
let mut pred_wth_deleted_exprs = pred_with_deleted_ranges.clone();
pred_wth_deleted_exprs.add_delete_exprs(delete_predicates);
debug!(
?pred_wth_deleted_exprs,
"Input Predicate plus deleted ranges and deleted predicates"
);
match &self.state {
State::MutableBuffer { chunk, .. } => {
let batch = chunk.read_filter(selection).context(MutableBufferChunk)?;
@ -339,12 +370,16 @@ impl QueryChunk for DbChunk {
Ok(predicate) => predicate,
Err(_) => read_buffer::Predicate::default(),
};
debug!(?rb_predicate, "Predicate pushed down to RUB");
// TODO: add collection of delete predicates associated with
// this chunk.
let read_results = chunk.read_filter(rb_predicate, selection, vec![]);
// combine all delete expressions to RUB's negated ones
let negated_delete_exprs = Self::to_rub_negated_predicates(delete_predicates)?;
debug!(
?negated_delete_exprs,
"Negated Predicate pushed down to RUB"
);
let read_results = chunk.read_filter(rb_predicate, selection, negated_delete_exprs);
let schema =
chunk
.read_filter_table_schema(selection)
@ -357,13 +392,11 @@ impl QueryChunk for DbChunk {
schema.into(),
)))
}
State::ParquetFile { chunk, .. } => {
chunk
.read_filter(predicate, selection)
.context(ParquetFileChunkError {
chunk_id: self.id(),
})
}
State::ParquetFile { chunk, .. } => chunk
.read_filter(&pred_wth_deleted_exprs, selection)
.context(ParquetFileChunkError {
chunk_id: self.id(),
}),
}
}
@ -503,8 +536,11 @@ impl QueryChunkMeta for DbChunk {
}
// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> Arc<Vec<Predicate>> {
Arc::clone(&self.meta.delete_predicates)
fn delete_predicates(&self) -> &[Arc<Predicate>] {
let pred = &self.meta.delete_predicates;
debug!(?pred, "Delete predicate in DbChunk");
pred
}
}
@ -514,7 +550,7 @@ mod tests {
use crate::{
db::{
catalog::chunk::{CatalogChunk, ChunkStage},
test_helpers::write_lp,
test_helpers::{write_lp, write_lp_with_time},
},
utils::make_db,
};
@ -527,7 +563,7 @@ mod tests {
let t2 = chunk.access_recorder().get_metrics();
snapshot
.read_filter(&Default::default(), Selection::All)
.read_filter(&Default::default(), Selection::All, &[])
.unwrap();
let t3 = chunk.access_recorder().get_metrics();
@ -608,9 +644,8 @@ mod tests {
async fn parquet_records_access() {
let db = make_db().await.db;
let before_creation = Utc::now();
write_lp(&db, "cpu,tag=1 bar=1 1").await;
let after_creation = Utc::now();
let creation_time = Utc::now();
write_lp_with_time(&db, "cpu,tag=1 bar=1 1", creation_time).await;
let id = db
.persist_partition(
@ -632,8 +667,7 @@ mod tests {
let first_write = chunk.time_of_first_write();
let last_write = chunk.time_of_last_write();
assert_eq!(first_write, last_write);
assert!(before_creation < first_write);
assert!(last_write < after_creation);
assert_eq!(first_write, creation_time);
test_chunk_access(&chunk).await
}
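Conceptually, the delete predicates now passed into read_filter act as negated filters: a row survives only if it matches the query predicate and matches none of the delete predicates. The self-contained sketch below shows that row-level semantics only; plain closures stand in for IOx predicates, and the real code pushes the equivalent negated expressions down to the read buffer and parquet readers instead of filtering row by row.

fn read_filter(
    rows: Vec<i64>,
    query_pred: impl Fn(i64) -> bool,
    delete_preds: &[Box<dyn Fn(i64) -> bool>],
) -> Vec<i64> {
    rows.into_iter()
        // keep a row if the query predicate matches and no delete predicate does
        .filter(|&r| query_pred(r) && !delete_preds.iter().any(|del| del(r)))
        .collect()
}

fn main() {
    let delete_preds: Vec<Box<dyn Fn(i64) -> bool>> =
        vec![Box::new(|r| r == 42), Box::new(|r| r >= 10)];
    let kept = read_filter(vec![1, 5, 10, 42], |r| r > 1, &delete_preds);
    assert_eq!(kept, vec![5]); // 1 fails the query predicate; 10 and 42 are deleted
}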

View File

@ -45,7 +45,7 @@ pub(crate) fn compact_chunks(
let mut input_rows = 0;
let mut time_of_first_write: Option<DateTime<Utc>> = None;
let mut time_of_last_write: Option<DateTime<Utc>> = None;
let mut delete_predicates: Vec<Predicate> = vec![];
let mut delete_predicates: Vec<Arc<Predicate>> = vec![];
let mut min_order = ChunkOrder::MAX;
let query_chunks = chunks
.into_iter()
@ -66,8 +66,7 @@ pub(crate) fn compact_chunks(
.map(|prev_last| prev_last.max(candidate_last))
.or(Some(candidate_last));
let mut preds = (*chunk.delete_predicates()).clone();
delete_predicates.append(&mut preds);
delete_predicates.extend(chunk.delete_predicates().iter().cloned());
min_order = min_order.min(chunk.order());
@ -103,7 +102,7 @@ pub(crate) fn compact_chunks(
ReorgPlanner::new().compact_plan(schema, query_chunks.iter().map(Arc::clone), key)?;
let physical_plan = ctx.prepare_plan(&plan)?;
let stream = ctx.execute(physical_plan).await?;
let stream = ctx.execute_stream(physical_plan).await?;
let rb_chunk = collect_rub(stream, &addr, metric_registry.as_ref())
.await?
.expect("chunk has zero rows");
@ -119,7 +118,7 @@ pub(crate) fn compact_chunks(
time_of_first_write,
time_of_last_write,
schema,
Arc::new(delete_predicates),
delete_predicates,
min_order,
)
};
@ -148,7 +147,7 @@ pub(crate) fn compact_chunks(
#[cfg(test)]
mod tests {
use super::*;
use crate::{db::test_helpers::write_lp, utils::make_db};
use crate::{db::test_helpers::write_lp_with_time, utils::make_db};
use data_types::chunk_metadata::ChunkStorage;
use lifecycle::{LockableChunk, LockablePartition};
use query::QueryDatabase;
@ -158,15 +157,13 @@ mod tests {
let test_db = make_db().await;
let db = test_db.db;
let time0 = Utc::now();
write_lp(db.as_ref(), "cpu,tag1=cupcakes bar=1 10").await;
write_lp(db.as_ref(), "cpu,tag1=asfd,tag2=foo bar=2 20").await;
write_lp(db.as_ref(), "cpu,tag1=bingo,tag2=foo bar=2 10").await;
write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 20").await;
write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 10").await;
let time1 = Utc::now();
let t_first_write = Utc::now();
write_lp_with_time(db.as_ref(), "cpu,tag1=cupcakes bar=1 10", t_first_write).await;
write_lp_with_time(db.as_ref(), "cpu,tag1=asfd,tag2=foo bar=2 20", Utc::now()).await;
write_lp_with_time(db.as_ref(), "cpu,tag1=bingo,tag2=foo bar=2 10", Utc::now()).await;
write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 20", Utc::now()).await;
let t_last_write = Utc::now();
write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 10", t_last_write).await;
let partition_keys = db.partition_keys().unwrap();
assert_eq!(partition_keys.len(), 1);
@ -182,9 +179,8 @@ mod tests {
let (_, fut) = compact_chunks(partition.upgrade(), vec![chunk.upgrade()]).unwrap();
// NB: perform the write before spawning the background task that performs the compaction
let time2 = Utc::now();
write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 40").await;
let time3 = Utc::now();
let t_later_write = Utc::now();
write_lp_with_time(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 40", t_later_write).await;
tokio::spawn(fut).await.unwrap().unwrap().unwrap();
let mut chunk_summaries: Vec<_> = db_partition.read().chunk_summaries().collect();
@ -194,16 +190,14 @@ mod tests {
let mub_summary = &chunk_summaries[0];
let first_mub_write = mub_summary.time_of_first_write;
let last_mub_write = mub_summary.time_of_last_write;
assert!(time2 < first_mub_write);
assert_eq!(first_mub_write, last_mub_write);
assert!(first_mub_write < time3);
assert_eq!(first_mub_write, t_later_write);
let rub_summary = &chunk_summaries[1];
let first_rub_write = rub_summary.time_of_first_write;
let last_rub_write = rub_summary.time_of_last_write;
assert!(time0 < first_rub_write);
assert!(first_rub_write < last_rub_write);
assert!(last_rub_write < time1);
assert_eq!(first_rub_write, t_first_write);
assert_eq!(last_rub_write, t_last_write);
let summaries: Vec<_> = chunk_summaries
.iter()

View File

@ -49,7 +49,7 @@ pub enum Error {
#[snafu(display("Error while commiting transaction on preserved catalog: {}", source))]
CommitError {
source: parquet_file::catalog::api::Error,
source: parquet_file::catalog::core::Error,
},
#[snafu(display("Cannot write chunk: {}", addr))]

View File

@ -54,7 +54,7 @@ pub fn move_chunk_to_read_buffer(
ReorgPlanner::new().compact_plan(schema, query_chunks.iter().map(Arc::clone), key)?;
let physical_plan = ctx.prepare_plan(&plan)?;
let stream = ctx.execute(physical_plan).await?;
let stream = ctx.execute_stream(physical_plan).await?;
let rb_chunk = collect_rub(
stream,
&addr.clone().into_partition(),

View File

@ -52,7 +52,7 @@ pub fn persist_chunks(
let mut time_of_first_write: Option<DateTime<Utc>> = None;
let mut time_of_last_write: Option<DateTime<Utc>> = None;
let mut query_chunks = vec![];
let mut delete_predicates: Vec<Predicate> = vec![];
let mut delete_predicates: Vec<Arc<Predicate>> = vec![];
let mut min_order = ChunkOrder::MAX;
for mut chunk in chunks {
// Sanity-check
@ -72,8 +72,7 @@ pub fn persist_chunks(
.map(|prev_last| prev_last.max(candidate_last))
.or(Some(candidate_last));
let mut preds = (*chunk.delete_predicates()).clone();
delete_predicates.append(&mut preds);
delete_predicates.extend(chunk.delete_predicates().iter().cloned());
min_order = min_order.min(chunk.order());
@ -112,8 +111,10 @@ pub fn persist_chunks(
"Expected split plan to produce exactly 2 partitions"
);
let to_persist_stream = ctx.execute_partition(Arc::clone(&physical_plan), 0).await?;
let remainder_stream = ctx.execute_partition(physical_plan, 1).await?;
let to_persist_stream = ctx
.execute_stream_partitioned(Arc::clone(&physical_plan), 0)
.await?;
let remainder_stream = ctx.execute_stream_partitioned(physical_plan, 1).await?;
let (to_persist, remainder) = futures::future::try_join(
collect_rub(to_persist_stream, &addr, metric_registry.as_ref()),
@ -131,7 +132,6 @@ pub fn persist_chunks(
partition_write.force_drop_chunk(id)
}
let del_preds = Arc::new(delete_predicates);
// Upsert remainder to catalog
if let Some(remainder) = remainder {
partition_write.create_rub_chunk(
@ -139,11 +139,13 @@ pub fn persist_chunks(
time_of_first_write,
time_of_last_write,
Arc::clone(&schema),
Arc::clone(&del_preds),
delete_predicates.clone(),
min_order,
);
}
// NGA todo: we hit this error if the chunk has rows but they have all been deleted
// Need to think of a way to handle this (https://github.com/influxdata/influxdb_iox/issues/2546)
let to_persist = to_persist.expect("should be rows to persist");
let (new_chunk_id, new_chunk) = partition_write.create_rub_chunk(
@ -151,7 +153,7 @@ pub fn persist_chunks(
time_of_first_write,
time_of_last_write,
schema,
del_preds,
delete_predicates,
min_order,
);
let to_persist = LockableCatalogChunk {

View File

@ -17,7 +17,7 @@ use data_types::{chunk_metadata::ChunkLifecycleAction, job::Job};
use internal_types::selection::Selection;
use observability_deps::tracing::{debug, warn};
use parquet_file::{
catalog::api::CatalogParquetInfo,
catalog::interface::CatalogParquetInfo,
chunk::{ChunkMetrics as ParquetChunkMetrics, ParquetChunk},
metadata::IoxMetadata,
storage::Storage,
@ -26,7 +26,7 @@ use persistence_windows::{
checkpoint::{DatabaseCheckpoint, PartitionCheckpoint, PersistCheckpointBuilder},
persistence_windows::FlushHandle,
};
use query::QueryChunk;
use query::{QueryChunk, QueryChunkMeta};
use snafu::ResultExt;
use std::{future::Future, sync::Arc};
use tracker::{TaskTracker, TrackedFuture, TrackedFutureExt};
@ -89,7 +89,11 @@ pub(super) fn write_chunk_to_object_store(
// Get RecordBatchStream of data from the read buffer chunk
let stream = db_chunk
.read_filter(&Default::default(), Selection::All)
.read_filter(
&Default::default(),
Selection::All,
db_chunk.delete_predicates(),
)
.expect("read filter should be infallible");
// check that the upcoming state change will very likely succeed

View File

@ -1,11 +1,17 @@
//! Functionality to load a [`Catalog`](crate::db::catalog::Catalog) and other information from a
//! [`PreservedCatalog`](parquet_file::catalog::api::PreservedCatalog).
//! [`PreservedCatalog`](parquet_file::catalog::core::PreservedCatalog).
use super::catalog::{chunk::ChunkStage, table::TableSchemaUpsertHandle, Catalog};
use iox_object_store::{IoxObjectStore, ParquetFilePath};
use observability_deps::tracing::{error, info};
use parquet_file::{
catalog::api::{CatalogParquetInfo, CatalogState, ChunkCreationFailed, PreservedCatalog},
catalog::{
core::PreservedCatalog,
interface::{
CatalogParquetInfo, CatalogState, CatalogStateAddError, CatalogStateRemoveError,
ChunkCreationFailed,
},
},
chunk::{ChunkMetrics as ParquetChunkMetrics, ParquetChunk},
};
use persistence_windows::checkpoint::{ReplayPlan, ReplayPlanner};
@ -22,17 +28,17 @@ pub enum Error {
#[snafu(display("Cannot create new empty preserved catalog: {}", source))]
CannotCreateCatalog {
source: parquet_file::catalog::api::Error,
source: parquet_file::catalog::core::Error,
},
#[snafu(display("Cannot load preserved catalog: {}", source))]
CannotLoadCatalog {
source: parquet_file::catalog::api::Error,
source: parquet_file::catalog::core::Error,
},
#[snafu(display("Cannot wipe preserved catalog: {}", source))]
CannotWipeCatalog {
source: parquet_file::catalog::api::Error,
source: parquet_file::catalog::core::Error,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -166,8 +172,10 @@ impl CatalogState for Loader {
&mut self,
iox_object_store: Arc<IoxObjectStore>,
info: CatalogParquetInfo,
) -> parquet_file::catalog::api::Result<()> {
use parquet_file::catalog::api::{MetadataExtractFailed, ReplayPlanError, SchemaError};
) -> Result<(), CatalogStateAddError> {
use parquet_file::catalog::interface::{
MetadataExtractFailed, ReplayPlanError, SchemaError,
};
// extract relevant bits from parquet file metadata
let iox_md = info
@ -212,15 +220,13 @@ impl CatalogState for Loader {
.get_or_create_partition(&iox_md.table_name, &iox_md.partition_key);
let mut partition = partition.write();
if partition.chunk(iox_md.chunk_id).is_some() {
return Err(
parquet_file::catalog::api::Error::ParquetFileAlreadyExists { path: info.path },
);
return Err(CatalogStateAddError::ParquetFileAlreadyExists { path: info.path });
}
let schema_handle = TableSchemaUpsertHandle::new(&table_schema, &parquet_chunk.schema())
.map_err(|e| Box::new(e) as _)
.context(SchemaError { path: info.path })?;
let delete_predicates: Arc<Vec<Predicate>> = Arc::new(vec![]); // NGA todo: After Marco save delete predicate into the catalog, it will need to extract into this variable
let delete_predicates: Vec<Arc<Predicate>> = vec![]; // NGA todo: After Marco saves the delete predicates into the catalog, they will need to be extracted into this variable
partition.insert_object_store_only_chunk(
iox_md.chunk_id,
parquet_chunk,
@ -234,7 +240,7 @@ impl CatalogState for Loader {
Ok(())
}
fn remove(&mut self, path: &ParquetFilePath) -> parquet_file::catalog::api::Result<()> {
fn remove(&mut self, path: &ParquetFilePath) -> Result<(), CatalogStateRemoveError> {
let mut removed_any = false;
for partition in self.catalog.partitions() {
@ -261,7 +267,7 @@ impl CatalogState for Loader {
if removed_any {
Ok(())
} else {
Err(parquet_file::catalog::api::Error::ParquetFileDoesNotExist { path: path.clone() })
Err(CatalogStateRemoveError::ParquetFileDoesNotExist { path: path.clone() })
}
}
}
@ -273,7 +279,7 @@ mod tests {
use data_types::{server_id::ServerId, DatabaseName};
use object_store::ObjectStore;
use parquet_file::catalog::{
api::CheckpointData,
interface::CheckpointData,
test_helpers::{assert_catalog_state_implementation, TestCatalogState},
};
use std::convert::TryFrom;
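The old parquet_file::catalog::api module is split here into catalog::core (the PreservedCatalog itself) and catalog::interface, and its single Error enum becomes per-operation CatalogStateAddError / CatalogStateRemoveError types. A hypothetical, much-simplified sketch of that error-splitting pattern — none of the items below are the real parquet_file types — showing why it helps: remove can no longer appear to fail with add-only errors.

use std::path::PathBuf;

#[derive(Debug)]
enum CatalogStateAddError {
    ParquetFileAlreadyExists { path: PathBuf },
}

#[derive(Debug)]
enum CatalogStateRemoveError {
    ParquetFileDoesNotExist { path: PathBuf },
}

trait CatalogState {
    fn add(&mut self, path: PathBuf) -> Result<(), CatalogStateAddError>;
    fn remove(&mut self, path: &PathBuf) -> Result<(), CatalogStateRemoveError>;
}

struct Paths(Vec<PathBuf>);

impl CatalogState for Paths {
    fn add(&mut self, path: PathBuf) -> Result<(), CatalogStateAddError> {
        if self.0.contains(&path) {
            return Err(CatalogStateAddError::ParquetFileAlreadyExists { path });
        }
        self.0.push(path);
        Ok(())
    }

    fn remove(&mut self, path: &PathBuf) -> Result<(), CatalogStateRemoveError> {
        let before = self.0.len();
        self.0.retain(|p| p != path);
        if self.0.len() == before {
            return Err(CatalogStateRemoveError::ParquetFileDoesNotExist { path: path.clone() });
        }
        Ok(())
    }
}

fn main() {
    let mut state = Paths(vec![]);
    state.add(PathBuf::from("db/table/0.parquet")).unwrap();
    assert!(state.remove(&PathBuf::from("missing.parquet")).is_err());
}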

View File

@ -1195,7 +1195,7 @@ mod tests {
path::{parsed::DirsAndFileName, ObjectStorePath},
ObjectStore, ObjectStoreApi,
};
use parquet_file::catalog::{api::PreservedCatalog, test_helpers::TestCatalogState};
use parquet_file::catalog::{core::PreservedCatalog, test_helpers::TestCatalogState};
use query::{exec::ExecutionContextProvider, frontend::sql::SqlQueryPlanner, QueryDatabase};
use std::{
convert::{TryFrom, TryInto},

View File

@ -1,6 +1,5 @@
//! This module implements the `partition` CLI command
use data_types::chunk_metadata::ChunkSummary;
use data_types::job::Operation;
use generated_types::google::FieldViolation;
use influxdb_iox_client::{
connection::Connection,
@ -10,7 +9,7 @@ use influxdb_iox_client::{
PersistPartitionError, UnloadPartitionChunkError,
},
};
use std::convert::{TryFrom, TryInto};
use std::convert::TryFrom;
use structopt::StructOpt;
use thiserror::Error;
@ -283,10 +282,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
chunk_id,
} = close_chunk;
let operation: Operation = client
let operation = client
.close_partition_chunk(db_name, table_name, partition_key, chunk_id)
.await?
.try_into()?;
.await?;
serde_json::to_writer_pretty(std::io::stdout(), &operation)?;
}

View File

@ -1,6 +1,3 @@
use std::convert::TryInto;
use data_types::job::Operation;
use generated_types::google::FieldViolation;
use influxdb_iox_client::{connection::Connection, management};
use snafu::{ResultExt, Snafu};
@ -74,12 +71,10 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
return Err(Error::NeedsTheForceError);
}
let operation: Operation = client
let operation = client
.wipe_persisted_catalog(db_name)
.await
.context(WipeError)?
.try_into()
.context(InvalidResponse)?;
.context(WipeError)?;
serde_json::to_writer_pretty(std::io::stdout(), &operation).context(WritingJson)?;
}

View File

@ -1,11 +1,8 @@
use data_types::job::Operation;
use generated_types::google::FieldViolation;
use influxdb_iox_client::{
connection::Connection,
management,
operations::{self, Client},
};
use std::convert::TryInto;
use structopt::StructOpt;
use thiserror::Error;
@ -15,9 +12,6 @@ pub enum Error {
#[error("Client error: {0}")]
ClientError(#[from] operations::Error),
#[error("Received invalid response: {0}")]
InvalidResponse(#[from] FieldViolation),
#[error("Failed to create dummy job: {0}")]
CreateDummyJobError(#[from] management::CreateDummyJobError),
@ -68,29 +62,16 @@ enum Command {
pub async fn command(connection: Connection, config: Config) -> Result<()> {
match config.command {
Command::List => {
let result: Result<Vec<Operation>, _> = Client::new(connection)
.list_operations()
.await?
.into_iter()
.map(|c| c.operation())
.map(TryInto::try_into)
.collect();
let operations = result?;
let operations = Client::new(connection).list_operations().await?;
serde_json::to_writer_pretty(std::io::stdout(), &operations)?;
}
Command::Get { id } => {
let operation: Operation = Client::new(connection)
.get_operation(id)
.await?
.try_into()?;
let operation = Client::new(connection).get_operation(id).await?;
serde_json::to_writer_pretty(std::io::stdout(), &operation)?;
}
Command::Wait { id, nanos } => {
let timeout = nanos.map(std::time::Duration::from_nanos);
let operation: Operation = Client::new(connection)
.wait_operation(id, timeout)
.await?
.try_into()?;
let operation = Client::new(connection).wait_operation(id, timeout).await?;
serde_json::to_writer_pretty(std::io::stdout(), &operation)?;
}
Command::Cancel { id } => {
@ -98,10 +79,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
println!("Ok");
}
Command::Test { nanos } => {
let operation: Operation = management::Client::new(connection)
let operation = management::Client::new(connection)
.create_dummy_job(nanos)
.await?
.try_into()?;
.await?;
serde_json::to_writer_pretty(std::io::stdout(), &operation)?;
}
}

View File

@ -753,9 +753,11 @@ mod tests {
child(prepare_sql_span, "prepare_plan").unwrap();
let collect_span = child(ctx_span, "collect").unwrap();
let execute_span = child(collect_span, "execute_stream_partitioned").unwrap();
let coalesce_span = child(execute_span, "CoalescePartitionsEx").unwrap();
// validate spans from DataFusion ExecutionPlan are present
child(collect_span, "ProjectionExec: expr").unwrap();
child(coalesce_span, "ProjectionExec: expr").unwrap();
let database_not_found = root_spans[3];
assert_eq!(database_not_found.status, SpanStatus::Err);

View File

@ -28,7 +28,7 @@ use data_types::{
};
use influxdb_iox_client::format::QueryOutputFormat;
use influxdb_line_protocol::parse_lines;
use query::{exec::ExecutionContextProvider, QueryDatabase};
use query::exec::ExecutionContextProvider;
use server::{ApplicationState, ConnectionManager, Error, Server as AppServer};
// External crates
@ -392,7 +392,6 @@ where
.get("/health", health::<M>)
.get("/metrics", handle_metrics::<M>)
.get("/iox/api/v1/databases/:name/query", query::<M>)
.get("/api/v1/partitions", list_partitions::<M>)
.get("/debug/pprof", pprof_home::<M>)
.get("/debug/pprof/profile", pprof_profile::<M>)
.get("/debug/pprof/allocs", pprof_heappy_profile::<M>)
@ -644,43 +643,6 @@ async fn handle_metrics<M: ConnectionManager + Send + Sync + Debug + 'static>(
Ok(Response::new(Body::from(body)))
}
#[derive(Deserialize, Debug)]
/// Arguments in the query string of the request to /partitions
struct DatabaseInfo {
org: String,
bucket: String,
}
#[tracing::instrument(level = "debug")]
async fn list_partitions<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
let query = req.uri().query().context(ExpectedQueryString {})?;
let info: DatabaseInfo = serde_urlencoded::from_str(query).context(InvalidQueryString {
query_string: query,
})?;
let db_name =
org_and_bucket_to_database(&info.org, &info.bucket).context(BucketMappingError)?;
let db = server.db(&db_name)?;
let partition_keys =
db.partition_keys()
.map_err(|e| Box::new(e) as _)
.context(BucketByName {
org: &info.org,
bucket_name: &info.bucket,
})?;
let result = serde_json::to_string(&partition_keys).context(JsonGenerationError)?;
Ok(Response::new(Body::from(result)))
}
#[derive(Deserialize, Debug)]
/// Arguments in the query string of the request to /snapshot
struct SnapshotInfo {

View File

@ -631,7 +631,7 @@ where
del_predicate.exprs.push(expr);
}
db.delete(&table_name, &del_predicate)
db.delete(&table_name, Arc::new(del_predicate))
.await
.map_err(default_db_error_handler)?;
}

View File

@ -9,7 +9,6 @@ use generated_types::{
};
use influxdb_iox_client::{
management::{Client, CreateDatabaseError},
operations,
write::WriteError,
};
@ -880,20 +879,16 @@ async fn test_close_partition_chunk() {
assert_eq!(chunks[0].storage, ChunkStorage::OpenMutableBuffer as i32);
// Move the chunk to read buffer
let operation = management_client
let iox_operation = management_client
.close_partition_chunk(&db_name, table_name, partition_key, 0)
.await
.expect("new partition chunk");
println!("Operation response is {:?}", operation);
let operation_id = operation.id();
let meta = operations::ClientOperation::try_new(operation)
.unwrap()
.metadata();
println!("Operation response is {:?}", iox_operation);
let operation_id = iox_operation.operation.id();
// ensure we got a legit job description back
if let Some(Job::CloseChunk(close_chunk)) = meta.job {
if let Some(Job::CloseChunk(close_chunk)) = iox_operation.metadata.job {
assert_eq!(close_chunk.db_name, db_name);
assert_eq!(close_chunk.partition_key, partition_key);
assert_eq!(close_chunk.chunk_id, 0);
@ -1020,20 +1015,16 @@ async fn test_wipe_preserved_catalog() {
// Recover by wiping preserved catalog
//
let operation = management_client
let iox_operation = management_client
.wipe_persisted_catalog(&db_name)
.await
.expect("wipe persisted catalog");
println!("Operation response is {:?}", operation);
let operation_id = operation.id();
let meta = operations::ClientOperation::try_new(operation)
.unwrap()
.metadata();
println!("Operation response is {:?}", iox_operation);
let operation_id = iox_operation.operation.id();
// ensure we got a legit job description back
if let Some(Job::WipePreservedCatalog(wipe_persisted_catalog)) = meta.job {
if let Some(Job::WipePreservedCatalog(wipe_persisted_catalog)) = iox_operation.metadata.job {
assert_eq!(wipe_persisted_catalog.db_name, db_name);
} else {
panic!("unexpected job returned")

View File

@ -1,13 +1,12 @@
use std::sync::Arc;
use std::time::Duration;
use assert_cmd::Command;
use predicates::prelude::*;
use data_types::chunk_metadata::ChunkAddr;
use data_types::{
chunk_metadata::ChunkStorage,
job::{Job, Operation},
use data_types::chunk_metadata::ChunkStorage;
use generated_types::google::longrunning::IoxOperation;
use generated_types::influxdata::iox::management::v1::{
operation_metadata::Job, CloseChunk, WipePreservedCatalog,
};
use test_helpers::make_temp_file;
use write_buffer::maybe_skip_kafka_integration;
@ -112,7 +111,7 @@ async fn test_create_database() {
.and(predicate::str::contains(format!(r#""name": "{}"#, db)))
// validate the defaults have been set reasonably
.and(predicate::str::contains("%Y-%m-%d %H:00:00"))
.and(predicate::str::contains(r#""bufferSizeHard": 104857600"#))
.and(predicate::str::contains(r#""bufferSizeHard": "104857600""#))
.and(predicate::str::contains("lifecycleRules")),
);
}
@ -147,7 +146,7 @@ async fn test_create_database_size() {
.assert()
.success()
.stdout(
predicate::str::contains(r#""bufferSizeHard": 1000"#)
predicate::str::contains(r#""bufferSizeHard": "1000""#)
.and(predicate::str::contains("lifecycleRules")),
);
}
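The expected bufferSizeHard value changes from a JSON number to a JSON string because pbjson follows the proto3 JSON mapping, which encodes 64-bit integer fields as decimal strings. A tiny sketch of the new shape, assuming serde_json is available:

fn main() {
    let v: serde_json::Value =
        serde_json::from_str(r#"{"bufferSizeHard": "104857600"}"#).unwrap();
    // 64-bit integers arrive as strings under the proto3 JSON mapping
    assert_eq!(v["bufferSizeHard"], "104857600");
    assert!(!v["bufferSizeHard"].is_number());
}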
@ -765,7 +764,7 @@ async fn test_close_partition_chunk() {
let lp_data = vec!["cpu,region=west user=23.2 100"];
load_lp(addr, &db_name, lp_data);
let stdout: Operation = serde_json::from_slice(
let stdout: IoxOperation = serde_json::from_slice(
&Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("database")
@ -784,18 +783,16 @@ async fn test_close_partition_chunk() {
)
.expect("Expected JSON output");
let expected_job = Job::CompactChunk {
chunk: ChunkAddr {
db_name: Arc::from(db_name.as_str()),
table_name: Arc::from("cpu"),
partition_key: Arc::from("cpu"),
chunk_id: 0,
},
};
let expected_job = Job::CloseChunk(CloseChunk {
db_name,
table_name: "cpu".to_string(),
partition_key: "cpu".to_string(),
chunk_id: 0,
});
assert_eq!(
Some(expected_job),
stdout.job,
stdout.metadata.job,
"operation was {:#?}",
stdout
);
@ -828,7 +825,7 @@ async fn test_wipe_persisted_catalog() {
let server_fixture = fixture_broken_catalog(&db_name).await;
let addr = server_fixture.grpc_base();
let stdout: Operation = serde_json::from_slice(
let stdout: IoxOperation = serde_json::from_slice(
&Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("database")
@ -845,13 +842,11 @@ async fn test_wipe_persisted_catalog() {
)
.expect("Expected JSON output");
let expected_job = Job::WipePreservedCatalog {
db_name: Arc::from(db_name.as_str()),
};
let expected_job = Job::WipePreservedCatalog(WipePreservedCatalog { db_name });
assert_eq!(
Some(expected_job),
stdout.job,
stdout.metadata.job,
"operation was {:#?}",
stdout
);

View File

@ -17,7 +17,7 @@ async fn test_operations() {
let nanos = vec![Duration::from_secs(20).as_nanos() as _, 1];
let operation = management_client
let iox_operation = management_client
.create_dummy_job(nanos.clone())
.await
.expect("create dummy job failed");
@ -28,20 +28,15 @@ async fn test_operations() {
.expect("list operations failed");
assert_eq!(running_ops.len(), 1);
assert_eq!(running_ops[0].name(), operation.name);
assert_eq!(running_ops[0].operation.name, iox_operation.operation.name);
let id = operation.name.parse().expect("not an integer");
let id = iox_operation.operation.id();
let iox_operation = operations_client.get_operation(id).await.unwrap();
let meta = operations_client
.client_operation(id)
.await
.unwrap()
.metadata();
let job = iox_operation.metadata.job.expect("expected a job");
let job = meta.job.expect("expected a job");
assert_eq!(meta.total_count, 2);
assert_eq!(meta.pending_count, 1);
assert_eq!(iox_operation.metadata.total_count, 2);
assert_eq!(iox_operation.metadata.pending_count, 1);
assert_eq!(
job,
operation_metadata::Job::Dummy(Dummy {
@ -51,14 +46,14 @@ async fn test_operations() {
);
// Check wait times out correctly
let fetched = operations_client
let iox_operation = operations_client
.wait_operation(id, Some(Duration::from_micros(10)))
.await
.expect("failed to wait operation");
assert!(!fetched.done);
assert!(!iox_operation.operation.done);
// Shouldn't specify wall_nanos as not complete
assert_eq!(meta.wall_nanos, 0);
assert_eq!(iox_operation.metadata.wall_nanos, 0);
let wait = tokio::spawn(async move {
let mut operations_client = server_fixture.operations_client();
@ -74,18 +69,15 @@ async fn test_operations() {
.expect("failed to cancel operation");
let waited = wait.await.unwrap();
let meta = operations::ClientOperation::try_new(waited.clone())
.unwrap()
.metadata();
assert!(waited.done);
assert!(meta.wall_nanos > 0);
assert!(meta.cpu_nanos > 0);
assert_eq!(meta.pending_count, 0);
assert_eq!(meta.total_count, 2);
assert_eq!(meta.cancelled_count, 1);
assert!(waited.operation.done);
assert!(waited.metadata.wall_nanos > 0);
assert!(waited.metadata.cpu_nanos > 0);
assert_eq!(waited.metadata.pending_count, 0);
assert_eq!(waited.metadata.total_count, 2);
assert_eq!(waited.metadata.cancelled_count, 1);
match waited.result {
match waited.operation.result {
Some(operations::generated_types::operation::Result::Error(status)) => {
assert_eq!(status.code, tonic::Code::Cancelled as i32)
}

View File

@ -1,6 +1,7 @@
use crate::common::server_fixture::ServerFixture;
use assert_cmd::Command;
use data_types::job::{Job, Operation, OperationStatus};
use generated_types::google::longrunning::IoxOperation;
use generated_types::influxdata::iox::management::v1::{operation_metadata::Job, Dummy};
use predicates::prelude::*;
#[tokio::test]
@ -9,7 +10,7 @@ async fn test_start_stop() {
let addr = server_fixture.grpc_base();
let duration = std::time::Duration::from_secs(10).as_nanos() as u64;
let stdout: Operation = serde_json::from_slice(
let stdout: IoxOperation = serde_json::from_slice(
&Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("operation")
@ -24,13 +25,13 @@ async fn test_start_stop() {
)
.expect("expected JSON output");
assert_eq!(stdout.total_count, 1);
match stdout.job {
Some(Job::Dummy { nanos, .. }) => assert_eq!(nanos, vec![duration]),
_ => panic!("expected dummy job got {:?}", stdout.job),
assert_eq!(stdout.metadata.total_count, 1);
match stdout.metadata.job {
Some(Job::Dummy(Dummy { nanos, .. })) => assert_eq!(nanos, vec![duration]),
_ => panic!("expected dummy job got {:?}", stdout.metadata.job),
}
let operations: Vec<Operation> = serde_json::from_slice(
let operations: Vec<IoxOperation> = serde_json::from_slice(
&Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("operation")
@ -45,33 +46,33 @@ async fn test_start_stop() {
.expect("expected JSON output");
assert_eq!(operations.len(), 1);
match &operations[0].job {
Some(Job::Dummy { nanos, .. }) => {
match &operations[0].metadata.job {
Some(Job::Dummy(Dummy { nanos, .. })) => {
assert_eq!(nanos.len(), 1);
assert_eq!(nanos[0], duration);
}
_ => panic!("expected dummy job got {:?}", &operations[0].job),
_ => panic!("expected dummy job got {:?}", &operations[0].metadata.job),
}
let id = operations[0].id;
let name = &operations[0].operation.name;
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("operation")
.arg("cancel")
.arg(id.to_string())
.arg(name.clone())
.arg("--host")
.arg(addr)
.assert()
.success()
.stdout(predicate::str::contains("Ok"));
let completed: Operation = serde_json::from_slice(
let completed: IoxOperation = serde_json::from_slice(
&Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("operation")
.arg("wait")
.arg(id.to_string())
.arg(name.to_string())
.arg("--host")
.arg(addr)
.assert()
@ -81,9 +82,8 @@ async fn test_start_stop() {
)
.expect("expected JSON output");
assert_eq!(completed.pending_count, 0);
assert_eq!(completed.total_count, 1);
assert_eq!(completed.cancelled_count, 1);
assert_eq!(completed.status, OperationStatus::Cancelled);
assert_eq!(&completed.job, &operations[0].job)
assert_eq!(completed.metadata.pending_count, 0);
assert_eq!(completed.metadata.total_count, 1);
assert_eq!(completed.metadata.cancelled_count, 1);
assert_eq!(&completed.metadata.job, &operations[0].metadata.job)
}

View File

@ -2,7 +2,6 @@ use itertools::Itertools;
use arrow_util::assert_batches_eq;
use data_types::chunk_metadata::ChunkStorage;
use influxdb_iox_client::operations;
use crate::{
common::server_fixture::ServerFixture,
@ -125,11 +124,11 @@ async fn test_full_lifecycle() {
.await
.unwrap()
.iter()
.any(|operation| match operation.metadata().job {
.any(|operation| match &operation.metadata.job {
Some(Job::CompactChunks(CompactChunks {
db_name: operation_db_name,
..
})) => operation_db_name == db_name,
})) => operation_db_name == &db_name,
_ => false,
});
assert!(performed_compaction);
@ -269,20 +268,16 @@ async fn create_readbuffer_chunk(fixture: &ServerFixture, db_name: &str) -> u32
assert_eq!(chunks[0].storage, ChunkStorage::OpenMutableBuffer);
// Move the chunk to read buffer
let operation = management_client
let iox_operation = management_client
.close_partition_chunk(db_name, table_name, partition_key, 0)
.await
.expect("new partition chunk");
println!("Operation response is {:?}", operation);
let operation_id = operation.id();
let meta = operations::ClientOperation::try_new(operation)
.unwrap()
.metadata();
println!("Operation response is {:?}", iox_operation);
let operation_id = iox_operation.operation.id();
// ensure we got a legit job description back
if let Some(Job::CloseChunk(close_chunk)) = meta.job {
if let Some(Job::CloseChunk(close_chunk)) = iox_operation.metadata.job {
assert_eq!(close_chunk.db_name, db_name);
assert_eq!(close_chunk.partition_key, partition_key);
assert_eq!(close_chunk.chunk_id, 0);

View File

@ -556,12 +556,8 @@ where
}
if t_start.elapsed() >= wait_time {
let operations = fixture.operations_client().list_operations().await.unwrap();
let mut operations: Vec<_> = operations
.into_iter()
.map(|x| (x.name().parse::<usize>().unwrap(), x.metadata()))
.collect();
operations.sort_by_key(|x| x.0);
let mut operations = fixture.operations_client().list_operations().await.unwrap();
operations.sort_by(|a, b| a.operation.name.cmp(&b.operation.name));
panic!(
"Could not find {} within {:?}.\nChunks were: {:#?}\nOperations were: {:#?}",

View File

@ -306,17 +306,20 @@ async fn test_sql_observer_operations() {
let partition_key = "cpu";
let table_name = "cpu";
// Move the chunk to read buffer
let operation = management_client
let iox_operation = management_client
.close_partition_chunk(&db_name, table_name, partition_key, 0)
.await
.expect("new partition chunk");
println!("Operation response is {:?}", operation);
println!("Operation response is {:?}", iox_operation);
// wait for the job to be done
fixture
.operations_client()
.wait_operation(operation.id(), Some(std::time::Duration::from_secs(1)))
.wait_operation(
iox_operation.operation.id(),
Some(std::time::Duration::from_secs(1)),
)
.await
.expect("failed to wait operation");

View File

@ -27,12 +27,12 @@ async fn test_operations() {
.expect("write succeded");
// Move the chunk to read buffer
let operation = management_client
let iox_operation = management_client
.close_partition_chunk(&db_name1, table_name, partition_key, 0)
.await
.expect("new partition chunk");
let operation_id = operation.id();
let operation_id = iox_operation.operation.id();
operations_client
.wait_operation(operation_id, Some(std::time::Duration::from_secs(1)))
.await

View File

@ -55,6 +55,22 @@ pub struct SpanContext {
}
impl SpanContext {
/// Create a new root span context, sent to `collector`. The
/// new span context has a random trace_id and span_id, and thus
/// is not connected to any existing span or trace.
pub fn new(collector: Arc<dyn TraceCollector>) -> Self {
let mut rng = rand::thread_rng();
let trace_id: u128 = rng.gen_range(1..u128::MAX);
let span_id: u64 = rng.gen_range(1..u64::MAX);
Self {
trace_id: TraceId(NonZeroU128::new(trace_id).unwrap()),
parent_span_id: None,
span_id: SpanId(NonZeroU64::new(span_id).unwrap()),
collector: Some(collector),
}
}
/// Creates a new child of the Span described by this TraceContext
pub fn child(&self, name: impl Into<Cow<'static, str>>) -> Span {
Span {
@ -73,3 +89,22 @@ impl SpanContext {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::RingBufferTraceCollector;
#[test]
fn test_new() {
// two newly created spans should not have duplicated trace or span ids
let collector = Arc::new(RingBufferTraceCollector::new(5)) as _;
let ctx1 = SpanContext::new(Arc::clone(&collector));
let ctx2 = SpanContext::new(collector);
assert_ne!(ctx1.trace_id, ctx2.trace_id);
assert_ne!(ctx1.span_id, ctx2.span_id);
}
}
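A minimal usage sketch of the new constructor, mirroring the test above; it assumes the trace crate's RingBufferTraceCollector, SpanContext, and TraceCollector exactly as they appear in this diff, so it would live alongside them rather than stand alone:

use std::sync::Arc;

fn demo(collector: Arc<dyn TraceCollector>) {
    // Root context with a freshly generated trace_id / span_id.
    let root = SpanContext::new(collector);
    // Child span derived from that context.
    let _span = root.child("load catalog");
}

// e.g. demo(Arc::new(RingBufferTraceCollector::new(5)));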

View File

@ -201,29 +201,14 @@ impl<'a> Drop for SpanRecorder {
#[cfg(test)]
mod tests {
use std::num::{NonZeroU128, NonZeroU64};
use std::sync::Arc;
use crate::ctx::{SpanId, TraceId};
use crate::{RingBufferTraceCollector, TraceCollector};
use super::*;
fn make_span(collector: Arc<dyn TraceCollector>) -> Span {
Span {
name: "foo".into(),
ctx: SpanContext {
trace_id: TraceId(NonZeroU128::new(23948923).unwrap()),
parent_span_id: None,
span_id: SpanId(NonZeroU64::new(3498394).unwrap()),
collector: Some(collector),
},
start: None,
end: None,
status: SpanStatus::Unknown,
metadata: Default::default(),
events: vec![],
}
SpanContext::new(collector).child("foo")
}
#[test]