Merge branch 'main' into cn/remove-obsolete-docs-infra

pull/24376/head
kodiakhq[bot] 2023-04-14 17:14:45 +00:00 committed by GitHub
commit bc3b69ef3f
179 changed files with 5466 additions and 7991 deletions

Cargo.lock (generated)

File diff suppressed because it is too large.


@ -81,7 +81,6 @@ members = [
"trogging",
"wal",
"workspace-hack",
"write_summary",
]
default-members = ["influxdb_iox"]
@ -115,12 +114,18 @@ edition = "2021"
license = "MIT OR Apache-2.0"
[workspace.dependencies]
arrow = { version = "36.0.0" }
arrow-flight = { version = "36.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="b87871fdd1f4ce64201eb1f7c79a0547627f37e9", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="b87871fdd1f4ce64201eb1f7c79a0547627f37e9" }
arrow = { version = "37.0.0" }
arrow-flight = { version = "37.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="6e819d6c2b9280198c67fa16df3e54c79ce46ca2", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="6e819d6c2b9280198c67fa16df3e54c79ce46ca2" }
hashbrown = { version = "0.13.2" }
parquet = { version = "36.0.0" }
parquet = { version = "37.0.0" }
tonic = { version = "0.9.1", features = ["tls", "tls-webpki-roots"] }
tonic-build = { version = "0.9.1" }
tonic-health = { version = "0.9.1" }
tonic-reflection = { version = "0.9.1" }
# This profile optimizes for runtime performance and small binary size at the expense of longer
# build times. It's most suitable for final release builds.


@ -36,20 +36,17 @@ RUN \
du -cshx /usr/local/rustup /usr/local/cargo/registry /usr/local/cargo/git /influxdb_iox/target
FROM debian:bullseye-slim
RUN apt update \
&& apt install --yes ca-certificates gettext-base libssl1.1 --no-install-recommends \
&& rm -rf /var/lib/{apt,dpkg,cache,log}
RUN groupadd --gid 1500 iox \
&& rm -rf /var/lib/{apt,dpkg,cache,log} \
&& groupadd --gid 1500 iox \
&& useradd --uid 1500 --gid iox --shell /bin/bash --create-home iox
USER iox
RUN mkdir ~/.influxdb_iox
RUN ls -la ~/.influxdb_iox
ARG PACKAGE=influxdb_iox
ENV PACKAGE=$PACKAGE
@ -57,7 +54,6 @@ ENV PACKAGE=$PACKAGE
COPY --from=build "/root/$PACKAGE" "/usr/bin/$PACKAGE"
COPY docker/entrypoint.sh /usr/bin/entrypoint.sh
EXPOSE 8080 8082
ENTRYPOINT ["/usr/bin/entrypoint.sh"]


@ -153,7 +153,7 @@ impl StringDictionary<i32> {
))
.len(keys.len())
.add_buffer(keys.collect())
.add_child_data(self.storage.to_arrow(dictionary_nulls).data().clone())
.add_child_data(self.storage.to_arrow(dictionary_nulls).into_data())
.nulls(nulls)
// TODO consider skipping the validation checks by using
// `build_unchecked()`
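
The `.data().clone()` to `.into_data()` change above follows the newer arrow API used after the 36 to 37 bump in this commit, where arrays expose `to_data()` (borrowing) and `into_data()` (consuming) instead of cloning through the old `data()` accessor. A minimal, illustrative sketch of the two calls (not part of the diff):

```rust
use arrow::array::{Array, ArrayData, StringArray};

fn main() {
    let array = StringArray::from(vec![Some("a"), None, Some("b")]);

    // Borrowing conversion: `array` stays usable afterwards.
    let borrowed: ArrayData = array.to_data();
    assert_eq!(borrowed.len(), 3);

    // Consuming conversion: takes ownership, no extra clone of the buffers.
    let owned: ArrayData = array.into_data();
    assert_eq!(owned.len(), 3);
}
```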


@ -1,22 +1,24 @@
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
/// Prepare an arrow Schema for transport over the Arrow Flight protocol
///
/// Converts dictionary types to underlying types due to <https://github.com/apache/arrow-rs/issues/3389>
pub fn prepare_schema_for_flight(schema: SchemaRef) -> SchemaRef {
let fields = schema
let fields: Fields = schema
.fields()
.iter()
.map(|field| match field.data_type() {
DataType::Dictionary(_, value_type) => Field::new(
field.name(),
value_type.as_ref().clone(),
field.is_nullable(),
)
.with_metadata(field.metadata().clone()),
_ => field.clone(),
DataType::Dictionary(_, value_type) => Arc::new(
Field::new(
field.name(),
value_type.as_ref().clone(),
field.is_nullable(),
)
.with_metadata(field.metadata().clone()),
),
_ => Arc::clone(field),
})
.collect();
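
A hypothetical usage sketch of the function above (assuming `prepare_schema_for_flight` is in scope; not part of the diff): a dictionary-encoded field is flattened to its value type before the schema is sent over Flight, while other fields pass through untouched.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new(
            "tag",
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            true,
        ),
        Field::new("value", DataType::Float64, true),
    ]));

    // Dictionary fields come back as their value type; others are unchanged.
    let flight_schema = prepare_schema_for_flight(schema);
    assert_eq!(flight_schema.field(0).data_type(), &DataType::Utf8);
    assert_eq!(flight_schema.field(1).data_type(), &DataType::Float64);
}
```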


@ -288,9 +288,9 @@ mod tests {
Box::new(DataType::Utf8),
))
.len(keys.len())
.add_buffer(keys.data().buffers()[0].clone())
.add_buffer(keys.to_data().buffers()[0].clone())
.nulls(keys.nulls().cloned())
.add_child_data(values.data().clone())
.add_child_data(values.into_data())
.build()
.unwrap();


@ -193,7 +193,7 @@ pub fn equalize_batch_schemas(batches: Vec<RecordBatch>) -> Result<Vec<RecordBat
/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
///
/// matches `1d325760-2b20-48de-ab48-2267b034133d`
static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
pub static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
});
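
As a small, self-contained illustration of how a UUID regex like `REGEX_UUID` above can normalize object-store paths in snapshot output (the placeholder text is assumed, mirroring the example path in the doc comment):

```rust
use once_cell::sync::Lazy;
use regex::Regex;

static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
    Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
});

fn main() {
    let path = "32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet";
    // Replace the volatile UUID with a stable placeholder so snapshots compare equal.
    let normalized = REGEX_UUID.replace_all(path, "<uuid>");
    assert_eq!(normalized, "32/51/216/13452/<uuid>.parquet");
}
```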
@ -249,6 +249,11 @@ fn normalize_for_variable_width(s: Cow<'_, str>) -> String {
REGEX_COL.replace_all(&s, " |").to_string()
}
pub fn strip_table_lines(s: Cow<'_, str>) -> String {
let s = REGEX_LINESEP.replace_all(&s, "----------");
REGEX_COL.replace_all(&s, "").to_string()
}
fn normalize_time_ops(s: &str) -> String {
REGEX_TIME_OP
.replace_all(s, |c: &Captures<'_>| {
@ -276,6 +281,9 @@ pub struct Normalizer {
/// if true, normalize filter predicates for explain plans
/// `FilterExec: <REDACTED>`
pub normalized_filters: bool,
/// if `true`, render tables without borders.
pub no_table_borders: bool,
}
impl Normalizer {
@ -403,5 +411,8 @@ impl Normalizer {
if self.normalized_filters {
output.push("-- Results After Normalizing Filters".into())
}
if self.no_table_borders {
output.push("-- Results After No Table Borders".into())
}
}
}


@ -16,5 +16,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
# crates.io dependencies in alphabetical order.
async-trait = "0.1"
snafu = "0.7"
tonic = "0.8"
tonic = { workspace = true }


@ -18,7 +18,7 @@ metric = { path = "../metric" }
object_store = "0.5.6"
observability_deps = { path = "../observability_deps" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tempfile = "3.5.0"
trace = { path = "../trace" }


@ -1,37 +0,0 @@
//! CLI config for the ingest_replica
use crate::ingester_address::IngesterAddress;
/// CLI config for the ingest_replica
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct IngestReplicaConfig {
/// gRPC address for the replica to talk with the ingesters. For
/// example:
///
/// "http://127.0.0.1:8083"
///
/// or
///
/// "http://10.10.10.1:8083,http://10.10.10.2:8083"
///
/// for multiple addresses.
#[clap(
long = "ingester-addresses",
env = "INFLUXDB_IOX_INGESTER_ADDRESSES",
required = true,
num_args=1..,
value_delimiter = ','
)]
pub ingester_addresses: Vec<IngesterAddress>,
/// Sets how many queries the replica will handle simultaneously before
/// rejecting further incoming requests.
#[clap(
long = "concurrent-query-limit",
env = "INFLUXDB_IOX_CONCURRENT_QUERY_LIMIT",
default_value = "200",
action
)]
pub concurrent_query_limit: usize,
}


@ -16,7 +16,6 @@ pub mod authz;
pub mod catalog_dsn;
pub mod compactor2;
pub mod garbage_collector;
pub mod ingest_replica;
pub mod ingester2;
pub mod ingester_address;
pub mod object_store;


@ -1,50 +1,7 @@
//! Querier-related configs.
use crate::ingester_address::IngesterAddress;
use data_types::{IngesterMapping, ShardIndex};
use serde::Deserialize;
use snafu::{ResultExt, Snafu};
use std::{
collections::HashMap, fs, io, num::NonZeroUsize, path::PathBuf, str::FromStr, sync::Arc,
};
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
pub enum Error {
#[snafu(display("Could not read shard to ingester file `{}`: {source}", file.display()))]
ShardToIngesterFileReading { source: io::Error, file: PathBuf },
#[snafu(display("Could not deserialize JSON from ingester config: {source}"))]
ShardToIngesterDeserializing { source: serde_json::Error },
#[snafu(display(
"Specifying `\"ignoreAll\": true` requires that both the `ingesters` and \
`shards` configurations are empty. `ingesters`: `{:#?}`, `shards`: `{:#?}`",
ingesters,
shards,
))]
IgnoreAllRequiresEmptyConfig {
ingesters: HashMap<Arc<str>, Arc<IngesterConfig>>,
shards: HashMap<ShardIndex, ShardConfig>,
},
#[snafu(display(
"Ingester `{name}` must either set the `addr` to a non-empty value or set `ignore` to true"
))]
IngesterAddrRequired { name: Arc<str> },
#[snafu(display(
"Could not find ingester `{name}` specified for shard index `{shard_index}`"
))]
IngesterNotFound {
shard_index: ShardIndex,
name: Arc<str>,
},
#[snafu(context(false))]
IngesterAddress {
source: crate::ingester_address::Error,
},
}
use std::num::NonZeroUsize;
/// CLI config for querier configuration
#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)]
@ -71,144 +28,6 @@ pub struct QuerierConfig {
)]
pub exec_mem_pool_bytes: usize,
/// Path to a JSON file containing a Shard index to ingesters gRPC mapping. For example:
///
/// ```json
/// {
/// // Flag to ignore all ingesters and only query persisted data. Useful for development
/// // or creating "cold data only" clusters.
/// //
/// // If this is set to `true`, having non-empty `ingesters` or `shards` is a startup
/// // error.
/// //
/// // default: false
/// "ignoreAll": false,
///
/// // Mapping of ingester name to config.
/// //
/// // default: {}
/// "ingesters": {
/// "i1": {
/// // Ingester address as URL.
/// //
/// // If this is `null` but `ignore` is false, it is an error.
/// //
/// // default: null
/// "addr": "http://ingester-1:1234"
/// },
/// "i2": {
/// // Flag to ignore this ingester at query time and not contact it.
/// //
/// // default: false
/// "ignore": true
/// }
/// },
///
/// // Mapping of shard indexes (as strings) to ingester names. Queries to shards that do
/// // not appear in this mapping will return an error. Using an ingester name in the
/// // `shards` mapping that does not appear in the `ingesters` mapping is a startup error.
/// //
/// // default: {}
/// "shards": {
/// "1": {
/// // Name of an ingester from the `ingester` mapping.
/// //
/// // If this is `null`, queries to this shard will error.
/// //
/// // default: null
/// "ingester": "i1"
/// },
/// "2": {
/// "ingester": "i1"
/// },
/// "3": {
/// "ingester": "i2"
/// },
/// "5": {
/// // Flag to not fetch data from any ingester for queries to this shard.
/// //
/// // default: false
/// "ignore": true
/// }
/// }
/// }
/// ```
#[clap(
long = "shard-to-ingesters-file",
env = "INFLUXDB_IOX_SHARD_TO_INGESTERS_FILE",
action
)]
pub shard_to_ingesters_file: Option<PathBuf>,
/// JSON containing a Shard index to ingesters gRPC mapping. For example:
///
/// ```json
/// {
/// // Flag to ignore all ingesters and only query persisted data. Useful for development
/// // or creating "cold data only" clusters.
/// //
/// // If this is set to `true`, having non-empty `ingesters` or `shards` is a startup
/// // error.
/// //
/// // default: false
/// "ignoreAll": false,
///
/// // Mapping of ingester name to config.
/// //
/// // default: {}
/// "ingesters": {
/// "i1": {
/// // Ingester address as URL.
/// //
/// // If this is `null` but `ignore` is false, it is an error.
/// //
/// // default: null
/// "addr": "http://ingester-1:1234"
/// },
/// "i2": {
/// // Flag to ignore this ingester at query time and not contact it.
/// //
/// // default: false
/// "ignore": true
/// }
/// },
///
/// // Mapping of shard indexes (as strings) to ingester names. Queries to shards that do
/// // not appear in this mapping will return an error. Using an ingester name in the
/// // `shards` mapping that does not appear in the `ingesters` mapping is a startup error.
/// //
/// // default: {}
/// "shards": {
/// "1": {
/// // Name of an ingester from the `ingester` mapping.
/// //
/// // If this is `null`, queries to this shard will error.
/// //
/// // default: null
/// "ingester": "i1"
/// },
/// "2": {
/// "ingester": "i1"
/// },
/// "3": {
/// "ingester": "i2"
/// },
/// "5": {
/// // Flag to not fetch data from any ingester for queries to this shard.
/// //
/// // default: false
/// "ignore": true
/// }
/// }
/// }
/// ```
#[clap(
long = "shard-to-ingesters",
env = "INFLUXDB_IOX_SHARD_TO_INGESTERS",
action
)]
pub shard_to_ingesters: Option<String>,
/// gRPC address for the router to talk with the ingesters. For
/// example:
///
@ -219,8 +38,14 @@ pub struct QuerierConfig {
/// "http://10.10.10.1:8083,http://10.10.10.2:8083"
///
/// for multiple addresses.
#[clap(long = "ingester-addresses", env = "INFLUXDB_IOX_INGESTER_ADDRESSES", num_args=1.., value_delimiter = ',')]
pub ingester_addresses: Vec<String>,
#[clap(
long = "ingester-addresses",
env = "INFLUXDB_IOX_INGESTER_ADDRESSES",
required = false,
num_args = 0..,
value_delimiter = ','
)]
pub ingester_addresses: Vec<IngesterAddress>,
/// Size of the RAM cache used to store catalog metadata information in bytes.
#[clap(
@ -256,11 +81,12 @@ pub struct QuerierConfig {
/// returning results that do not include unpersisted data and enter "circuit breaker mode"
/// to avoid continually retrying the failing connection on subsequent queries.
///
/// If circuits are open, the querier will NOT contact the ingester and no unpersisted data will be presented to the user.
/// If circuits are open, the querier will NOT contact the ingester and no unpersisted data
/// will be presented to the user.
///
/// Circuits will switch to "half open" after some jittered timeout and the querier will try to use the ingester in
/// question again. If this succeeds, we are back to normal, otherwise it will back off exponentially before trying
/// again (and again ...).
/// Circuits will switch to "half open" after some jittered timeout and the querier will try to
/// use the ingester in question again. If this succeeds, we are back to normal, otherwise it
/// will back off exponentially before trying again (and again ...).
///
/// In a production environment the `ingester_circuit_state` metric should be monitored.
#[clap(
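
The circuit-breaker behaviour described in the doc comment above boils down to a closed / open / half-open state machine. The sketch below is purely illustrative; it is not the querier's implementation and omits the jitter and the `ingester_circuit_state` metric mentioned above.

```rust
use std::time::{Duration, Instant};

enum CircuitState {
    /// Ingester is healthy; requests flow normally.
    Closed,
    /// Ingester is considered down; skip it until `until`.
    Open { until: Instant, backoff: Duration },
    /// Back-off elapsed; the next request is a probe.
    HalfOpen { backoff: Duration },
}

struct CircuitBreaker {
    state: CircuitState,
    base_backoff: Duration,
}

impl CircuitBreaker {
    fn new(base_backoff: Duration) -> Self {
        Self {
            state: CircuitState::Closed,
            base_backoff,
        }
    }

    /// Should the querier contact the ingester for this query?
    fn allow_request(&mut self) -> bool {
        match self.state {
            CircuitState::Closed | CircuitState::HalfOpen { .. } => true,
            CircuitState::Open { until, backoff } => {
                if Instant::now() >= until {
                    // Timeout elapsed: let one probe through ("half open").
                    self.state = CircuitState::HalfOpen { backoff };
                    true
                } else {
                    // Circuit open: serve persisted data only.
                    false
                }
            }
        }
    }

    fn record_success(&mut self) {
        self.state = CircuitState::Closed;
    }

    fn record_failure(&mut self) {
        // Exponential back-off on repeated failures; a real implementation
        // would also add jitter before re-probing.
        let backoff = match self.state {
            CircuitState::HalfOpen { backoff } => backoff * 2,
            _ => self.base_backoff,
        };
        self.state = CircuitState::Open {
            until: Instant::now() + backoff,
            backoff,
        };
    }
}

fn main() {
    let mut breaker = CircuitBreaker::new(Duration::from_millis(100));
    assert!(breaker.allow_request());
    breaker.record_failure();
    assert!(!breaker.allow_request()); // open: ingester skipped
    breaker.record_success();
    assert!(breaker.allow_request()); // closed again
}
```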
@ -279,46 +105,6 @@ impl QuerierConfig {
self.num_query_threads
}
/// Return the querier config's ingester addresses. If `--shard-to-ingesters-file` is used to
/// specify a JSON file containing shard to ingester address mappings, this returns `Err` if
/// there are any problems reading, deserializing, or interpreting the file.
// When we have switched to using the RPC write path only, this method can be changed to be
// infallible as clap will handle failure to parse the list of strings.
//
// Switching into the RPC write path mode requires *both* the `INFLUXDB_IOX_RPC_MODE`
// environment variable to be specified *and* `--ingester-addresses` to be set in order to
// switch. Setting `INFLUXDB_IOX_RPC_MODE` and shard-to-ingesters mapping, or not setting
// `INFLUXDB_IOX_RPC_MODE` and setting ingester addresses, will panic.
pub fn ingester_addresses(&self) -> Result<IngesterAddresses, Error> {
if let Some(file) = &self.shard_to_ingesters_file {
let contents =
fs::read_to_string(file).context(ShardToIngesterFileReadingSnafu { file })?;
let map = deserialize_shard_ingester_map(&contents)?;
if map.is_empty() {
Ok(IngesterAddresses::None)
} else {
Ok(IngesterAddresses::ByShardIndex(map))
}
} else if let Some(contents) = &self.shard_to_ingesters {
let map = deserialize_shard_ingester_map(contents)?;
if map.is_empty() {
Ok(IngesterAddresses::None)
} else {
Ok(IngesterAddresses::ByShardIndex(map))
}
} else if !self.ingester_addresses.is_empty() {
Ok(IngesterAddresses::List(
self.ingester_addresses
.iter()
.map(|addr| IngesterAddress::from_str(addr))
.collect::<Result<Vec<_>, _>>()?,
))
} else {
Ok(IngesterAddresses::None)
}
}
/// Size of the RAM cache pool for metadata in bytes.
pub fn ram_pool_metadata_bytes(&self) -> usize {
self.ram_pool_metadata_bytes
@ -335,131 +121,18 @@ impl QuerierConfig {
}
}
fn deserialize_shard_ingester_map(
contents: &str,
) -> Result<HashMap<ShardIndex, IngesterMapping>, Error> {
let ingesters_config: IngestersConfig =
serde_json::from_str(contents).context(ShardToIngesterDeserializingSnafu)?;
if ingesters_config.ignore_all
&& (!ingesters_config.ingesters.is_empty() || !ingesters_config.shards.is_empty())
{
return IgnoreAllRequiresEmptyConfigSnafu {
ingesters: ingesters_config.ingesters,
shards: ingesters_config.shards,
}
.fail();
}
let mut ingester_mapping_by_name = HashMap::new();
for (name, config) in &ingesters_config.ingesters {
match (config.ignore, config.addr.as_ref()) {
(true, _) => {
ingester_mapping_by_name.insert(name, IngesterMapping::Ignore);
}
(false, None) => {
return IngesterAddrRequiredSnafu {
name: Arc::clone(name),
}
.fail();
}
(false, Some(addr)) if addr.is_empty() => {
return IngesterAddrRequiredSnafu {
name: Arc::clone(name),
}
.fail();
}
(false, Some(addr)) => {
ingester_mapping_by_name.insert(name, IngesterMapping::Addr(Arc::clone(addr)));
}
}
}
let mut map = HashMap::new();
for (shard_index, shard_config) in ingesters_config.shards {
if shard_config.ignore {
map.insert(shard_index, IngesterMapping::Ignore);
continue;
}
match shard_config.ingester {
Some(ingester) => match ingester_mapping_by_name.get(&ingester) {
Some(ingester_mapping) => {
map.insert(shard_index, ingester_mapping.clone());
}
None => {
return IngesterNotFoundSnafu {
name: Arc::clone(&ingester),
shard_index,
}
.fail();
}
},
None => {
map.insert(shard_index, IngesterMapping::NotMapped);
}
}
}
Ok(map)
}
/// Ingester addresses.
#[derive(Debug, PartialEq, Eq)]
pub enum IngesterAddresses {
/// A mapping from shard index to ingesters.
ByShardIndex(HashMap<ShardIndex, IngesterMapping>),
/// A list of ingester2 addresses.
List(Vec<IngesterAddress>),
/// No connections, meaning only persisted data should be used.
None,
}
#[derive(Debug, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct IngestersConfig {
#[serde(default)]
ignore_all: bool,
#[serde(default)]
ingesters: HashMap<Arc<str>, Arc<IngesterConfig>>,
#[serde(default)]
shards: HashMap<ShardIndex, ShardConfig>,
}
/// Ingester config.
#[derive(Debug, Deserialize)]
pub struct IngesterConfig {
addr: Option<Arc<str>>,
#[serde(default)]
ignore: bool,
}
/// Shard config.
#[derive(Debug, Deserialize)]
pub struct ShardConfig {
ingester: Option<Arc<str>>,
#[serde(default)]
ignore: bool,
}
#[cfg(test)]
mod tests {
use super::*;
use clap::Parser;
use test_helpers::assert_error;
use test_helpers::assert_contains;
#[test]
fn test_default() {
let actual = QuerierConfig::try_parse_from(["my_binary"]).unwrap();
assert_eq!(actual.num_query_threads(), None);
assert!(matches!(
actual.ingester_addresses().unwrap(),
IngesterAddresses::None,
));
assert!(actual.ingester_addresses.is_empty());
}
#[test]
@ -471,26 +144,25 @@ mod tests {
actual.num_query_threads(),
Some(NonZeroUsize::new(42).unwrap())
);
assert!(matches!(
actual.ingester_addresses().unwrap(),
IngesterAddresses::None,
));
}
#[test]
fn test_ingester_addresses_list() {
let actual = QuerierConfig::try_parse_from([
let querier = QuerierConfig::try_parse_from([
"my_binary",
"--ingester-addresses",
"http://ingester-0:8082,http://ingester-1:8082",
])
.unwrap();
let expected = IngesterAddresses::List(vec![
IngesterAddress::from_str("http://ingester-0:8082").unwrap(),
IngesterAddress::from_str("http://ingester-1:8082").unwrap(),
]);
assert_eq!(actual.ingester_addresses().unwrap(), expected);
let actual: Vec<_> = querier
.ingester_addresses
.iter()
.map(ToString::to_string)
.collect();
let expected = vec!["http://ingester-0:8082/", "http://ingester-1:8082/"];
assert_eq!(actual, expected);
}
#[test]
@ -500,285 +172,15 @@ mod tests {
"--ingester-addresses",
"\\ingester-0:8082",
])
.unwrap()
.ingester_addresses();
assert_error!(actual, Error::IngesterAddress { .. });
}
.unwrap_err()
.to_string();
#[test]
fn supply_json_value() {
let actual = QuerierConfig::try_parse_from([
"my_binary",
"--shard-to-ingesters",
r#"{
"ignoreAll": false,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
},
"i2": {
"ignore": true
},
"i3": {
"ignore": true,
"addr": "http://ingester-2:2345"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {
"ingester": "i2"
},
"5": {
"ignore": true
}
}
}"#,
])
.unwrap();
let expected = IngesterAddresses::ByShardIndex(
[
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
]
.into_iter()
.collect(),
assert_contains!(
actual,
"error: \
invalid value '\\ingester-0:8082' \
for '--ingester-addresses [<INGESTER_ADDRESSES>...]': \
Invalid: invalid uri character"
);
assert_eq!(actual.ingester_addresses().unwrap(), expected);
}
#[test]
fn successful_deserialization() {
let contents = r#"{
"ignoreAll": false,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
},
"i2": {
"ignore": true
},
"i3": {
"ignore": true,
"addr": "http://ingester-2:2345"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {
"ingester": "i2"
},
"3": {
"ingester": "i1",
"ignore": true
},
"5": {
"ignore": true
}
}
}"#;
let map = deserialize_shard_ingester_map(contents).unwrap();
let expected = [
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::Ignore),
(ShardIndex::new(3), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
]
.into_iter()
.collect();
assert_eq!(map, expected);
}
#[test]
fn unsuccessful_deserialization() {
let map = deserialize_shard_ingester_map("");
assert_error!(map, Error::ShardToIngesterDeserializing { .. });
}
#[test]
fn ignore_all_requires_empty_maps() {
let expected = HashMap::new();
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true
}"#,
);
assert_eq!(map.unwrap(), expected);
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {},
"shards": {}
}"#,
);
assert_eq!(map.unwrap(), expected);
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
}
#[test]
fn ingester_addr_must_be_specified_if_not_ignored() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {}
}
}"#,
);
assert_error!(map, Error::IngesterAddrRequired { ref name } if name.as_ref() == "i1");
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {
"addr": ""
}
}
}"#,
);
assert_error!(map, Error::IngesterAddrRequired { ref name } if name.as_ref() == "i1");
}
#[test]
fn ingester_must_be_found() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(
map,
Error::IngesterNotFound { shard_index, ref name }
if shard_index.get() == 1 && name.as_ref() == "i1"
);
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {},
"shards": {
"1": {
"ingester": ""
}
}
}"#,
);
assert_error!(
map,
Error::IngesterNotFound { shard_index, ref name }
if shard_index.get() == 1 && name.as_ref() == ""
);
}
#[test]
fn shard_to_ingester_varieties() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {},
"3": {
"ingester": null
},
"4": {
"ignore": true
},
"5": {
"ignore": true,
"ingester": "i1"
},
"6": {
"ignore": true,
"ingester": null
}
}
}"#,
);
let expected = [
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::NotMapped),
(ShardIndex::new(3), IngesterMapping::NotMapped),
(ShardIndex::new(4), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
(ShardIndex::new(6), IngesterMapping::Ignore),
]
.into_iter()
.collect();
assert_eq!(map.unwrap(), expected);
}
}


@ -10,7 +10,7 @@ license.workspace = true
http = "0.2.9"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
thiserror = "1.0.40"
tonic = { version = "0.8", features = ["tls", "tls-webpki-roots"] }
tonic = { workspace = true }
tower = "0.4"
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -358,6 +358,16 @@ async fn execute_plan(
// Adjust concurrency based on the column count in the partition.
let permits = compute_permits(job_semaphore.total_permits(), partition_info.column_count());
info!(
partition_id = partition_info.partition_id.get(),
jobs_running = job_semaphore.holders_acquired(),
jobs_pending = job_semaphore.holders_pending(),
permits_needed = permits,
permits_acquired = job_semaphore.permits_acquired(),
permits_pending = job_semaphore.permits_pending(),
"requesting job semaphore",
);
// draw semaphore BEFORE creating the DataFusion plan and drop it directly AFTER finishing the
// DataFusion computation (but BEFORE doing any additional external IO).
//
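
The surrounding code (only partially visible in this hunk) sizes the number of permits by the partition's column count and acquires them from a shared job semaphore before building the DataFusion plan. A rough illustration of that pattern with `tokio::sync::Semaphore` is sketched below; the permit heuristic and function names are stand-ins, not IOx's actual `compute_permits` or semaphore wrapper.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Hypothetical heuristic: wider partitions get more permits, never more
/// than the semaphore holds and never fewer than one.
fn permits_for(total_permits: usize, column_count: usize) -> u32 {
    column_count.max(1).min(total_permits.max(1)) as u32
}

async fn execute_plan_with_permits(job_semaphore: Arc<Semaphore>, column_count: usize) {
    let permits = permits_for(job_semaphore.available_permits(), column_count);

    // Acquire BEFORE building the DataFusion plan so memory-hungry plans
    // cannot pile up while waiting for capacity.
    let _permit = job_semaphore
        .acquire_many(permits)
        .await
        .expect("semaphore not closed");

    // ... build and execute the plan here ...

    // `_permit` is dropped at the end of this scope, i.e. AFTER the
    // computation but BEFORE any further external IO in the caller.
}

#[tokio::main]
async fn main() {
    let semaphore = Arc::new(Semaphore::new(10));
    execute_plan_with_permits(Arc::clone(&semaphore), 4).await;
}
```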


@ -270,19 +270,6 @@ impl std::str::FromStr for ShardIndex {
}
}
/// Potential configurations of ingester connections for the querier to associate with a shard.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IngesterMapping {
/// Deliberately not mapping this shard to an ingester. If the querier gets a query for
/// this shard, it should return an error.
NotMapped,
/// Deliberately not contacting ingesters for this shard. If the querier gets a query for
/// this shard, it should only return persisted data.
Ignore,
/// The address of the ingester to contact for this shard.
Addr(Arc<str>),
}
/// Unique ID for a `Partition`
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type, sqlx::FromRow)]
#[sqlx(transparent)]
@ -2300,20 +2287,6 @@ impl TimestampMinMax {
}
}
/// Specifies the status of data in the ingestion process.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShardWriteStatus {
/// Nothing is known about this write (e.g. it refers to a shard for which we have no
/// information)
ShardUnknown,
/// The data has not yet been processed by the ingester, and thus is unreadable
Durable,
/// The data is readable, but not yet persisted
Readable,
/// The data is both readable and persisted to parquet
Persisted,
}
#[cfg(test)]
mod tests {
use std::borrow::Cow;


@ -12,7 +12,6 @@
pub mod config;
pub mod sender;
pub mod sort_exprs;
pub mod watch;
use std::sync::Arc;
@ -20,7 +19,7 @@ use std::task::{Context, Poll};
use datafusion::arrow::array::BooleanArray;
use datafusion::arrow::compute::filter_record_batch;
use datafusion::arrow::datatypes::DataType;
use datafusion::arrow::datatypes::{DataType, Fields};
use datafusion::common::{DataFusionError, ToDFSchema};
use datafusion::datasource::MemTable;
use datafusion::execution::context::TaskContext;
@ -354,12 +353,12 @@ pub fn nullable_schema(schema: SchemaRef) -> SchemaRef {
schema
} else {
// make a new schema with all nullable fields
let new_fields = schema
let new_fields: Fields = schema
.fields()
.iter()
.map(|f| {
// make a copy of the field, but allow it to be nullable
f.clone().with_nullable(true)
f.as_ref().clone().with_nullable(true)
})
.collect();
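
A hypothetical usage sketch of `nullable_schema` (assuming the function above is in scope; not part of the diff): every field of the returned schema is nullable, regardless of the input.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("time", DataType::Int64, false), // non-nullable on input
        Field::new("value", DataType::Float64, true),
    ]));

    let relaxed = nullable_schema(schema);
    assert!(relaxed.fields().iter().all(|f| f.is_nullable()));
}
```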


@ -1,52 +0,0 @@
use datafusion::{
arrow::compute::SortOptions,
physical_expr::{PhysicalSortExpr, PhysicalSortRequirement},
};
/// Structure to build [`PhysicalSortRequirement`]s for ExecutionPlans.
///
/// Replace with `PhysicalSortExpr::from_sort_exprs` when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn requirements_from_sort_exprs<'a>(
exprs: impl IntoIterator<Item = &'a PhysicalSortExpr>,
) -> Vec<PhysicalSortRequirement> {
exprs
.into_iter()
.cloned()
.map(PhysicalSortRequirement::from)
.collect()
}
/// Converts the `PhysicalSortRequirement` to `PhysicalSortExpr`.
/// If required ordering is `None` for an entry, the default
/// ordering `ASC, NULLS LAST` is used.
///
/// The default is picked to be consistent with
/// PostgreSQL: <https://www.postgresql.org/docs/current/queries-order.html>
///
/// Replace with `PhysicalSortExpr::from` when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn into_sort_expr(requirement: PhysicalSortRequirement) -> PhysicalSortExpr {
let PhysicalSortRequirement { expr, options } = requirement;
let options = options.unwrap_or(SortOptions {
descending: false,
nulls_first: false,
});
PhysicalSortExpr { expr, options }
}
/// This function converts `PhysicalSortRequirement` to `PhysicalSortExpr`
/// for each entry in the input. If required ordering is None for an entry
/// default ordering `ASC, NULLS LAST` if given.
///
/// replace with PhysicalSortExpr::to_sort_exprs when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn requirements_to_sort_exprs(
required: impl IntoIterator<Item = PhysicalSortRequirement>,
) -> Vec<PhysicalSortExpr> {
required.into_iter().map(into_sort_expr).collect()
}


@ -2,19 +2,19 @@
InfluxDB IOx supports running SQL queries via [Apache Arrow Flight SQL](https://arrow.apache.org/docs/format/FlightSql.html)
You can use either a native FlightSQL client or the JDBC / ODBC Flight SQL drivers.
## JDBC:
To use the JDBC driver with IOx:
1. Download the driver by following the link from [Maven](https://mvnrepository.com/artifact/org.apache.arrow/flight-sql/10.0.1) or [Dremio](https://www.dremio.com/drivers/jdbc/)
2. Use a JDBC connection of the format: `jdbc:arrow-flight-sql://hostname:port?useEncryption=false&iox-namespace-name=NAME`.
2. Use a JDBC connection of the format: `jdbc:arrow-flight-sql://hostname:port?useEncryption=false&database=NAME`
`hostname:port` is the host / port on which the IOx query gRPC API is running (default port is 8082), and `NAME` is the namespace name (for example, `26f7e5a4b7be365b_917b97a92e883afc`)
`hostname:port` is the host / port on which the IOx query gRPC API is running (default port is 8082), and `NAME` is the database name (for example, `26f7e5a4b7be365b_917b97a92e883afc`)
An example JDBC URL is:
```
jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&iox-namespace-name=26f7e5a4b7be365b_917b97a92e883afc
jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&database=26f7e5a4b7be365b_917b97a92e883afc
```


@ -20,5 +20,5 @@ snafu = "0.7"
once_cell = { version = "1", default-features = false }
prost = "0.11"
tokio = { version = "1.27", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
tonic = "0.8"
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -4,8 +4,9 @@ use std::fmt::Display;
use arrow_flight::sql::{
ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery,
};
use bytes::Bytes;
use prost::Message;
@ -75,9 +76,20 @@ pub enum FlightSQLCommand {
CommandGetSqlInfo(CommandGetSqlInfo),
/// Get a list of the available catalogs. See [`CommandGetCatalogs`] for details.
CommandGetCatalogs(CommandGetCatalogs),
/// Get a description of the foreign key columns in the given foreign key table
/// that reference the primary key or the columns representing a unique constraint
/// of the parent table (could be the same or a different table).
/// See [`CommandGetCrossReference`] for details.
CommandGetCrossReference(CommandGetCrossReference),
/// Get a list of the available schemas. See [`CommandGetDbSchemas`]
/// for details and how to interpret the parameters.
CommandGetDbSchemas(CommandGetDbSchemas),
/// Get a description of the foreign key columns that reference the given
/// table's primary key columns (the foreign keys exported by a table) of a table.
/// See [`CommandGetExportedKeys`] for details.
CommandGetExportedKeys(CommandGetExportedKeys),
/// Get the foreign keys of a table. See [`CommandGetImportedKeys`] for details.
CommandGetImportedKeys(CommandGetImportedKeys),
/// Get a list of primary keys. See [`CommandGetPrimaryKeys`] for details.
CommandGetPrimaryKeys(CommandGetPrimaryKeys),
/// Get a list of the available tables
@ -101,6 +113,37 @@ impl Display for FlightSQLCommand {
write!(f, "CommandGetSqlInfo(...)")
}
Self::CommandGetCatalogs(CommandGetCatalogs {}) => write!(f, "CommandGetCatalogs"),
Self::CommandGetCrossReference(CommandGetCrossReference {
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
}) => {
write!(
f,
"CommandGetCrossReference(
pk_catalog={},
pk_db_schema={},
pk_table={},
fk_catalog={},
fk_db_schema={},
fk_table={}",
pk_catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
pk_db_schema
.as_ref()
.map(|c| c.as_str())
.unwrap_or("<NONE>"),
pk_table,
fk_catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
fk_db_schema
.as_ref()
.map(|c| c.as_str())
.unwrap_or("<NONE>"),
fk_table,
)
}
Self::CommandGetDbSchemas(CommandGetDbSchemas {
catalog,
db_schema_filter_pattern,
@ -115,6 +158,32 @@ impl Display for FlightSQLCommand {
.unwrap_or("<NONE>")
)
}
Self::CommandGetExportedKeys(CommandGetExportedKeys {
catalog,
db_schema,
table,
}) => {
write!(
f,
"CommandGetExportedKeys(catalog={}, db_schema={}, table={})",
catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
db_schema.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
table
)
}
Self::CommandGetImportedKeys(CommandGetImportedKeys {
catalog,
db_schema,
table,
}) => {
write!(
f,
"CommandGetImportedKeys(catalog={}, db_schema={}, table={})",
catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
db_schema.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
table
)
}
Self::CommandGetPrimaryKeys(CommandGetPrimaryKeys {
catalog,
db_schema,
@ -186,8 +255,14 @@ impl FlightSQLCommand {
Ok(Self::CommandGetSqlInfo(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetCatalogs>(&msg)? {
Ok(Self::CommandGetCatalogs(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetCrossReference>(&msg)? {
Ok(Self::CommandGetCrossReference(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetDbSchemas>(&msg)? {
Ok(Self::CommandGetDbSchemas(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetExportedKeys>(&msg)? {
Ok(Self::CommandGetExportedKeys(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetImportedKeys>(&msg)? {
Ok(Self::CommandGetImportedKeys(decoded_cmd))
} else if let Some(decode_cmd) = Any::unpack::<CommandGetPrimaryKeys>(&msg)? {
Ok(Self::CommandGetPrimaryKeys(decode_cmd))
} else if let Some(decode_cmd) = Any::unpack::<CommandGetTables>(&msg)? {
@ -226,7 +301,10 @@ impl FlightSQLCommand {
}
FlightSQLCommand::CommandGetSqlInfo(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetCatalogs(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetCrossReference(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetDbSchemas(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetExportedKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetImportedKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetPrimaryKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetTables(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetTableTypes(cmd) => Any::pack(&cmd),


@ -11,8 +11,9 @@ use arrow::{
use arrow_flight::{
sql::{
ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandStatementQuery,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandStatementQuery,
},
IpcMessage, SchemaAsIpc,
};
@ -64,9 +65,18 @@ impl FlightSQLPlanner {
FlightSQLCommand::CommandGetCatalogs(CommandGetCatalogs {}) => {
encode_schema(get_catalogs_schema())
}
FlightSQLCommand::CommandGetCrossReference(CommandGetCrossReference { .. }) => {
encode_schema(&GET_CROSS_REFERENCE_SCHEMA)
}
FlightSQLCommand::CommandGetDbSchemas(CommandGetDbSchemas { .. }) => {
encode_schema(get_db_schemas_schema().as_ref())
}
FlightSQLCommand::CommandGetExportedKeys(CommandGetExportedKeys { .. }) => {
encode_schema(&GET_EXPORTED_KEYS_SCHEMA)
}
FlightSQLCommand::CommandGetImportedKeys(CommandGetImportedKeys { .. }) => {
encode_schema(&GET_IMPORTED_KEYS_SCHEMA)
}
FlightSQLCommand::CommandGetPrimaryKeys(CommandGetPrimaryKeys { .. }) => {
encode_schema(&GET_PRIMARY_KEYS_SCHEMA)
}
@ -115,6 +125,35 @@ impl FlightSQLPlanner {
let plan = plan_get_catalogs(ctx).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetCrossReference(CommandGetCrossReference {
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
}) => {
debug!(
?pk_catalog,
?pk_db_schema,
?pk_table,
?fk_catalog,
?fk_db_schema,
?fk_table,
"Planning CommandGetCrossReference query"
);
let plan = plan_get_cross_reference(
ctx,
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
)
.await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetDbSchemas(CommandGetDbSchemas {
catalog,
db_schema_filter_pattern,
@ -127,6 +166,34 @@ impl FlightSQLPlanner {
let plan = plan_get_db_schemas(ctx, catalog, db_schema_filter_pattern).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetExportedKeys(CommandGetExportedKeys {
catalog,
db_schema,
table,
}) => {
debug!(
?catalog,
?db_schema,
?table,
"Planning GetExportedKeys query"
);
let plan = plan_get_exported_keys(ctx, catalog, db_schema, table).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetImportedKeys(CommandGetImportedKeys {
catalog,
db_schema,
table,
}) => {
debug!(
?catalog,
?db_schema,
?table,
"Planning CommandGetImportedKeys query"
);
let plan = plan_get_imported_keys(ctx, catalog, db_schema, table).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetPrimaryKeys(CommandGetPrimaryKeys {
catalog,
db_schema,
@ -272,6 +339,19 @@ async fn plan_get_catalogs(ctx: &IOxSessionContext) -> Result<LogicalPlan> {
Ok(ctx.batch_to_logical_plan(get_catalogs(ctx.inner())?)?)
}
async fn plan_get_cross_reference(
ctx: &IOxSessionContext,
_pk_catalog: Option<String>,
_pk_db_schema: Option<String>,
_pk_table: String,
_fk_catalog: Option<String>,
_fk_db_schema: Option<String>,
_fk_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_CROSS_REFERENCE_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_db_schemas(
ctx: &IOxSessionContext,
catalog: Option<String>,
@ -281,6 +361,26 @@ async fn plan_get_db_schemas(
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_exported_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
_db_schema: Option<String>,
_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_EXPORTED_KEYS_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_imported_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
_db_schema: Option<String>,
_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_IMPORTED_KEYS_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_primary_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
@ -333,6 +433,68 @@ static TABLE_TYPES_RECORD_BATCH: Lazy<RecordBatch> = Lazy::new(|| {
RecordBatch::try_new(Arc::clone(&GET_TABLE_TYPE_SCHEMA), vec![table_type]).unwrap()
});
/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name,
/// pk_table_name, pk_key_name, then key_sequence.
/// update_rule and delete_rule return a byte that is equivalent to actions:
/// - 0 = CASCADE
/// - 1 = RESTRICT
/// - 2 = SET NULL
/// - 3 = NO ACTION
/// - 4 = SET DEFAULT
static GET_CROSS_REFERENCE_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
static GET_EXPORTED_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
static GET_IMPORTED_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
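
As a small illustration of the `update_rule` / `delete_rule` byte encoding documented above and shared by the three schemas just defined, a hypothetical decoding helper (not part of the diff) could look like this:

```rust
/// Map the `update_rule` / `delete_rule` byte from the cross-reference,
/// exported-keys, and imported-keys result sets to its action name.
fn referential_action(rule: u8) -> &'static str {
    match rule {
        0 => "CASCADE",
        1 => "RESTRICT",
        2 => "SET NULL",
        3 => "NO ACTION",
        4 => "SET DEFAULT",
        _ => "UNKNOWN",
    }
}

fn main() {
    assert_eq!(referential_action(0), "CASCADE");
    assert_eq!(referential_action(3), "NO ACTION");
}
```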
static GET_PRIMARY_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("catalog_name", DataType::Utf8, false),


@ -1,9 +1,11 @@
use std::sync::Arc;
use arrow::{
array::{
Array, ArrayBuilder, ArrayData, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder,
ListBuilder, StringBuilder, UnionArray,
},
datatypes::{DataType, Field, UnionMode},
datatypes::{DataType, Field, UnionFields, UnionMode},
};
use arrow_flight::sql::SqlInfo;
use once_cell::sync::Lazy;
@ -118,7 +120,7 @@ static UNION_TYPE: Lazy<DataType> = Lazy::new(|| {
// treat list as nullable b/c that is what the builders make
Field::new(
"string_list",
DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
true,
),
];
@ -127,7 +129,7 @@ static UNION_TYPE: Lazy<DataType> = Lazy::new(|| {
// assume they go from 0 .. num_fields
let type_ids: Vec<i8> = (0..fields.len()).map(|v| v as i8).collect();
DataType::Union(fields, type_ids, UnionMode::Dense)
DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense)
});
impl SqlInfoUnionBuilder {


@ -19,11 +19,11 @@ prost = "0.11"
query_functions = { path = "../query_functions" }
serde = { version = "1.0", features = ["derive"] }
snafu = "0.7"
tonic = "0.8"
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies] # In alphabetical order
tonic-build = "0.8"
tonic-build = { workspace = true }
prost-build = "0.11"
pbjson-build = "0.5"


@ -47,7 +47,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
let sharder_path = root.join("influxdata/iox/sharder/v1");
let wal_path = root.join("influxdata/iox/wal/v1");
let write_buffer_path = root.join("influxdata/iox/write_buffer/v1");
let write_summary_path = root.join("influxdata/iox/write_summary/v1");
let storage_path = root.join("influxdata/platform/storage");
let storage_errors_path = root.join("influxdata/platform/errors");
@ -59,7 +58,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
delete_path.join("service.proto"),
ingester_path.join("parquet_metadata.proto"),
ingester_path.join("query.proto"),
ingester_path.join("write_info.proto"),
ingester_path.join("write.proto"),
ingester_path.join("replication.proto"),
ingester_path.join("persist.proto"),
@ -76,7 +74,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
sharder_path.join("sharder.proto"),
wal_path.join("wal.proto"),
write_buffer_path.join("write_buffer.proto"),
write_summary_path.join("write_summary.proto"),
storage_path.join("predicate.proto"),
storage_path.join("service.proto"),
storage_path.join("source.proto"),


@ -71,20 +71,14 @@ message IngesterQueryResponseMetadata {
reserved 6;
// Partition id for this batch.
//
// This field is currently NOT used by the ingester but will be soon.
int64 partition_id = 7;
// Optional partition status.
//
// If this is given, then no schema and no batch will be part of this FlightData object.
//
// This field is currently NOT used by the ingester but will be soon.
PartitionStatus status = 8;
// UUID of this ingester instance.
//
// This field is currently NOT used by the ingester but will be soon.
string ingester_uuid = 9;
// Number of Parquet files that have been persisted to object storage for this partition.


@ -1,57 +0,0 @@
syntax = "proto3";
package influxdata.iox.ingester.v1;
option go_package = "github.com/influxdata/iox/ingester/v1";
// NOTE: This is an ALPHA / Internal API that is used as part of the
// end to end tests.
//
// A public API is tracked here:
// <https://github.com/influxdata/influxdb_iox/issues/4354>
service WriteInfoService {
// Get information about a particular write
rpc GetWriteInfo(GetWriteInfoRequest) returns (GetWriteInfoResponse);
}
message GetWriteInfoRequest {
// The write token returned from a write that was written to one or
// more shards
string write_token = 1;
}
message GetWriteInfoResponse {
// Renamed from kafka_partition_infos to shard_infos
reserved 3;
reserved "kafka_partition_infos";
// Information for all shards in this write
repeated ShardInfo shard_infos = 4;
}
// Status of a part of a write in a particular shard
message ShardInfo {
// Unique shard index
int32 shard_index = 1;
// the status of the data for this shard
ShardStatus status = 2;
}
// the state
enum ShardStatus {
// Unspecified status, will result in an error.
SHARD_STATUS_UNSPECIFIED = 0;
// The ingester has not yet processed data in this write
SHARD_STATUS_DURABLE = 1;
// The ingester has processed the data in this write and it is
// readable (will be included in a query response)?
SHARD_STATUS_READABLE = 2;
// The ingester has processed the data in this write and it is both
// readable and completely persisted to parquet files.
SHARD_STATUS_PERSISTED = 3;
// The ingester does not have information about this shard
SHARD_STATUS_UNKNOWN = 4;
}


@ -1,24 +0,0 @@
syntax = "proto3";
package influxdata.iox.write_summary.v1;
option go_package = "github.com/influxdata/iox/write_summary/v1";
// Represents a single logical write that was partitioned and sharded
// into multiple pieces in multiple shards (kafka partitions)
message WriteSummary {
// Renamed from sequencers to shards
reserved 1;
reserved "sequencers";
// per shard index (kafka partition) information
repeated ShardWrite shards = 2;
}
// Per shard (kafka partition) information about what sequence
// numbers contain part of a write
message ShardWrite {
// Unique shard index (kafka partition).
int32 shard_index = 1;
// Which sequence numbers for this shard had data
repeated int64 sequence_numbers = 2;
}


@ -196,19 +196,6 @@ pub mod influxdata {
));
}
}
pub mod write_summary {
pub mod v1 {
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.write_summary.v1.rs"
));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.write_summary.v1.serde.rs"
));
}
}
}
pub mod pbdata {
@ -281,8 +268,6 @@ pub mod compactor;
pub mod delete_predicate;
#[cfg(any(feature = "data_types_conversions", test))]
pub mod ingester;
#[cfg(any(feature = "data_types_conversions", test))]
pub mod write_info;
pub use prost::{DecodeError, EncodeError};


@ -1,155 +0,0 @@
use crate::influxdata::iox::ingester::v1 as proto;
use data_types::ShardWriteStatus;
use std::collections::HashMap;
impl From<ShardWriteStatus> for proto::ShardStatus {
fn from(status: ShardWriteStatus) -> Self {
match status {
ShardWriteStatus::ShardUnknown => Self::Unknown,
ShardWriteStatus::Durable => Self::Durable,
ShardWriteStatus::Readable => Self::Readable,
ShardWriteStatus::Persisted => Self::Persisted,
}
}
}
impl proto::ShardStatus {
/// Convert the status to a number such that higher numbers are later in the data lifecycle.
/// For use in merging multiple write status gRPC responses into one response.
fn status_order(&self) -> u8 {
match self {
Self::Unspecified => panic!("Unspecified status"),
Self::Unknown => 0,
Self::Durable => 1,
Self::Readable => 2,
Self::Persisted => 3,
}
}
}
impl proto::ShardInfo {
fn merge(&mut self, other: &Self) {
let self_status = self.status();
let other_status = other.status();
let new_status = match self_status.status_order().cmp(&other_status.status_order()) {
std::cmp::Ordering::Less => other_status,
std::cmp::Ordering::Equal => self_status,
std::cmp::Ordering::Greater => self_status,
};
self.set_status(new_status);
}
}
/// "Merges" the partition information for write info responses so that the "most recent"
/// information is returned.
pub fn merge_responses(
responses: impl IntoIterator<Item = proto::GetWriteInfoResponse>,
) -> proto::GetWriteInfoResponse {
// Map shard index to status
let mut shard_infos: HashMap<_, proto::ShardInfo> = HashMap::new();
responses
.into_iter()
.flat_map(|res| res.shard_infos.into_iter())
.for_each(|info| {
shard_infos
.entry(info.shard_index)
.and_modify(|existing_info| existing_info.merge(&info))
.or_insert(info);
});
let shard_infos = shard_infos.into_values().collect();
proto::GetWriteInfoResponse { shard_infos }
}
#[cfg(test)]
mod tests {
use super::*;
use proto::{ShardInfo, ShardStatus};
#[test]
fn test_merge() {
#[derive(Debug)]
struct Test<'a> {
left: &'a ShardInfo,
right: &'a ShardInfo,
expected: &'a ShardInfo,
}
let durable = ShardInfo {
shard_index: 1,
status: ShardStatus::Durable.into(),
};
let readable = ShardInfo {
shard_index: 1,
status: ShardStatus::Readable.into(),
};
let persisted = ShardInfo {
shard_index: 1,
status: ShardStatus::Persisted.into(),
};
let unknown = ShardInfo {
shard_index: 1,
status: ShardStatus::Unknown.into(),
};
let tests = vec![
Test {
left: &unknown,
right: &unknown,
expected: &unknown,
},
Test {
left: &unknown,
right: &durable,
expected: &durable,
},
Test {
left: &unknown,
right: &readable,
expected: &readable,
},
Test {
left: &durable,
right: &unknown,
expected: &durable,
},
Test {
left: &readable,
right: &readable,
expected: &readable,
},
Test {
left: &durable,
right: &durable,
expected: &durable,
},
Test {
left: &readable,
right: &durable,
expected: &readable,
},
Test {
left: &persisted,
right: &durable,
expected: &persisted,
},
];
for test in tests {
let mut output = test.left.clone();
output.merge(test.right);
assert_eq!(
&output, test.expected,
"Mismatch\n\nOutput:\n{output:#?}\n\nTest:\n{test:#?}"
);
}
}
}


@ -7,10 +7,10 @@ license.workspace = true
[dependencies]
prost = "0.11"
prost-types = { version = "0.11.7", features = ["std"] }
tonic = "0.8"
prost-types = { version = "0.11.9", features = ["std"] }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -7,10 +7,10 @@ license.workspace = true
[dependencies]
prost = "0.11"
prost-types = { version = "0.11.7", features = ["std"] }
tonic = "0.8"
prost-types = { version = "0.11.9", features = ["std"] }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -16,7 +16,7 @@ hyper = "0.14"
pin-project = "1.0"
prost = "0.11"
tokio = {version = "1", features = [ "rt" ]}
tonic = "0.8"
tonic = { workspace = true }
tower = "0.4"
grpc-binary-logger-proto = { path = "../grpc-binary-logger-proto" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
@ -28,4 +28,4 @@ assert_matches = "1"
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -15,10 +15,10 @@ iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.6", features = ["aws"] }
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
thiserror = "1.0.40"
tokio = { version = "1.27" }
tonic = { version = "0.8" }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]


@ -10,7 +10,7 @@ bytes = "1.4"
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
url = "2.3.1"
uuid = { version = "1", features = ["v4"] }


@ -22,7 +22,6 @@ influxrpc_parser = { path = "../influxrpc_parser"}
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common"}
ioxd_compactor2 = { path = "../ioxd_compactor2"}
ioxd_ingest_replica = { path = "../ioxd_ingest_replica" }
ioxd_ingester2 = { path = "../ioxd_ingester2"}
ioxd_garbage_collector = { path = "../ioxd_garbage_collector" }
ioxd_querier = { path = "../ioxd_querier"}
@ -64,7 +63,7 @@ libc = { version = "0.2" }
num_cpus = "1.15.0"
once_cell = { version = "1.17", features = ["parking_lot"] }
rustyline = { version = "11.0", default-features = false, features = ["with-file-history"]}
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tempfile = "3.5.0"
thiserror = "1.0.40"
@ -72,7 +71,7 @@ tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tokio = { version = "1.27", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time", "io-std"] }
tokio-stream = { version = "0.1", features = ["net"] }
tokio-util = { version = "0.7.7", features = ["compat"] }
tonic = "0.8"
tonic = { workspace = true }
uuid = { version = "1", features = ["v4"] }
# jemalloc-sys with unprefixed_malloc_on_supported_platforms feature and heappy are mutually exclusive
tikv-jemalloc-sys = { version = "0.5.3", optional = true, features = ["unprefixed_malloc_on_supported_platforms"] }
@ -81,11 +80,11 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
# In alphabetical order
arrow_util = { path = "../arrow_util" }
assert_cmd = "2.0.10"
assert_cmd = "2.0.11"
assert_matches = "1.5"
async-trait = "0.1"
predicate = { path = "../predicate" }
predicates = "3.0.2"
predicates = "3.0.3"
serde = "1.0.159"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
test_helpers_end_to_end = { path = "../test_helpers_end_to_end" }


@ -1,7 +1,7 @@
use arrow::record_batch::RecordBatch;
use clap::ValueEnum;
use futures::TryStreamExt;
use influxdb_iox_client::format::influxql::write_columnar;
use influxdb_iox_client::format::influxql::{write_columnar, Options};
use influxdb_iox_client::{connection::Connection, flight, format::QueryOutputFormat};
use thiserror::Error;
@ -105,7 +105,7 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
match (query_lang, &format) {
(QueryLanguage::InfluxQL, OutputFormat::Pretty) => {
write_columnar(std::io::stdout(), &batches)?
write_columnar(std::io::stdout(), &batches, Options::default())?
}
_ => {
let format: QueryOutputFormat = format.into();


@ -11,7 +11,7 @@ use clap_blocks::{
ingester2::Ingester2Config,
ingester_address::IngesterAddress,
object_store::{make_object_store, ObjectStoreConfig},
querier::{IngesterAddresses, QuerierConfig},
querier::QuerierConfig,
router2::Router2Config,
run_config::RunConfig,
socket_addr::SocketAddr,
@ -425,6 +425,9 @@ impl Config {
CatalogDsnConfig::new_sqlite(local_catalog_path)
};
let ingester_addresses =
vec![IngesterAddress::from_str(&ingester_grpc_bind_address.to_string()).unwrap()];
let router_run_config = RunConfig::new(
logging_config,
tracing_config,
@ -458,10 +461,7 @@ impl Config {
let router_config = Router2Config {
query_pool_name: QUERY_POOL_NAME.to_string(),
http_request_limit: 1_000,
ingester_addresses: vec![IngesterAddress::from_str(
&ingester_grpc_bind_address.to_string(),
)
.unwrap()],
ingester_addresses: ingester_addresses.clone(),
new_namespace_retention_hours: None, // infinite retention
namespace_autocreation_enabled: true,
partition_key_pattern: "%Y-%m-%d".to_string(),
@ -498,10 +498,8 @@ impl Config {
};
let querier_config = QuerierConfig {
num_query_threads: None, // will be ignored
shard_to_ingesters_file: None, // will be ignored
shard_to_ingesters: None, // will be ignored
ingester_addresses: vec![ingester_grpc_bind_address.to_string()], // will be ignored
num_query_threads: None, // will be ignored
ingester_addresses,
ram_pool_metadata_bytes: querier_ram_pool_metadata_bytes,
ram_pool_data_bytes: querier_ram_pool_data_bytes,
max_concurrent_queries: querier_max_concurrent_queries,
@ -660,12 +658,7 @@ pub async fn command(config: Config) -> Result<()> {
)
.await;
let ingester_addresses = IngesterAddresses::List(vec![IngesterAddress::from_str(
&ingester_run_config.grpc_bind_address.to_string(),
)
.unwrap()]);
info!(?ingester_addresses, "starting querier");
info!(ingester_addresses = ?querier_config.ingester_addresses, "starting querier");
let querier = create_querier_server_type(QuerierServerTypeArgs {
common_state: &common_state,
metric_registry: Arc::clone(&metrics),
@ -673,9 +666,7 @@ pub async fn command(config: Config) -> Result<()> {
object_store,
exec,
time_provider,
ingester_addresses,
querier_config,
rpc_write: true,
authz: authz.as_ref().map(Arc::clone),
})
.await?;

View File

@ -1,106 +0,0 @@
//! Command line options for running an ingester that a router talks to using the RPC write path.
use super::main;
use crate::process_info::{setup_metric_registry, USIZE_MAX};
use clap_blocks::{
catalog_dsn::CatalogDsnConfig, ingest_replica::IngestReplicaConfig, run_config::RunConfig,
};
use iox_query::exec::Executor;
use ioxd_common::{
server_type::{CommonServerState, CommonServerStateError},
Service,
};
use ioxd_ingest_replica::create_ingest_replica_server_type;
use observability_deps::tracing::*;
use std::{num::NonZeroUsize, sync::Arc};
use thiserror::Error;
#[derive(Debug, Error)]
pub enum Error {
#[error("run: {0}")]
Run(#[from] main::Error),
#[error("invalid config: {0}")]
InvalidConfig(#[from] CommonServerStateError),
#[error("error initializing ingest_replica: {0}")]
IngestReplica(#[from] ioxd_ingest_replica::Error),
#[error("catalog DSN error: {0}")]
CatalogDsn(#[from] clap_blocks::catalog_dsn::Error),
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, clap::Parser)]
#[clap(
name = "run",
about = "Runs in ingest replica mode",
long_about = "Run the IOx ingest_replica server.\n\nThe configuration options below can be \
set either with the command line flags or with the specified environment \
variable. If there is a file named '.env' in the current working directory, \
it is sourced before loading the configuration.
Configuration is loaded from the following sources (highest precedence first):
- command line arguments
- user set environment variables
- .env file contents
- pre-configured default values"
)]
pub struct Config {
#[clap(flatten)]
pub(crate) run_config: RunConfig,
#[clap(flatten)]
pub(crate) catalog_dsn: CatalogDsnConfig,
#[clap(flatten)]
pub(crate) ingest_replica_config: IngestReplicaConfig,
/// Specify the size of the thread-pool for query execution, and the
/// separate compaction thread-pool.
#[clap(
long = "exec-thread-count",
env = "INFLUXDB_IOX_EXEC_THREAD_COUNT",
default_value = "4",
action
)]
pub exec_thread_count: NonZeroUsize,
/// Size of memory pool used during query exec, in bytes.
#[clap(
long = "exec-mem-pool-bytes",
env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES",
default_value = &USIZE_MAX[..],
action
)]
exec_mem_pool_bytes: usize,
}
pub async fn command(config: Config) -> Result<()> {
let common_state = CommonServerState::from_config(config.run_config.clone())?;
let metric_registry = setup_metric_registry();
let catalog = config
.catalog_dsn
.get_catalog("ingester", Arc::clone(&metric_registry))
.await?;
let exec = Arc::new(Executor::new(
config.exec_thread_count,
config.exec_mem_pool_bytes,
));
let server_type = create_ingest_replica_server_type(
&common_state,
catalog,
Arc::clone(&metric_registry),
&config.ingest_replica_config,
exec,
)
.await?;
info!("starting ingester2");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services, metric_registry).await?)
}

View File

@ -4,7 +4,6 @@ use trogging::cli::LoggingConfig;
pub(crate) mod all_in_one;
mod compactor2;
mod garbage_collector;
mod ingest_replica;
mod ingester2;
mod main;
mod querier;
@ -29,9 +28,6 @@ pub enum Error {
#[snafu(display("Error in ingester2 subcommand: {}", source))]
Ingester2Error { source: ingester2::Error },
#[snafu(display("Error in ingest_replica subcommand: {}", source))]
IngestReplicaError { source: ingest_replica::Error },
#[snafu(display("Error in all in one subcommand: {}", source))]
AllInOneError { source: all_in_one::Error },
@ -60,7 +56,6 @@ impl Config {
Some(Command::Querier(config)) => config.run_config.logging_config(),
Some(Command::Router2(config)) => config.run_config.logging_config(),
Some(Command::Ingester2(config)) => config.run_config.logging_config(),
Some(Command::IngestReplica(config)) => config.run_config.logging_config(),
Some(Command::AllInOne(config)) => &config.logging_config,
Some(Command::Test(config)) => config.run_config.logging_config(),
}
@ -81,9 +76,6 @@ enum Command {
/// Run the server in ingester2 mode
Ingester2(ingester2::Config),
/// Run the server in ingest_replica mode
IngestReplica(ingest_replica::Config),
/// Run the server in "all in one" mode (Default)
AllInOne(all_in_one::Config),
@ -110,9 +102,6 @@ pub async fn command(config: Config) -> Result<()> {
Some(Command::Ingester2(config)) => {
ingester2::command(config).await.context(Ingester2Snafu)
}
Some(Command::IngestReplica(config)) => ingest_replica::command(config)
.await
.context(IngestReplicaSnafu),
Some(Command::AllInOne(config)) => all_in_one::command(config).await.context(AllInOneSnafu),
Some(Command::Test(config)) => test::command(config).await.context(TestSnafu),
}

View File

@ -29,9 +29,6 @@ pub enum Error {
#[error("Invalid config: {0}")]
InvalidConfigCommon(#[from] CommonServerStateError),
#[error("Invalid config: {0}")]
InvalidConfigIngester(#[from] clap_blocks::querier::Error),
#[error("Catalog error: {0}")]
Catalog(#[from] iox_catalog::interface::Error),
@ -120,7 +117,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
info!("using the write buffer path");
}
let ingester_addresses = config.querier_config.ingester_addresses()?;
let ingester_addresses = &config.querier_config.ingester_addresses;
info!(?ingester_addresses, "using ingester addresses");
let exec = Arc::new(Executor::new(
@ -135,9 +132,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
object_store,
exec,
time_provider,
ingester_addresses,
querier_config: config.querier_config,
rpc_write,
authz: authz.as_ref().map(Arc::clone),
})
.await?;

View File

@ -2,7 +2,7 @@ use std::{collections::HashMap, path::PathBuf, sync::Arc};
use arrow::{
array::as_generic_binary_array,
datatypes::{DataType, Schema, SchemaRef, TimeUnit},
datatypes::{DataType, Fields, Schema, SchemaRef, TimeUnit},
record_batch::RecordBatch,
};
use arrow_flight::{
@ -339,6 +339,64 @@ async fn flightsql_get_catalogs_matches_information_schema() {
.await
}
#[tokio::test]
async fn flightsql_get_cross_reference() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let primary_table_name = "primary_table";
let foreign_table_name = "foreign_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{primary_table_name},tag1=A,tag2=B val=42i 123456\n\
{primary_table_name},tag1=A,tag2=C val=43i 123457\n
{foreign_table_name},tag1=B,tag2=D val=42i 123456\n\
{foreign_table_name},tag1=C,tag2=F val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let pk_catalog: Option<String> = None;
let pk_db_schema: Option<String> = None;
let fk_catalog: Option<String> = None;
let fk_db_schema: Option<String> = None;
let stream = client
.get_cross_reference(
pk_catalog,
pk_db_schema,
primary_table_name.to_string(),
fk_catalog,
fk_db_schema,
foreign_table_name.to_string(),
)
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_tables() {
test_helpers::maybe_start_logging();
@ -938,6 +996,98 @@ async fn flightsql_get_db_schema_matches_information_schema() {
.await
}
#[tokio::test]
async fn flightsql_get_exported_keys() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let catalog: Option<String> = None;
let db_schema: Option<String> = None;
let stream = client
.get_exported_keys(catalog, db_schema, table_name.to_string())
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_imported_keys() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let catalog: Option<String> = None;
let db_schema: Option<String> = None;
let stream = client
.get_imported_keys(catalog, db_schema, table_name.to_string())
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_primary_keys() {
test_helpers::maybe_start_logging();
@ -1254,10 +1404,10 @@ async fn assert_schema(client: &mut FlightClient, cmd: Any) {
}
fn strip_metadata(schema: &Schema) -> SchemaRef {
let stripped_fields: Vec<_> = schema
let stripped_fields: Fields = schema
.fields()
.iter()
.map(|f| f.clone().with_metadata(HashMap::new()))
.map(|f| f.as_ref().clone().with_metadata(HashMap::new()))
.collect();
Arc::new(Schema::new(stripped_fields))
@ -1357,8 +1507,149 @@ async fn authz() {
authz.close().await;
}
/// Ensure that the FlightSQL API supports the following gRPC header names,
/// in addition to the existing `iox-namespace-name`:
/// 1. database
/// 2. bucket
/// 3. bucket-name
#[tokio::test]
async fn flightsql_client_header_same_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client_helper(state.cluster(), "iox-namespace-name");
for header_name in &["database", "bucket", "bucket-name"] {
// different header names with the same database name
client
.add_header(header_name, state.cluster().namespace())
.unwrap();
}
let stream = client.get_table_types().await.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- +------------+
- "| table_type |"
- +------------+
- "| BASE TABLE |"
- "| VIEW |"
- +------------+
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_client_header_different_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client_helper(state.cluster(), "database");
client
.add_header("bucket", "different_database_name")
.unwrap();
let err = client.get_table_types().await.unwrap_err();
assert_matches!(err, FlightError::Tonic(status) => {
assert_eq!(status.code(), tonic::Code::InvalidArgument);
assert_contains!(status.message(), "More than one headers are found in request");
}
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_client_header_no_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let connection = state.cluster().querier().querier_grpc_connection();
let (channel, _headers) = connection.into_grpc_connection().into_parts();
let mut client = FlightSqlClient::new(channel);
let err = client.get_table_types().await.unwrap_err();
assert_matches!(err, FlightError::Tonic(status) => {
assert_eq!(status.code(), tonic::Code::InvalidArgument);
assert_contains!(status.message(), "no 'database' header in request");
}
);
}
.boxed()
})),
],
)
.run()
.await
}
/// Return a [`FlightSqlClient`] configured for use
fn flightsql_client(cluster: &MiniCluster) -> FlightSqlClient {
flightsql_client_helper(cluster, "database")
}
/// Helper for `flightsql_client` that returns a [`FlightSqlClient`] configured to send the namespace under the given header name
fn flightsql_client_helper(cluster: &MiniCluster, header_name: &str) -> FlightSqlClient {
let connection = cluster.querier().querier_grpc_connection();
let (channel, _headers) = connection.into_grpc_connection().into_parts();
@ -1366,7 +1657,7 @@ fn flightsql_client(cluster: &MiniCluster) -> FlightSqlClient {
// Add namespace to client headers until it is fully supported by FlightSQL
let namespace = cluster.namespace();
client.add_header("iox-namespace-name", namespace).unwrap();
client.add_header(header_name, namespace).unwrap();
client
}

View File

@ -28,6 +28,13 @@ async fn influxql_returns_error() {
"Error while planning query: This feature is not implemented: SHOW TAG KEYS"
.into(),
},
Step::InfluxQLExpectingError {
query: "SHOW TAG KEYYYYYES".into(),
expected_error_code: tonic::Code::InvalidArgument,
expected_message:
"Error while planning query: Error during planning: invalid SHOW TAG statement, expected KEYS or VALUES at pos 9"
.into(),
},
],
)
.run()

View File

@ -40,58 +40,6 @@ fn ingester2_errors_without_mode_env_var() {
));
}
#[test]
fn querier_errors_with_mode_env_var_and_shard_to_ingester_mapping() {
let shard_to_ingesters_json = r#"{
"ingesters": {
"i1": {
"addr": "arbitrary"
}
},
"shards": {
"0": {
"ingester": "i1"
}
}
}"#;
Command::cargo_bin("influxdb_iox")
.unwrap()
.env_clear()
.env("INFLUXDB_IOX_RPC_MODE", "2")
.arg("run")
.arg("querier")
.arg("--shard-to-ingesters")
.arg(shard_to_ingesters_json)
.arg("--catalog")
.arg("memory")
.timeout(Duration::from_secs(2))
.assert()
.failure()
.stderr(predicate::str::contains(
"`INFLUXDB_IOX_RPC_MODE` is set but shard to ingester mappings were provided",
));
}
#[test]
fn querier_errors_without_mode_env_var_and_ingester_addresses() {
Command::cargo_bin("influxdb_iox")
.unwrap()
.env_clear()
.arg("run")
.arg("querier")
.arg("--ingester-addresses")
.arg("http://arbitrary:8082")
.arg("--catalog")
.arg("memory")
.timeout(Duration::from_secs(2))
.assert()
.failure()
.stderr(predicate::str::contains(
"`INFLUXDB_IOX_RPC_MODE` is unset but ingester addresses were provided",
));
}
#[test]
fn querier_without_ingesters_without_mode_env_var_uses_write_buffer() {
Command::cargo_bin("influxdb_iox")

View File

@ -127,11 +127,27 @@ public class Main {
System.out.println("**************");
print_result_set(md.getCatalogs());
System.out.println("**************");
System.out.println("CrossReference");
System.out.println("**************");
print_result_set(md.getCrossReference(null, null, "system", null, null, "iox"));
System.out.println("**************");
System.out.println("Schemas:");
System.out.println("**************");
print_result_set(md.getSchemas());
System.out.println("**************");
System.out.println("ExportedKeys");
System.out.println("**************");
print_result_set(md.getExportedKeys(null, null, "system"));
System.out.println("**************");
System.out.println("ImportedKeys");
System.out.println("**************");
print_result_set(md.getImportedKeys(null, null, "system"));
System.out.println("**************");
System.out.println("PrimaryKeys:");
System.out.println("**************");

View File

@ -10,14 +10,15 @@ influxdb_iox -v
## Run the JDBC test
To run the JDBC test program, specify the target namespace in the JDBC URL:
To run the JDBC test program, specify the target database in the JDBC URL:
```shell
# run the jdbc client driver program, downloading the JDBC driver if needed
./jdbc_client "jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&iox-namespace-name=26f7e5a4b7be365b_917b97a92e883afc" query 'select * from cpu'
./jdbc_client "jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&database=26f7e5a4b7be365b_917b97a92e883afc" query 'select * from cpu'
```
# Cleanup:
Clean up any intermediate files (like the JDBC driver)
```shell

View File

@ -1,11 +1,15 @@
-- Gap-filling tests
-- IOX_SETUP: OneMeasurementTwoSeries
-- Input data
-- region=a 2000-05-05T12:20:00Z
-- region=a 2000-05-05T12:40:00Z
-- region=b 2000-05-05T12:31:00Z
-- region=b 2000-05-05T12:39:00Z
-- Input data (by region, time)
SELECT *
FROM cpu
ORDER BY REGION, TIME;
-- Input data (by time)
SELECT *
FROM cpu
ORDER BY TIME;
-- IOX_COMPARE: uuid
EXPLAIN SELECT
@ -75,3 +79,13 @@ from cpu
where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z'
group by minute;
-- cpu.idle has a null value at 12:31. Interpolation should still occur,
-- overwriting the null value.
SELECT
date_bin_gapfill(interval '4 minutes', time, timestamp '1970-01-01T00:00:00Z') as four_minute,
interpolate(min(cpu.idle)),
interpolate(min(cpu."user"))
from cpu
where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z'
group by four_minute;

View File

@ -1,21 +1,39 @@
-- Test Setup: OneMeasurementTwoSeries
-- SQL: SELECT * FROM cpu ORDER BY REGION, TIME;
+------+--------+----------------------+------+
| idle | region | time | user |
+------+--------+----------------------+------+
| 70.0 | a | 2000-05-05T12:20:00Z | 23.2 |
| | a | 2000-05-05T12:40:00Z | 21.0 |
| | b | 2000-05-05T12:31:00Z | 25.2 |
| 60.0 | b | 2000-05-05T12:39:00Z | 28.9 |
+------+--------+----------------------+------+
-- SQL: SELECT * FROM cpu ORDER BY TIME;
+------+--------+----------------------+------+
| idle | region | time | user |
+------+--------+----------------------+------+
| 70.0 | a | 2000-05-05T12:20:00Z | 23.2 |
| | b | 2000-05-05T12:31:00Z | 25.2 |
| 60.0 | b | 2000-05-05T12:39:00Z | 28.9 |
| | a | 2000-05-05T12:40:00Z | 21.0 |
+------+--------+----------------------+------+
-- SQL: EXPLAIN SELECT date_bin_gapfill(interval '10 minute', time, timestamp '1970-01-01T00:00:00Z') as minute, count(cpu.user) from cpu where time between timestamp '2000-05-05T12:00:00Z' and timestamp '2000-05-05T12:59:00Z' group by minute;
-- Results After Normalizing UUIDs
----------
| plan_type | plan |
----------
| logical_plan | Projection: date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, COUNT(cpu.user) |
| | GapFill: groupBy=[[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalDayTime("600000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[datebin(IntervalDayTime("600000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]] |
| logical_plan | Projection: date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, COUNT(cpu.user) |
| | GapFill: groupBy=[[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[datebin(IntervalMonthDayNano("600000000000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]] |
| | TableScan: cpu projection=[time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
| physical_plan | ProjectionExec: expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as minute, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
| | GapFillExec: group_expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0], aggr_expr=[COUNT(cpu.user)@1], stride=600000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | SortExec: expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| physical_plan | ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as minute, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
| | GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0], aggr_expr=[COUNT(cpu.user)@1], stride=600000000000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin_gapfill(IntervalDayTime(\"600000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 0 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[datebin(600000, time@0, 0) as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 0 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[datebin(600000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | FilterExec: time@0 >= 957528000000000000 AND time@0 <= 957531540000000000 |
@ -85,18 +103,18 @@
----------
| plan_type | plan |
----------
| logical_plan | Projection: cpu.region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, AVG(cpu.user) |
| | GapFill: groupBy=[[cpu.region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalDayTime("600000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[cpu.region, datebin(IntervalDayTime("600000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[AVG(cpu.user)]] |
| logical_plan | Projection: cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, AVG(cpu.user) |
| | GapFill: groupBy=[[cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[cpu.region, datebin(IntervalMonthDayNano("600000000000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[AVG(cpu.user)]] |
| | TableScan: cpu projection=[region, time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
| physical_plan | ProjectionExec: expr=[region@0 as region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as minute, AVG(cpu.user)@2 as AVG(cpu.user)] |
| | GapFillExec: group_expr=[region@0, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1], aggr_expr=[LOCF(AVG(cpu.user)@2)], stride=600000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [region@0 ASC,date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | SortExec: expr=[region@0 ASC,date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[region@0 as region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| physical_plan | ProjectionExec: expr=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as minute, AVG(cpu.user)@2 as AVG(cpu.user)] |
| | GapFillExec: group_expr=[region@0, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1], aggr_expr=[LOCF(AVG(cpu.user)@2)], stride=600000000000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [region@0 ASC,date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | SortExec: expr=[region@0 ASC,date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=Hash([Column { name: "region", index: 0 }, Column { name: "date_bin_gapfill(IntervalDayTime(\"600000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 1 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[region@0 as region, datebin(600000, time@1, 0) as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | RepartitionExec: partitioning=Hash([Column { name: "region", index: 0 }, Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 1 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[region@0 as region, datebin(600000000000, time@1, 0) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | FilterExec: time@1 >= 957528000000000000 AND time@1 <= 957531540000000000 |
@ -152,4 +170,16 @@
| 2000-05-05T12:38:00Z | 70.0 |
| 2000-05-05T12:39:00Z | 60.0 |
| 2000-05-05T12:40:00Z | 60.0 |
+----------------------+---------------+
+----------------------+---------------+
-- SQL: SELECT date_bin_gapfill(interval '4 minutes', time, timestamp '1970-01-01T00:00:00Z') as four_minute, interpolate(min(cpu.idle)), interpolate(min(cpu."user")) from cpu where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z' group by four_minute;
+----------------------+---------------+---------------+
| four_minute | MIN(cpu.idle) | MIN(cpu.user) |
+----------------------+---------------+---------------+
| 2000-05-05T12:16:00Z | | |
| 2000-05-05T12:20:00Z | 70.0 | 23.2 |
| 2000-05-05T12:24:00Z | 67.5 | 24.2 |
| 2000-05-05T12:28:00Z | 65.0 | 25.2 |
| 2000-05-05T12:32:00Z | 62.5 | 27.05 |
| 2000-05-05T12:36:00Z | 60.0 | 28.9 |
| 2000-05-05T12:40:00Z | | 21.0 |
+----------------------+---------------+---------------+

View File

@ -1,104 +1,204 @@
-- Test Setup: InfluxQLSelectSupport
-- InfluxQL: SHOW FIELD KEYS;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
| m0 | f64 | float |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | f64 | float |
| m1 | i64 | integer |
| m1 | str | string |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS LIMIT 2;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
| m0 | f64 | float |
| m0 | i64 | integer |
| m1 | f64 | float |
| m1 | i64 | integer |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS OFFSET 1;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_system | float |
| disk | bytes_used | integer |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | i64 | integer |
| m1 | str | string |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| i64 | integer |
| str | string |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS LIMIT 1 OFFSET 2;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
| m0 | str | string |
| m1 | str | string |
+------------------+----------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| str | string |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM cpu;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM disk,cpu,disk;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM cpu,disk,cpu;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM /m.*/;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
| m0 | f64 | float |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | f64 | float |
| m1 | i64 | integer |
| m1 | str | string |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+----------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM /d\isk/;
+------------------+------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+------------+-----------+
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM does_not_exist;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
+------------------+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS ON my_db;
Error while planning query: This feature is not implemented: SHOW FIELD KEYS ON <database>
-- InfluxQL: SHOW FIELD KEYS FROM x.my_db;

View File

@ -7,49 +7,39 @@
-- Validates expected data is returned
-- Projection wildcard, all tags and fields
-- IOX_COMPARE: sorted
SELECT * FROM m0;
-- No matching measurement
SELECT * FROM non_existent;
-- Projection wildcard, only tags
-- IOX_COMPARE: sorted
SELECT *::tag, f64 FROM m0;
-- Projection wildcard, only fields
-- IOX_COMPARE: sorted
SELECT *::field FROM m0;
-- Projection regex, mixture of tags and fields
-- IOX_COMPARE: sorted
SELECT /64|tag0/ FROM m0;
-- Projection specific tags and fields
-- IOX_COMPARE: sorted
SELECT f64, tag0 FROM m0;
-- Explicitly select time column
-- IOX_COMPARE: sorted
SELECT f64, tag0, time FROM m0;
-- arithmetic operators
-- IOX_COMPARE: sorted
SELECT f64, f64 * 2, i64, i64 + i64 FROM m0;
-- bitwise operators
-- IOX_COMPARE: sorted
SELECT i64, i64 & 1 FROM m0;
-- Automatic type coercion integer → float
-- IOX_COMPARE: sorted
SELECT f64 + i64 FROM m0;
-- Type cast postfix operator
SELECT f64, f64::integer FROM m0;
-- Column alias behaviour
-- IOX_COMPARE: sorted
SELECT f64 AS f64_2, f64, f64, f64 FROM m0 LIMIT 1;
--
@ -57,55 +47,45 @@ SELECT f64 AS f64_2, f64, f64, f64 FROM m0 LIMIT 1;
--
-- Single tag
-- IOX_COMPARE: sorted
SELECT tag0, f64 FROM m0 WHERE tag0 = 'val00';
-- IOX_COMPARE: sorted
SELECT tag0, f64 FROM m0 WHERE tag0 =~ /^val0(1|2)/;
-- Conjunction (AND)
-- IOX_COMPARE: sorted
SELECT /tag(0|1)/, f64 FROM m0 WHERE tag0 = 'val00' AND tag1 = 'val10';
-- Disjunction (OR)
-- IOX_COMPARE: sorted
SELECT /tag(0|1)/, f64 FROM m0 WHERE tag0 = 'val00' OR tag1 = 'val10';
-- arithmetic
-- IOX_COMPARE: sorted
SELECT f64 FROM m0 WHERE f64 > 10 + 10;
-- bitwise
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE i64 & 1 = 0;
-- time bounds
-- timestamp format %Y-%M-%D
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > '2022-10-31';
-- timestamp format %Y-%M-%D %h:%m:%s
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > '2022-10-31 02:00:10';
-- now() and duration
-- NOTE: 100000d is > 270 years, so this test should be ok for a while.
-- However, if this test is still in use in 270 years and it starts failing,
-- try increasing the number of days 😂
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > now() - 100000d;
-- NOT NULL test
-- WHERE tag1 != '' is equivalent to tag1 IS NOT NULL
-- TODO(sgc): This is working, but likely by accident
-- IOX_COMPARE: sorted
SELECT tag1, f64 FROM m0 WHERE tag1 != '';
-- NULL test
-- WHERE tag1 = '' is equivalent to tag1 IS NULL
-- TODO(sgc): Not working, as expected
-- -- IOX_COMPARE: sorted
--
-- SELECT tag1, f64 FROM m0 WHERE tag1 = '';
--
@ -292,6 +272,9 @@ SELECT usage_idle, bytes_free, device, cpu FROM cpu, disk GROUP BY device, cpu;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0 GROUP BY tag0;
-- IOX_COMPARE: no_borders
EXPLAIN SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1 GROUP BY tag0;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1 GROUP BY tag0;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0 GROUP BY tag0, non_existent;
@ -304,6 +287,7 @@ SELECT COUNT(f64) as the_count, SUM(f64) + SUM(non_existent) as foo FROM m0;
-- measurements with different schema
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none);
@ -327,6 +311,7 @@ SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s, 1s) FILL(none);
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY TIME(1s) FILL(none);
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY cpu;

View File

@ -10,7 +10,7 @@ async fn schema_merge_nonexistent_column() {
setup_name: "MultiChunkSchemaMerge",
sql: "SELECT * from cpu where foo = 8",
expected_error_code: tonic::Code::InvalidArgument,
expected_message: r#"Error while planning query: Schema error: No field named "foo". Valid fields are "cpu"."host", "cpu"."region", "cpu"."system", "cpu"."time", "cpu"."user"."#,
expected_message: r#"Error while planning query: Schema error: No field named foo. Valid fields are cpu.host, cpu.region, cpu.system, cpu.time, cpu.user."#,
}
.run()
.await;

View File

@ -24,11 +24,11 @@ prost = "0.11"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
schema = { path = "../schema" }
serde_json = "1.0.95"
serde_json = "1.0.96"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread"] }
tokio-stream = "0.1.12"
thiserror = "1.0.40"
tonic = { version = "0.8" }
tonic = { workspace = true }
[dev-dependencies]
insta = { version = "1" }

View File

@ -36,8 +36,5 @@ pub mod store;
/// Client for testing purposes.
pub mod test;
/// Client for fetching write info
pub mod write_info;
/// Client for write API
pub mod write;

View File

@ -29,9 +29,9 @@ use arrow_flight::{
error::{FlightError, Result},
sql::{
ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery,
CommandStatementQuery, ProstMessageExt,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery, ProstMessageExt,
},
Action, FlightClient, FlightDescriptor, FlightInfo, IpcMessage, Ticket,
};
@ -153,6 +153,56 @@ impl FlightSqlClient {
self.do_get_with_cmd(msg.as_any()).await
}
/// List a description of the foreign key columns in the given foreign key table that
/// reference the primary key or the columns representing a unique constraint of the
/// parent table (could be the same or a different table) on this server using a
/// [`CommandGetCrossReference`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/f0c8229f5a09fe53186df171d518430243ddf112/format/FlightSql.proto#L1405-L1477>
///
/// pk_catalog: The catalog name where the parent table is.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// pk_db_schema: The Schema name where the parent table is.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// pk_table: The parent table name. It cannot be null.
///
/// fk_catalog: The catalog name where the foreign table is.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// fk_db_schema: The schema name where the foreign table is.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// fk_table: The foreign table name. It cannot be null.
///
/// This implementation does not support alternate endpoints
pub async fn get_cross_reference(
&mut self,
pk_catalog: Option<impl Into<String> + Send>,
pk_db_schema: Option<impl Into<String> + Send>,
pk_table: String,
fk_catalog: Option<impl Into<String> + Send>,
fk_db_schema: Option<impl Into<String> + Send>,
fk_table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetCrossReference {
pk_catalog: pk_catalog.map(|s| s.into()),
pk_db_schema: pk_db_schema.map(|s| s.into()),
pk_table,
fk_catalog: fk_catalog.map(|s| s.into()),
fk_db_schema: fk_db_schema.map(|s| s.into()),
fk_table,
};
self.do_get_with_cmd(msg.as_any()).await
}
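// Hypothetical usage sketch (not part of this diff): fetch the cross-reference
// metadata between two tables, leaving every catalog/schema filter unset so the
// search is not narrowed. `client` is assumed to be an already-connected
// `FlightSqlClient`; the table names are placeholders.
async fn cross_reference_example(
    client: &mut FlightSqlClient,
) -> Result<FlightRecordBatchStream> {
    client
        .get_cross_reference(
            None::<String>, // pk_catalog
            None::<String>, // pk_db_schema
            "primary_table".to_string(),
            None::<String>, // fk_catalog
            None::<String>, // fk_db_schema
            "foreign_table".to_string(),
        )
        .await
}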
/// List the schemas on this server
///
/// # Parameters
@ -182,6 +232,71 @@ impl FlightSqlClient {
self.do_get_with_cmd(msg.as_any()).await
}
/// List a description of the foreign key columns that reference the given
/// table's primary key columns (the foreign keys exported by a table) of a
/// table on this server using a [`CommandGetExportedKeys`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/0434ab65075ecd1d2ab9245bcd7ec6038934ed29/format/FlightSql.proto#L1307-L1352>
///
/// catalog: Specifies the catalog to search for the foreign key table.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// db_schema: Specifies the schema to search for the foreign key table.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// table: Specifies the foreign key table to get the foreign keys for.
///
/// This implementation does not support alternate endpoints
pub async fn get_exported_keys(
&mut self,
catalog: Option<impl Into<String> + Send>,
db_schema: Option<impl Into<String> + Send>,
table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetExportedKeys {
catalog: catalog.map(|s| s.into()),
db_schema: db_schema.map(|s| s.into()),
table,
};
self.do_get_with_cmd(msg.as_any()).await
}
/// List the foreign keys of a table on this server using a
/// [`CommandGetImportedKeys`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/196222dbd543d6931f4a1432845add97be0db802/format/FlightSql.proto#L1354-L1403>
///
/// catalog: Specifies the catalog to search for the primary key table.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// db_schema: Specifies the schema to search for the primary key table.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// table: Specifies the primary key table to get the foreign keys for.
///
/// This implementation does not support alternate endpoints
pub async fn get_imported_keys(
&mut self,
catalog: Option<impl Into<String> + Send>,
db_schema: Option<impl Into<String> + Send>,
table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetImportedKeys {
catalog: catalog.map(|s| s.into()),
db_schema: db_schema.map(|s| s.into()),
table,
};
self.do_get_with_cmd(msg.as_any()).await
}
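// Hypothetical usage sketch (not part of this diff): `get_exported_keys` and
// `get_imported_keys` take the same arguments and differ only in which side of
// the foreign-key relationship the named table sits on. `client` is assumed to
// be an already-connected `FlightSqlClient`; "the_table" is a placeholder name.
async fn foreign_key_examples(client: &mut FlightSqlClient) -> Result<()> {
    // Keys exported by "the_table", i.e. foreign keys in other tables that
    // reference its primary key.
    let _exported = client
        .get_exported_keys(None::<String>, None::<String>, "the_table".to_string())
        .await?;
    // Foreign keys declared on "the_table" itself.
    let _imported = client
        .get_imported_keys(None::<String>, None::<String>, "the_table".to_string())
        .await?;
    Ok(())
}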
/// List the primary keys on this server using a [`CommandGetPrimaryKeys`] message.
///
/// # Parameters

View File

@ -1,52 +0,0 @@
use client_util::connection::GrpcConnection;
use self::generated_types::{write_info_service_client::WriteInfoServiceClient, *};
use crate::connection::Connection;
use crate::error::Error;
/// Re-export generated_types
pub mod generated_types {
pub use generated_types::influxdata::iox::ingester::v1::{
write_info_service_client, write_info_service_server, GetWriteInfoRequest,
GetWriteInfoResponse, ShardInfo, ShardStatus,
};
pub use generated_types::write_info::merge_responses;
}
/// A basic client for fetching information about write tokens from a
/// single ingester.
///
/// NOTE: This is an ALPHA / Internal API that is used as part of the
/// end to end tests.
///
/// A public API is tracked here:
/// <https://github.com/influxdata/influxdb_iox/issues/4354>
#[derive(Debug, Clone)]
pub struct Client {
inner: WriteInfoServiceClient<GrpcConnection>,
}
impl Client {
/// Creates a new client with the provided connection
pub fn new(connection: Connection) -> Self {
Self {
inner: WriteInfoServiceClient::new(connection.into_grpc_connection()),
}
}
/// Get the write information for a write token
pub async fn get_write_info(
&mut self,
write_token: &str,
) -> Result<GetWriteInfoResponse, Error> {
let response = self
.inner
.get_write_info(GetWriteInfoRequest {
write_token: write_token.to_string(),
})
.await?;
Ok(response.into_inner())
}
}

View File

@ -1,4 +1,5 @@
use arrow::array::StringArray;
use arrow::array::{Array, ArrayData, StringArray};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use arrow::util::display::ArrayFormatter;
@ -29,9 +30,38 @@ pub enum Error {
}
type Result<T, E = Error> = std::result::Result<T, E>;
/// Options for controlling how table borders are rendered.
#[derive(Debug, Default, Clone, Copy)]
pub enum TableBorders {
/// Use ASCII characters.
#[default]
Ascii,
/// Use UNICODE box-drawing characters.
Unicode,
/// Do not render borders.
None,
}
/// Options for the [`write_columnar`] function.
#[derive(Debug, Default)]
pub struct Options {
/// Specify how borders should be rendered.
pub borders: TableBorders,
}
impl Options {
fn table_preset(&self) -> &'static str {
match self.borders {
TableBorders::Ascii => "||--+-++| ++++++",
TableBorders::Unicode => comfy_table::presets::UTF8_FULL,
TableBorders::None => comfy_table::presets::NOTHING,
}
}
}
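// Hypothetical usage sketch (not part of this diff): render InfluxQL query results
// without table borders, e.g. when comparing output where border characters only
// add noise. `batches` is assumed to hold the RecordBatches returned for an
// InfluxQL query; `write_columnar`, `Options` and `TableBorders` are the items
// defined in this module.
fn print_borderless(batches: &[RecordBatch]) -> Result<()> {
    let opts = Options {
        borders: TableBorders::None,
    };
    // Write the formatted measurements to stdout rather than an in-memory buffer.
    write_columnar(std::io::stdout(), batches, opts)
}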
/// Write the record batches in a columnar format.
pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()> {
let options = arrow::util::display::FormatOptions::default().with_display_error(true);
pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch], options: Options) -> Result<()> {
let arrow_opts = arrow::util::display::FormatOptions::default().with_display_error(true);
let Some(schema) = batches.first().map(|b|b.schema()) else { return Ok(()) };
let md = schema
@ -68,7 +98,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
let new_table = || {
let mut table = Table::new();
table.load_preset("||--+-++| ++++++");
table.load_preset(options.table_preset());
table.set_header(header.clone());
table
};
@ -78,7 +108,9 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
for batch in batches {
let cols = col_indexes
.iter()
.map(|idx| ArrayFormatter::try_new(batch.column(*idx), &options).map_err(Error::Arrow))
.map(|idx| {
ArrayFormatter::try_new(batch.column(*idx), &arrow_opts).map_err(Error::Arrow)
})
.collect::<Result<Vec<_>>>()?;
let measurement = batch
@ -87,6 +119,10 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
.downcast_ref::<StringArray>()
.expect("expected measurement column to be a StringArray");
// create an empty string array for any tag columns that are NULL
let empty: StringArray =
StringArray::from(ArrayData::new_null(&DataType::Utf8, measurement.len()));
let tag_vals = tag_key_indexes
.iter()
.map(|idx| {
@ -94,7 +130,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
.column(*idx)
.as_any()
.downcast_ref::<StringArray>()
.expect("expected tag column to be a StringArray")
.unwrap_or(&empty)
})
.collect::<Vec<_>>();
@ -160,7 +196,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
#[cfg(test)]
mod test {
use crate::format::influxql::write_columnar;
use crate::format::influxql::{write_columnar, Options};
use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray, TimestampNanosecondArray};
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use arrow::record_batch::RecordBatch;
@ -241,7 +277,7 @@ mod test {
tag_key_columns: vec![],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -271,7 +307,7 @@ mod test {
}],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -309,7 +345,7 @@ mod test {
}],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -354,7 +390,7 @@ mod test {
],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu

View File

@ -9,7 +9,7 @@ license.workspace = true
client_util = { path = "../client_util" }
generated_types = { path = "../generated_types", default-features=false, features=["data_types"] }
prost = "0.11"
tonic = { version = "0.8" }
tonic = { workspace = true }
futures-util = { version = "0.3" }
observability_deps = { path = "../observability_deps"}
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -1,57 +0,0 @@
[package]
name = "ingest_replica"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
arrow = { workspace = true, features = ["prettyprint"] }
arrow-flight = { workspace = true }
arrow_util = { version = "0.1.0", path = "../arrow_util" }
async-channel = "1.8.0"
async-trait = "0.1.60"
backoff = { version = "0.1.0", path = "../backoff" }
bytes = "1.3.0"
crossbeam-utils = "0.8.14"
data_types = { version = "0.1.0", path = "../data_types" }
datafusion.workspace = true
datafusion_util = { path = "../datafusion_util" }
flatbuffers = "23.1.21"
futures = "0.3.25"
generated_types = { version = "0.1.0", path = "../generated_types" }
hashbrown.workspace = true
iox_catalog = { version = "0.1.0", path = "../iox_catalog" }
iox_query = { version = "0.1.0", path = "../iox_query" }
iox_time = { path = "../iox_time" }
metric = { version = "0.1.0", path = "../metric" }
mutable_batch = { version = "0.1.0", path = "../mutable_batch" }
mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" }
object_store = "0.5.2"
observability_deps = { version = "0.1.0", path = "../observability_deps" }
once_cell = "1.17"
parking_lot = "0.12.1"
parquet_file = { version = "0.1.0", path = "../parquet_file" }
pin-project = "1.0.12"
predicate = { version = "0.1.0", path = "../predicate" }
prost = { version = "0.11.2", default-features = false, features = ["std"] }
rand = "0.8.5"
schema = { version = "0.1.0", path = "../schema" }
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
thiserror = "1.0.38"
test_helpers = { path = "../test_helpers", features = ["future_timeout"], optional = true }
tokio = { version = "1.22", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tonic = "0.8.3"
trace = { version = "0.1.0", path = "../trace" }
uuid = "1.2.2"
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
assert_matches = "1.5.0"
criterion = { version = "0.4", default-features = false, features = ["async_tokio"]}
datafusion_util = { path = "../datafusion_util" }
lazy_static = "1.4.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.11"
tempfile = "3.3.0"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }

View File

@ -1,93 +0,0 @@
//! In-memory queryable buffer of data sent from one or more ingesters. It evicts data from the
//! buffer when persist requests are sent in.
use crate::{
cache::SchemaCache,
query::{response::QueryResponse, QueryError, QueryExec},
BufferError, ReplicationBuffer, TableIdToMutableBatch,
};
use async_trait::async_trait;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, PartitionId, SequenceNumber, TableId,
};
use iox_query::exec::Executor;
use std::sync::Arc;
use trace::span::Span;
use uuid::Uuid;
#[derive(Debug)]
pub(crate) struct Buffer {
_schema_cache: Arc<SchemaCache>,
_exec: Arc<Executor>,
}
impl Buffer {
pub(crate) fn new(_schema_cache: Arc<SchemaCache>, _exec: Arc<Executor>) -> Self {
Self {
_schema_cache,
_exec,
}
}
pub(crate) async fn apply_write(
&self,
_namespace_id: NamespaceId,
_table_batches: TableIdToMutableBatch,
_ingester_id: Uuid,
_sequence_number: SequenceNumber,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
}
#[async_trait]
impl ReplicationBuffer for Buffer {
async fn apply_write(
&self,
namespace_id: NamespaceId,
table_batches: TableIdToMutableBatch,
ingester_id: Uuid,
sequence_number: SequenceNumber,
) -> Result<(), BufferError> {
self.apply_write(namespace_id, table_batches, ingester_id, sequence_number)
.await
}
async fn apply_persist(
&self,
_ingester_id: Uuid,
_namespace_id: NamespaceId,
_table_id: TableId,
_partition_id: PartitionId,
_sequence_set: SequenceNumberSet,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
async fn append_partition_buffer(
&self,
_ingester_id: Uuid,
_namespace_id: NamespaceId,
_table_id: TableId,
_partition_id: PartitionId,
_sequence_set: SequenceNumberSet,
_table_batches: TableIdToMutableBatch,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
}
#[async_trait]
impl QueryExec for Buffer {
type Response = QueryResponse;
async fn query_exec(
&self,
_namespace_id: NamespaceId,
_table_id: TableId,
_columns: Vec<String>,
_span: Option<Span>,
) -> Result<Self::Response, QueryError> {
panic!("unimplemented");
}
}

View File

@ -1,250 +0,0 @@
//! A cache of table schemas and partition sort keys for use with the buffer to answer Flight
//! requests.
use data_types::{NamespaceId, PartitionId, PartitionKey, ShardId, TableId, TableSchema};
use iox_catalog::interface::{
get_table_schema_by_id, list_schemas, Catalog, Error as CatalogError,
};
use parking_lot::RwLock;
use std::{collections::BTreeMap, ops::DerefMut, sync::Arc};
use thiserror::Error;
/// Errors that occur during the use of the cache.
#[derive(Debug, Error)]
pub enum CacheError {
#[error("namespace {id:?} not found")]
NamespaceNotFound { id: NamespaceId },
#[error("table {id:?} not found")]
TableNotFound { id: TableId },
#[error("partition for table {table_id:?} and partition key {partition_key:?} not found")]
PartitionNotFound {
table_id: TableId,
partition_key: PartitionKey,
},
#[error("catalog error: {0}")]
Catalog(#[from] CatalogError),
}
#[derive(Debug)]
pub(crate) struct SchemaCache {
state: RwLock<State>,
catalog: Arc<dyn Catalog>,
transition_shard_id: ShardId,
}
#[derive(Debug, Default)]
struct State {
partition_ids: BTreeMap<(TableId, PartitionKey), PartitionId>,
table_schemas: BTreeMap<TableId, Arc<TableSchema>>,
}
const RECENT_PARTITION_COUNT_TO_WARM: usize = 40000;
impl SchemaCache {
pub async fn warm(&self) -> Result<(), CacheError> {
let namespaces = list_schemas(&*self.catalog).await?.collect::<Vec<_>>();
let partitions = self
.catalog
.repositories()
.await
.partitions()
.most_recent_n(RECENT_PARTITION_COUNT_TO_WARM)
.await?;
let mut state = self.state.write();
for (_namespace, schema) in namespaces {
for (_table_name, table_schema) in schema.tables {
state
.table_schemas
.insert(table_schema.id, Arc::new(table_schema));
}
}
for partition in partitions {
state
.partition_ids
.insert((partition.table_id, partition.partition_key), partition.id);
}
Ok(())
}
pub fn new(catalog: Arc<dyn Catalog>, transition_shard_id: ShardId) -> Self {
Self {
catalog,
state: Default::default(),
transition_shard_id,
}
}
pub async fn get_table_schema(
&self,
table_id: TableId,
) -> Result<Arc<TableSchema>, CacheError> {
match self.get_table_schema_from_cache(&table_id) {
Some(t) => Ok(t),
None => {
let table_schema = {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.deref_mut()).await?
};
let table_schema = Arc::new(table_schema);
let mut s = self.state.write();
s.table_schemas.insert(table_id, Arc::clone(&table_schema));
Ok(table_schema)
}
}
}
fn get_table_schema_from_cache(&self, table_id: &TableId) -> Option<Arc<TableSchema>> {
let s = self.state.read();
s.table_schemas.get(table_id).cloned()
}
pub async fn get_table_schema_from_catalog(
&self,
table_id: TableId,
) -> Result<Arc<TableSchema>, CacheError> {
let table_schema = {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.deref_mut()).await?
};
let table_schema = Arc::new(table_schema);
let mut s = self.state.write();
s.table_schemas.insert(table_id, Arc::clone(&table_schema));
Ok(table_schema)
}
pub async fn get_partition_id(
&self,
table_id: TableId,
partition_key: PartitionKey,
) -> Result<PartitionId, CacheError> {
let id = match self.get_partition_id_from_cache(table_id, partition_key.clone()) {
Some(k) => k,
None => {
let partition = self
.catalog
.repositories()
.await
.partitions()
.create_or_get(partition_key.clone(), self.transition_shard_id, table_id)
.await?;
let mut s = self.state.write();
s.partition_ids
.insert((table_id, partition_key), partition.id);
partition.id
}
};
Ok(id)
}
fn get_partition_id_from_cache(
&self,
table_id: TableId,
partition_key: PartitionKey,
) -> Option<PartitionId> {
let s = self.state.read();
s.partition_ids.get(&(table_id, partition_key)).cloned()
}
}
#[cfg(test)]
mod tests {
use super::*;
use data_types::{ColumnType, Namespace, Partition, Table};
use iox_catalog::create_or_get_default_records;
use iox_catalog::mem::MemCatalog;
use metric::Registry;
const NAMESPACE_NAME: &str = "foo";
const TABLE_NAME: &str = "bar";
const COLUMN_NAME: &str = "time";
const PARTITION_KEY: &str = "2023-01-08";
#[tokio::test]
async fn warms_cache() {
let (catalog, shard_id, _namespace, table, partition) = get_test_data().await;
let cache = SchemaCache::new(catalog, shard_id);
assert!(cache.get_table_schema_from_cache(&table.id).is_none());
assert!(cache
.get_partition_id_from_cache(table.id, partition.partition_key.clone())
.is_none());
cache.warm().await.unwrap();
assert_eq!(
cache.get_table_schema_from_cache(&table.id).unwrap().id,
table.id
);
assert_eq!(
cache
.get_partition_id_from_cache(table.id, partition.partition_key)
.unwrap(),
partition.id
);
}
#[tokio::test]
async fn gets_table_schema_and_partition_id_from_catalog_if_not_in_cache() {
let (catalog, shard_id, _namespace, table, partition) = get_test_data().await;
let cache = SchemaCache::new(catalog, shard_id);
assert!(cache.get_table_schema_from_cache(&table.id).is_none());
assert!(cache
.get_partition_id_from_cache(table.id, partition.partition_key.clone())
.is_none());
assert_eq!(cache.get_table_schema(table.id).await.unwrap().id, table.id);
assert_eq!(
cache
.get_partition_id(table.id, partition.partition_key)
.await
.unwrap(),
partition.id
);
}
async fn get_test_data() -> (Arc<dyn Catalog>, ShardId, Namespace, Table, Partition) {
let catalog = MemCatalog::new(Arc::new(Registry::new()));
let mut txn = catalog.start_transaction().await.unwrap();
let (topic, query_pool, shards) = create_or_get_default_records(1, txn.deref_mut())
.await
.unwrap();
let shard_id = *shards.keys().next().unwrap();
let namespace = txn
.namespaces()
.create(NAMESPACE_NAME, None, topic.id, query_pool.id)
.await
.unwrap();
let table = txn
.tables()
.create_or_get(TABLE_NAME, namespace.id)
.await
.unwrap();
let _ = txn
.columns()
.create_or_get(COLUMN_NAME, table.id, ColumnType::Time)
.await
.unwrap();
let partition = txn
.partitions()
.create_or_get(PARTITION_KEY.into(), shard_id, table.id)
.await
.unwrap();
txn.commit().await.unwrap();
(Arc::new(catalog), shard_id, namespace, table, partition)
}
}
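A hedged usage sketch of the cache API above; the `lookup` function and its arguments are placeholders, and errors are simply propagated with `?`:
async fn lookup(
    catalog: Arc<dyn Catalog>,
    shard_id: ShardId,
    table_id: TableId,
    partition_key: PartitionKey,
) -> Result<(Arc<TableSchema>, PartitionId), CacheError> {
    // Build the cache, pre-warm it from the catalog, then serve lookups that
    // fall back to the catalog on a miss.
    let cache = SchemaCache::new(catalog, shard_id);
    cache.warm().await?;
    let schema = cache.get_table_schema(table_id).await?;
    let partition_id = cache.get_partition_id(table_id, partition_key).await?;
    Ok((schema, partition_id))
}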

View File

@ -1,67 +0,0 @@
mod query;
mod replication;
use std::{fmt::Debug, sync::Arc};
use arrow_flight::flight_service_server::FlightServiceServer;
use generated_types::influxdata::iox::ingester::v1::replication_service_server::ReplicationServiceServer;
use crate::ReplicationBuffer;
use crate::{
query::{response::QueryResponse, QueryExec},
IngestReplicaRpcInterface,
};
use self::replication::ReplicationServer;
/// This type is responsible for injecting internal dependencies that SHOULD NOT
/// leak outside of the ingester crate into public gRPC handlers.
///
/// Configuration and external dependencies SHOULD be injected through the
/// respective gRPC handler constructor method.
#[derive(Debug)]
pub(crate) struct GrpcDelegate<B> {
buffer: Arc<B>,
metrics: Arc<metric::Registry>,
}
impl<B> GrpcDelegate<B>
where
B: ReplicationBuffer + QueryExec<Response = QueryResponse> + 'static,
{
/// Initialise a new [`GrpcDelegate`].
pub(crate) fn new(buffer: Arc<B>, metrics: Arc<metric::Registry>) -> Self {
Self { buffer, metrics }
}
}
/// Implement the type-erasure trait to hide internal types from crate-external
/// callers.
impl<B> IngestReplicaRpcInterface for GrpcDelegate<B>
where
B: ReplicationBuffer + QueryExec<Response = QueryResponse> + 'static,
{
type ReplicationHandler = ReplicationServer<B>;
type FlightHandler = query::FlightService<Arc<B>>;
/// Return a [`ReplicationService`] gRPC implementation.
///
/// [`ReplicationService`]: generated_types::influxdata::iox::ingester::v1::replication_service_server::ReplicationService
fn replication_service(&self) -> ReplicationServiceServer<Self::ReplicationHandler> {
ReplicationServiceServer::new(ReplicationServer::new(Arc::clone(&self.buffer)))
}
/// Return an Arrow [`FlightService`] gRPC implementation.
///
/// [`FlightService`]: arrow_flight::flight_service_server::FlightService
fn query_service(
&self,
max_simultaneous_requests: usize,
) -> FlightServiceServer<Self::FlightHandler> {
FlightServiceServer::new(query::FlightService::new(
Arc::clone(&self.buffer),
max_simultaneous_requests,
&self.metrics,
))
}
}

View File

@ -1,363 +0,0 @@
use std::pin::Pin;
use arrow_flight::{
encode::FlightDataEncoderBuilder, error::FlightError,
flight_service_server::FlightService as Flight, Action, ActionType, Criteria, Empty,
FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage,
PutResult, SchemaResult, Ticket,
};
use data_types::{NamespaceId, PartitionId, TableId};
use flatbuffers::FlatBufferBuilder;
use futures::{stream::BoxStream, Stream, StreamExt, TryStreamExt};
use generated_types::influxdata::iox::ingester::v1::{self as proto, PartitionStatus};
use metric::U64Counter;
use observability_deps::tracing::*;
use prost::Message;
use thiserror::Error;
use tokio::sync::{Semaphore, TryAcquireError};
use tonic::{Request, Response, Streaming};
use trace::{ctx::SpanContext, span::SpanExt};
use uuid::Uuid;
use crate::query::{response::QueryResponse, QueryError, QueryExec};
/// Error states for the query RPC handler.
///
/// Note that this DOES NOT include any query-time error states - those are
/// mapped directly from the [`QueryError`] itself.
///
/// Note that this isn't strictly necessary as the [`FlightService`] trait
/// expects a [`tonic::Status`] error value, but by defining the errors here
/// they serve as documentation of the potential error states (which are then
/// converted into [`tonic::Status`] for the handler).
#[derive(Debug, Error)]
enum Error {
/// The payload within the Flight ticket cannot be deserialised into a
/// [`proto::IngesterQueryRequest`].
#[error("invalid flight ticket: {0}")]
InvalidTicket(#[from] prost::DecodeError),
/// The number of simultaneous queries being executed has been reached.
#[error("simultaneous query limit exceeded")]
RequestLimit,
}
/// Map a query-execution error into a [`tonic::Status`].
impl From<QueryError> for tonic::Status {
fn from(e: QueryError) -> Self {
use tonic::Code;
let code = match e {
QueryError::TableNotFound(_, _) | QueryError::NamespaceNotFound(_) => Code::NotFound,
};
Self::new(code, e.to_string())
}
}
/// Map a gRPC handler error to a [`tonic::Status`].
impl From<Error> for tonic::Status {
fn from(e: Error) -> Self {
use tonic::Code;
let code = match e {
Error::InvalidTicket(_) => {
debug!(error=%e, "invalid flight query ticket");
Code::InvalidArgument
}
Error::RequestLimit => {
warn!("simultaneous query limit exceeded");
Code::ResourceExhausted
}
};
Self::new(code, e.to_string())
}
}
/// Concrete implementation of the gRPC Arrow Flight Service API
#[derive(Debug)]
pub(crate) struct FlightService<Q> {
query_handler: Q,
/// A request limiter to restrict the number of simultaneous requests this
/// ingester services.
///
/// This allows the ingester to drop a portion of requests when experiencing
/// an unusual flood of requests
request_sem: Semaphore,
/// Number of queries rejected due to lack of available `request_sem`
/// permit.
query_request_limit_rejected: U64Counter,
ingester_uuid: Uuid,
}
impl<Q> FlightService<Q> {
pub(super) fn new(
query_handler: Q,
max_simultaneous_requests: usize,
metrics: &metric::Registry,
) -> Self {
let query_request_limit_rejected = metrics
.register_metric::<U64Counter>(
"query_request_limit_rejected",
"number of query requests rejected due to exceeding parallel request limit",
)
.recorder(&[]);
Self {
query_handler,
request_sem: Semaphore::new(max_simultaneous_requests),
query_request_limit_rejected,
ingester_uuid: Uuid::new_v4(),
}
}
}
type TonicStream<T> = Pin<Box<dyn Stream<Item = Result<T, tonic::Status>> + Send + 'static>>;
#[tonic::async_trait]
impl<Q> Flight for FlightService<Q>
where
Q: QueryExec<Response = QueryResponse> + 'static,
{
type HandshakeStream = TonicStream<HandshakeResponse>;
type ListFlightsStream = TonicStream<FlightInfo>;
type DoGetStream = TonicStream<FlightData>;
type DoPutStream = TonicStream<PutResult>;
type DoActionStream = TonicStream<arrow_flight::Result>;
type ListActionsStream = TonicStream<ActionType>;
type DoExchangeStream = TonicStream<FlightData>;
async fn get_schema(
&self,
_request: Request<FlightDescriptor>,
) -> Result<Response<SchemaResult>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_get(
&self,
request: Request<Ticket>,
) -> Result<Response<Self::DoGetStream>, tonic::Status> {
let span_ctx: Option<SpanContext> = request.extensions().get().cloned();
// Acquire and hold a permit for the duration of this request, or return
// an error if the existing requests have already exhausted the
// allocation.
//
// Our goal is to limit the number of concurrently executing queries as
// a rough way of ensuring we don't explode memory by trying to do too
// much at the same time.
let _permit = match self.request_sem.try_acquire() {
Ok(p) => p,
Err(TryAcquireError::NoPermits) => {
warn!("simultaneous request limit exceeded - dropping query request");
self.query_request_limit_rejected.inc(1);
return Err(Error::RequestLimit)?;
}
Err(e) => panic!("request limiter error: {e}"),
};
let ticket = request.into_inner();
let request = proto::IngesterQueryRequest::decode(&*ticket.ticket).map_err(Error::from)?;
// Extract the namespace/table identifiers
let namespace_id = NamespaceId::new(request.namespace_id);
let table_id = TableId::new(request.table_id);
// Predicate pushdown is part of the API, but not implemented.
if let Some(p) = request.predicate {
warn!(predicate=?p, "ignoring query predicate (unsupported)");
}
let response = self
.query_handler
.query_exec(
namespace_id,
table_id,
request.columns,
span_ctx.child_span("ingester query"),
)
.await?;
let output = encode_response(response, self.ingester_uuid).map_err(tonic::Status::from);
Ok(Response::new(Box::pin(output) as Self::DoGetStream))
}
async fn handshake(
&self,
request: Request<Streaming<HandshakeRequest>>,
) -> Result<Response<Self::HandshakeStream>, tonic::Status> {
let request = request.into_inner().message().await?.unwrap();
let response = HandshakeResponse {
protocol_version: request.protocol_version,
payload: request.payload,
};
let output = futures::stream::iter(std::iter::once(Ok(response)));
Ok(Response::new(Box::pin(output) as Self::HandshakeStream))
}
async fn list_flights(
&self,
_request: Request<Criteria>,
) -> Result<Response<Self::ListFlightsStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn get_flight_info(
&self,
_request: Request<FlightDescriptor>,
) -> Result<Response<FlightInfo>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_put(
&self,
_request: Request<Streaming<FlightData>>,
) -> Result<Response<Self::DoPutStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_action(
&self,
_request: Request<Action>,
) -> Result<Response<Self::DoActionStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn list_actions(
&self,
_request: Request<Empty>,
) -> Result<Response<Self::ListActionsStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_exchange(
&self,
_request: Request<Streaming<FlightData>>,
) -> Result<Response<Self::DoExchangeStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
}
/// Encode the partition information as a None flight data with metadata
fn encode_partition(
// Partition ID.
partition_id: PartitionId,
// Partition persistence status.
status: PartitionStatus,
// Count of persisted Parquet files
completed_persistence_count: u64,
ingester_uuid: Uuid,
) -> std::result::Result<FlightData, FlightError> {
let mut bytes = bytes::BytesMut::new();
let app_metadata = proto::IngesterQueryResponseMetadata {
partition_id: partition_id.get(),
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: status.parquet_max_sequence_number,
}),
ingester_uuid: ingester_uuid.to_string(),
completed_persistence_count,
};
prost::Message::encode(&app_metadata, &mut bytes)
.map_err(|e| FlightError::from_external_error(Box::new(e)))?;
Ok(FlightData::new(
None,
IpcMessage(build_none_flight_msg().into()),
bytes.to_vec(),
vec![],
))
}
fn build_none_flight_msg() -> Vec<u8> {
let mut fbb = FlatBufferBuilder::new();
let mut message = arrow::ipc::MessageBuilder::new(&mut fbb);
message.add_version(arrow::ipc::MetadataVersion::V5);
message.add_header_type(arrow::ipc::MessageHeader::NONE);
message.add_bodyLength(0);
let data = message.finish();
fbb.finish(data, None);
fbb.finished_data().to_vec()
}
/// Converts a QueryResponse into a stream of Arrow Flight [`FlightData`] response frames.
fn encode_response(
response: QueryResponse,
ingester_uuid: Uuid,
) -> BoxStream<'static, std::result::Result<FlightData, FlightError>> {
response
.into_partition_stream()
.flat_map(move |partition| {
let partition_id = partition.id();
let completed_persistence_count = partition.completed_persistence_count();
let head = futures::stream::once(async move {
encode_partition(
partition_id,
PartitionStatus {
parquet_max_sequence_number: None,
},
completed_persistence_count,
ingester_uuid,
)
});
match partition.into_record_batch_stream() {
Some(stream) => {
let stream = stream.map_err(|e| FlightError::ExternalError(Box::new(e)));
let tail = FlightDataEncoderBuilder::new().build(stream);
head.chain(tail).boxed()
}
None => head.boxed(),
}
})
.boxed()
}
#[cfg(test)]
mod tests {
use bytes::Bytes;
use tonic::Code;
use crate::query::mock_query_exec::MockQueryExec;
use super::*;
#[tokio::test]
async fn limits_concurrent_queries() {
let mut flight =
FlightService::new(MockQueryExec::default(), 100, &metric::Registry::default());
let req = tonic::Request::new(Ticket {
ticket: Bytes::new(),
});
match flight.do_get(req).await {
Ok(_) => panic!("expected error because of invalid ticket"),
Err(s) => {
assert_eq!(s.code(), Code::NotFound); // Mock response value
}
}
flight.request_sem = Semaphore::new(0);
let req = tonic::Request::new(Ticket {
ticket: Bytes::new(),
});
match flight.do_get(req).await {
Ok(_) => panic!("expected error because of request limit"),
Err(s) => {
assert_eq!(s.code(), Code::ResourceExhausted);
}
}
}
}
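As an illustrative sketch of the admission pattern described in the do_get() comments above (the function name and the error message are placeholders, not part of this code):
use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError};

// Try to admit one more in-flight query, failing fast with RESOURCE_EXHAUSTED
// once all permits are taken. The permit must be held for the request's lifetime.
fn try_admit(sem: &Semaphore) -> Result<SemaphorePermit<'_>, tonic::Status> {
    match sem.try_acquire() {
        Ok(permit) => Ok(permit),
        Err(TryAcquireError::NoPermits) => Err(tonic::Status::resource_exhausted(
            "simultaneous query limit exceeded",
        )),
        Err(e) => panic!("request limiter error: {e}"),
    }
}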

View File

@ -1,223 +0,0 @@
use std::sync::Arc;
use data_types::sequence_number_set::SequenceNumberSet;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, TableId};
use generated_types::influxdata::iox::ingester::v1::{
self as proto, replication_service_server::ReplicationService,
};
use mutable_batch::writer;
use mutable_batch_pb::decode::decode_database_batch;
use observability_deps::tracing::*;
use thiserror::Error;
use tonic::{Code, Request, Response};
use uuid::Uuid;
use crate::{BufferError, ReplicationBuffer};
/// A list of error states when handling a ReplicationService request.
#[derive(Debug, Error)]
enum ReplicationError {
/// The replication request did not contain a write payload.
#[error("replication request does not contain a payload")]
NoPayload,
/// The replication payload contains no tables.
#[error("replication request does not contain any table data")]
NoTables,
/// The replication request didn't contain an ingester id
#[error("replication request does not contain an ingester id")]
NoIngesterId,
/// The replication request had an invalid sequence number set
#[error("replication request to persist contained invalid sequence number set {0}")]
InvalidSequenceNumberSet(String),
/// Ingester ID not a valid UUID
#[error("replication request does not contain valid ingester uuid")]
InvalidIngesterId(#[from] uuid::Error),
/// The serialised write payload could not be read.
#[error(transparent)]
Decode(mutable_batch_pb::decode::Error),
/// An error buffering the write or persist
#[error("error buffering replciation request: {0}")]
Buffer(#[from] BufferError),
}
impl From<ReplicationError> for tonic::Status {
fn from(e: ReplicationError) -> Self {
let code = match e {
ReplicationError::Decode(_)
| ReplicationError::NoPayload
| ReplicationError::NoTables
| ReplicationError::NoIngesterId
| ReplicationError::InvalidIngesterId(_)
| ReplicationError::InvalidSequenceNumberSet(_) => Code::InvalidArgument,
ReplicationError::Buffer(_) => Code::Internal,
};
Self::new(code, e.to_string())
}
}
/// Convert a [`BufferError`] returned by the configured [`ReplicationBuffer`] to a
/// [`tonic::Status`].
impl From<BufferError> for tonic::Status {
fn from(e: BufferError) -> Self {
match e {
BufferError::MutableBatch(e) => map_write_error(e),
}
}
}
/// Map a [`mutable_batch::Error`] to a [`tonic::Status`].
///
/// This method takes care to enumerate all possible error states, so that new
/// error additions cause a compilation failure, and therefore require the new
/// error to be explicitly mapped to a gRPC status code.
fn map_write_error(e: mutable_batch::Error) -> tonic::Status {
use tonic::Status;
match e {
mutable_batch::Error::ColumnError { .. }
| mutable_batch::Error::ArrowError { .. }
| mutable_batch::Error::InternalSchema { .. }
| mutable_batch::Error::ColumnNotFound { .. }
| mutable_batch::Error::WriterError {
source: writer::Error::KeyNotFound { .. } | writer::Error::InsufficientValues { .. },
} => Status::internal(e.to_string()),
mutable_batch::Error::WriterError {
source: writer::Error::TypeMismatch { .. },
} => {
// While a schema type conflict is ultimately a user error, if it
// reaches the ingester it should have already passed through schema
// validation in the router, and as such it is an internal system
// failure.
Status::internal(e.to_string())
}
}
}
/// A gRPC [`ReplicationService`] handler.
///
/// This handler accepts writes from an upstream, and applies them to the
/// provided [`ReplicationBuffer`].
pub(crate) struct ReplicationServer<B: ReplicationBuffer + 'static> {
buffer: Arc<B>,
}
impl<B: ReplicationBuffer + 'static> ReplicationServer<B> {
/// Instantiate a new [`ReplicationServer`]
pub(crate) fn new(buffer: Arc<B>) -> Self {
Self { buffer }
}
}
#[tonic::async_trait]
impl<B: ReplicationBuffer + 'static> ReplicationService for ReplicationServer<B> {
/// Handle an RPC write request.
async fn replicate(
&self,
request: Request<proto::ReplicateRequest>,
) -> Result<Response<proto::ReplicateResponse>, tonic::Status> {
// Extract the remote address for debugging.
let remote_addr = request
.remote_addr()
.map(|v| v.to_string())
.unwrap_or_else(|| "<unknown>".to_string());
let request = request.into_inner();
let ingester_id =
Uuid::parse_str(&request.ingester_uuid).map_err(ReplicationError::InvalidIngesterId)?;
// Extract the database batch payload
let payload = request.payload.ok_or(ReplicationError::NoPayload)?;
let batches = decode_database_batch(&payload).map_err(ReplicationError::Decode)?;
let num_tables = batches.len();
let sequence_number = SequenceNumber::new(request.sequence_number);
let namespace_id = NamespaceId::new(payload.database_id);
let partition_key = PartitionKey::from(payload.partition_key);
if num_tables == 0 {
return Err(ReplicationError::NoTables)?;
}
trace!(
remote_addr,
%ingester_id,
?sequence_number,
num_tables,
%namespace_id,
%partition_key,
"received replicate write"
);
match self
.buffer
.apply_write(namespace_id, batches, ingester_id, sequence_number)
.await
{
Ok(()) => {}
Err(e) => {
error!(error=%e, "failed to write into buffer");
return Err(ReplicationError::Buffer(e))?;
}
}
Ok(Response::new(proto::ReplicateResponse {}))
}
async fn persist_complete(
&self,
request: Request<proto::PersistCompleteRequest>,
) -> Result<Response<proto::PersistCompleteResponse>, tonic::Status> {
// Extract the remote address for debugging.
let remote_addr = request
.remote_addr()
.map(|v| v.to_string())
.unwrap_or_else(|| "<unknown>".to_string());
let request = request.into_inner();
let ingester_id =
Uuid::parse_str(&request.ingester_uuid).map_err(ReplicationError::InvalidIngesterId)?;
let namespace_id = NamespaceId::new(request.namespace_id);
let table_id = TableId::new(request.table_id);
let partition_id = PartitionId::new(request.partition_id);
let sequence_set =
SequenceNumberSet::try_from(request.croaring_sequence_number_bitmap.as_ref())
.map_err(ReplicationError::InvalidSequenceNumberSet)?;
trace!(
remote_addr,
?ingester_id,
?namespace_id,
?table_id,
?partition_id,
);
match self
.buffer
.apply_persist(
ingester_id,
namespace_id,
table_id,
partition_id,
sequence_set,
)
.await
{
Ok(()) => {}
Err(e) => {
error!(error=%e, "failed to apply persist to buffer");
return Err(ReplicationError::Buffer(e))?;
}
}
Ok(Response::new(proto::PersistCompleteResponse {}))
}
}
#[cfg(test)]
mod tests {}

View File

@ -1,169 +0,0 @@
//! IOx Ingest Replica implementation
//!
//! The Ingest Replica serves as an in-memory queryable buffer of data from one or more ingesters
//! that are persisting data. It provides horizontal scalability of query workloads on the data in
//! ingesters that has yet to be persisted to Parquet files. It also ensures that the write path
//! and the query path have failure isolation so that an outage in one won't create an outage in
//! the other.
#![allow(dead_code)] // Until ingest_replica is used.
#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)]
#![warn(
clippy::clone_on_ref_ptr,
clippy::dbg_macro,
clippy::explicit_iter_loop,
clippy::future_not_send,
clippy::todo,
clippy::use_self,
missing_copy_implementations,
missing_debug_implementations,
missing_docs
)]
mod buffer;
mod cache;
mod grpc;
mod query;
mod query_adaptor;
use crate::cache::CacheError;
use crate::{buffer::Buffer, cache::SchemaCache, grpc::GrpcDelegate};
use arrow_flight::flight_service_server::{FlightService, FlightServiceServer};
use async_trait::async_trait;
use data_types::sequence_number_set::SequenceNumberSet;
use data_types::{NamespaceId, PartitionId, SequenceNumber, TableId, TRANSITION_SHARD_INDEX};
use generated_types::influxdata::iox::ingester::v1::replication_service_server::{
ReplicationService, ReplicationServiceServer,
};
use hashbrown::HashMap;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use mutable_batch::MutableBatch;
use std::sync::Arc;
use thiserror::Error;
use uuid::Uuid;
/// An error returned by the `ReplicationBuffer`.
#[derive(Debug, Error)]
pub enum BufferError {
/// An error from the mutable batch sent to a buffer.
#[error("mutable batch error: {0}")]
MutableBatch(#[from] mutable_batch::Error),
}
/// Acquire opaque handles to the IngestReplica RPC service implementations.
///
/// This trait serves as the public crate API boundary - callers external to the
/// IngestReplica crate utilise this abstraction to acquire type erased handles to
/// the RPC service implementations, hiding internal IngestReplica implementation
/// details & types.
///
/// Callers can mock out this trait or decorate the returned implementation in
/// order to simulate or modify the behaviour of an ingest_replica in their own tests.
pub trait IngestReplicaRpcInterface: Send + Sync + std::fmt::Debug {
/// The type of the [`ReplicationService`] implementation.
type ReplicationHandler: ReplicationService;
/// The type of the [`FlightService`] implementation.
type FlightHandler: FlightService;
/// Acquire an opaque handle to the IngestReplica's [`ReplicationService`] RPC
/// handler implementation.
fn replication_service(&self) -> ReplicationServiceServer<Self::ReplicationHandler>;
/// Acquire an opaque handle to the Ingester's Arrow Flight
/// [`FlightService`] RPC handler implementation, allowing at most
/// `max_simultaneous_requests` queries to be running at any one time.
fn query_service(
&self,
max_simultaneous_requests: usize,
) -> FlightServiceServer<Self::FlightHandler>;
}
/// Alias for the `TableId` to `MutableBatch` hashmap of data received in write and partition
/// buffer requests.
pub(crate) type TableIdToMutableBatch = HashMap<i64, MutableBatch>;
/// ReplicationBuffer can receive data from the replication protocol to get buffers of partition
/// data, individual write requests, and persistence notification to evict data from the buffer.
#[async_trait]
pub(crate) trait ReplicationBuffer: Send + Sync {
/// Apply an individual write request to the buffer. Can write many rows into many partitions.
async fn apply_write(
&self,
namespace_id: NamespaceId,
table_batches: TableIdToMutableBatch,
ingester_id: Uuid,
sequence_number: SequenceNumber,
) -> Result<(), BufferError>;
/// Apply a persist operation to the buffer, which should clear out the data from the given
/// partition.
async fn apply_persist(
&self,
ingester_id: Uuid,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
sequence_set: SequenceNumberSet,
) -> Result<(), BufferError>;
/// Append an entire partition buffer to the buffer. It should be able to evict this entire
/// buffer in one operation when it later receives a persist operation that has a SequenceSet
/// that is a superset of the one sent here.
async fn append_partition_buffer(
&self,
ingester_id: Uuid,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
sequence_set: SequenceNumberSet,
table_batches: TableIdToMutableBatch,
) -> Result<(), BufferError>;
}
/// Errors that occur during initialisation of an `ingest_replica` instance.
#[derive(Debug, Error)]
pub enum InitError {
/// An error occurred trying to warm the schema cache
#[error("failed to pre-warm schema cache: {0}")]
WarmCache(#[from] CacheError),
}
/// Initialise a new `ingest_replica` instance, returning the gRPC service handler
/// implementations to be bound by the caller.
#[allow(clippy::too_many_arguments)]
pub async fn new(
catalog: Arc<dyn Catalog>,
_ingesters: Vec<String>,
exec: Arc<Executor>,
metrics: Arc<metric::Registry>,
) -> Result<impl IngestReplicaRpcInterface, InitError> {
// Create the transition shard.
let mut txn = catalog
.start_transaction()
.await
.expect("start transaction");
let topic = txn
.topics()
.create_or_get("iox-shared")
.await
.expect("get topic");
let transition_shard = txn
.shards()
.create_or_get(&topic, TRANSITION_SHARD_INDEX)
.await
.expect("create transition shard");
txn.commit().await.expect("commit transition shard");
let schema_cache = Arc::new(SchemaCache::new(Arc::clone(&catalog), transition_shard.id));
schema_cache.warm().await?;
let buffer = Arc::new(Buffer::new(schema_cache, exec));
// TODO: connect to the remote ingesters and subscribe to their data, receiving the
// PartitionBufferResponses into the buffer. Note that the ReplicationService in this
// GrpcDelegate must be running before the requests are sent as the ingester will
// immediately start sending replicate requests.
Ok(GrpcDelegate::new(Arc::clone(&buffer), metrics))
}
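A rough sketch of how the handler returned by new() above might be bound to a gRPC endpoint, assuming a tonic transport `Server`; the `serve` function, bind address, and the request limit of 100 are placeholders:
async fn serve(
    grpc: impl IngestReplicaRpcInterface,
    bind_addr: std::net::SocketAddr,
) -> Result<(), tonic::transport::Error> {
    // Mount both RPC services on a single tonic server; 100 is a placeholder
    // for the maximum number of simultaneous Flight queries.
    tonic::transport::Server::builder()
        .add_service(grpc.replication_service())
        .add_service(grpc.query_service(100))
        .serve(bind_addr)
        .await
}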

View File

@ -1,156 +0,0 @@
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use iox_time::{SystemProvider, TimeProvider};
use metric::{DurationHistogram, Metric};
use trace::span::Span;
use super::QueryExec;
use crate::query::QueryError;
/// An instrumentation decorator over a [`QueryExec`] implementation.
///
/// This wrapper captures the latency distribution of the decorated
/// [`QueryExec::query_exec()`] call, faceted by success/error result.
#[derive(Debug)]
pub(crate) struct QueryExecInstrumentation<T, P = SystemProvider> {
inner: T,
time_provider: P,
/// Query execution duration distribution for successes.
query_duration_success: DurationHistogram,
/// Query execution duration distribution for "not found" errors
query_duration_error_not_found: DurationHistogram,
}
impl<T> QueryExecInstrumentation<T> {
pub(crate) fn new(inner: T, metrics: &metric::Registry) -> Self {
// Record query duration metrics, broken down by query execution result
let query_duration: Metric<DurationHistogram> = metrics.register_metric(
"ingester_flight_query_duration",
"flight request query execution duration",
);
let query_duration_success = query_duration.recorder(&[("result", "success")]);
let query_duration_error_not_found =
query_duration.recorder(&[("result", "error"), ("reason", "not_found")]);
Self {
inner,
time_provider: Default::default(),
query_duration_success,
query_duration_error_not_found,
}
}
}
#[async_trait]
impl<T, P> QueryExec for QueryExecInstrumentation<T, P>
where
T: QueryExec,
P: TimeProvider,
{
type Response = T::Response;
#[inline(always)]
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let t = self.time_provider.now();
let res = self
.inner
.query_exec(namespace_id, table_id, columns, span)
.await;
if let Some(delta) = self.time_provider.now().checked_duration_since(t) {
match &res {
Ok(_) => self.query_duration_success.record(delta),
Err(QueryError::TableNotFound { .. } | QueryError::NamespaceNotFound { .. }) => {
self.query_duration_error_not_found.record(delta)
}
};
}
res
}
}
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use metric::Attributes;
use super::*;
use crate::query::{
mock_query_exec::MockQueryExec,
response::{PartitionStream, QueryResponse},
};
macro_rules! test_metric {
(
$name:ident,
inner = $inner:expr,
want_metric_attr = $want_metric_attr:expr,
want_ret = $($want_ret:tt)+
) => {
paste::paste! {
#[tokio::test]
async fn [<test_metric_ $name>]() {
let metrics = metric::Registry::default();
let decorator = QueryExecInstrumentation::new($inner, &metrics);
// Call the decorator and assert the return value
let got = decorator
.query_exec(NamespaceId::new(42), TableId::new(24), vec![], None)
.await;
assert_matches!(got, $($want_ret)+);
// Validate the histogram with the specified attributes saw
// an observation
let histogram = metrics
.get_instrument::<Metric<DurationHistogram>>("ingester_flight_query_duration")
.expect("failed to find metric")
.get_observer(&Attributes::from(&$want_metric_attr))
.expect("failed to find attributes")
.fetch();
assert_eq!(histogram.sample_count(), 1);
}
}
};
}
test_metric!(
ok,
inner = {
let stream: PartitionStream = PartitionStream::new(futures::stream::iter([]));
MockQueryExec::default().with_result(Ok(QueryResponse::new(stream)))
},
want_metric_attr = [("result", "success")],
want_ret = Ok(_)
);
test_metric!(
namespace_not_found,
inner = MockQueryExec::default()
.with_result(Err(QueryError::NamespaceNotFound(NamespaceId::new(42)))),
want_metric_attr = [("result", "error"), ("reason", "not_found")],
want_ret = Err(QueryError::NamespaceNotFound(ns)) => {
assert_eq!(ns, NamespaceId::new(42));
}
);
test_metric!(
table_not_found,
inner = MockQueryExec::default()
.with_result(Err(QueryError::TableNotFound(NamespaceId::new(42), TableId::new(24)))),
want_metric_attr = [("result", "error"), ("reason", "not_found")],
want_ret = Err(QueryError::TableNotFound(ns, t)) => {
assert_eq!(ns, NamespaceId::new(42));
assert_eq!(t, TableId::new(24));
}
);
}

View File

@ -1,36 +0,0 @@
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use parking_lot::Mutex;
use trace::span::Span;
use super::{response::QueryResponse, QueryError, QueryExec};
#[derive(Debug, Default)]
pub(crate) struct MockQueryExec {
response: Mutex<Option<Result<QueryResponse, QueryError>>>,
}
impl MockQueryExec {
pub(crate) fn with_result(self, r: Result<QueryResponse, QueryError>) -> Self {
*self.response.lock() = Some(r);
self
}
}
#[async_trait]
impl QueryExec for MockQueryExec {
type Response = QueryResponse;
async fn query_exec(
&self,
_namespace_id: NamespaceId,
_table_id: TableId,
_columns: Vec<String>,
_span: Option<Span>,
) -> Result<Self::Response, QueryError> {
self.response
.lock()
.take()
.unwrap_or(Err(QueryError::NamespaceNotFound(NamespaceId::new(42))))
}
}

View File

@ -1,14 +0,0 @@
//! Query execution abstraction & types.
mod r#trait;
pub(crate) use r#trait::*;
// Response types
pub(crate) mod partition_response;
pub(crate) mod response;
pub(crate) mod instrumentation;
pub(crate) mod tracing;
#[cfg(test)]
pub(crate) mod mock_query_exec;

View File

@ -1,63 +0,0 @@
//! The per-partition data nested in a query [`QueryResponse`].
//!
//! [`QueryResponse`]: super::response::QueryResponse
use data_types::PartitionId;
use datafusion::physical_plan::SendableRecordBatchStream;
/// Response data for a single partition.
pub(crate) struct PartitionResponse {
/// Stream of snapshots.
batches: Option<SendableRecordBatchStream>,
/// Partition ID.
id: PartitionId,
/// Count of persisted Parquet files for this partition by this ingester instance.
completed_persistence_count: u64,
}
impl std::fmt::Debug for PartitionResponse {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("PartitionResponse")
.field(
"batches",
&match self.batches {
Some(_) => "<SNAPSHOT STREAM>",
None => "<NO DATA>,",
},
)
.field("partition_id", &self.id)
.field(
"completed_persistence_count",
&self.completed_persistence_count,
)
.finish()
}
}
impl PartitionResponse {
pub(crate) fn new(
data: Option<SendableRecordBatchStream>,
id: PartitionId,
completed_persistence_count: u64,
) -> Self {
Self {
batches: data,
id,
completed_persistence_count,
}
}
pub(crate) fn id(&self) -> PartitionId {
self.id
}
pub(crate) fn completed_persistence_count(&self) -> u64 {
self.completed_persistence_count
}
pub(crate) fn into_record_batch_stream(self) -> Option<SendableRecordBatchStream> {
self.batches
}
}

View File

@ -1,60 +0,0 @@
//! The response type returned from a query [`QueryExec::query_exec()`] call.
//!
//! [`QueryExec::query_exec()`]: super::QueryExec::query_exec()
use std::{future, pin::Pin};
use arrow::record_batch::RecordBatch;
use datafusion::common::DataFusionError;
use futures::{Stream, StreamExt};
use super::partition_response::PartitionResponse;
/// Stream of partitions in this response.
pub(crate) struct PartitionStream(Pin<Box<dyn Stream<Item = PartitionResponse> + Send>>);
impl std::fmt::Debug for PartitionStream {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_tuple("PartitionStream").finish()
}
}
impl PartitionStream {
pub(crate) fn new<T>(s: T) -> Self
where
T: Stream<Item = PartitionResponse> + Send + 'static,
{
Self(s.boxed())
}
}
/// A response stream wrapper for ingester query requests.
///
/// The data structure is constructed to allow lazy/streaming/pull-based data
/// sourcing.
#[derive(Debug)]
pub(crate) struct QueryResponse {
/// Stream of partitions.
partitions: PartitionStream,
}
impl QueryResponse {
/// Make a response
pub(crate) fn new(partitions: PartitionStream) -> Self {
Self { partitions }
}
/// Return the stream of [`PartitionResponse`].
pub(crate) fn into_partition_stream(self) -> impl Stream<Item = PartitionResponse> {
self.partitions.0
}
/// Reduce the [`QueryResponse`] to a stream of [`RecordBatch`].
pub(crate) fn into_record_batches(
self,
) -> impl Stream<Item = Result<RecordBatch, DataFusionError>> {
self.into_partition_stream()
.filter_map(|partition| future::ready(partition.into_record_batch_stream()))
.flatten()
}
}
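For illustration, a small sketch of draining a QueryResponse into memory via the helper above (the function name is a placeholder):
use futures::TryStreamExt;

// Flatten each partition's stream and collect every RecordBatch,
// propagating the first DataFusion error encountered.
async fn collect_batches(response: QueryResponse) -> Result<Vec<RecordBatch>, DataFusionError> {
    response.into_record_batches().try_collect().await
}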

View File

@ -1,148 +0,0 @@
use std::borrow::Cow;
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use trace::span::{Span, SpanRecorder};
use super::QueryExec;
use crate::query::QueryError;
/// A tracing decorator over a [`QueryExec`] implementation.
///
/// This wrapper emits child tracing spans covering the execution of the inner
/// [`QueryExec::query_exec()`] call.
///
/// Constructing this decorator is cheap.
#[derive(Debug)]
pub(crate) struct QueryExecTracing<T> {
inner: T,
name: Cow<'static, str>,
}
impl<T> QueryExecTracing<T> {
pub(crate) fn new(inner: T, name: impl Into<Cow<'static, str>>) -> Self {
Self {
inner,
name: name.into(),
}
}
}
#[async_trait]
impl<T> QueryExec for QueryExecTracing<T>
where
T: QueryExec,
{
type Response = T::Response;
#[inline(always)]
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let span = span.map(|s| s.child(self.name.clone()));
let mut recorder = SpanRecorder::new(span.clone());
match self
.inner
.query_exec(namespace_id, table_id, columns, span)
.await
{
Ok(v) => {
recorder.ok("query_exec complete");
Ok(v)
}
Err(e) => {
recorder.error(e.to_string());
Err(e)
}
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use trace::{ctx::SpanContext, span::SpanStatus, RingBufferTraceCollector, TraceCollector};
use crate::query::{
mock_query_exec::MockQueryExec,
response::{PartitionStream, QueryResponse},
};
use super::*;
#[track_caller]
fn assert_trace(name: impl Into<String>, status: SpanStatus, traces: &dyn TraceCollector) {
let traces = traces
.as_any()
.downcast_ref::<RingBufferTraceCollector>()
.expect("unexpected collector impl");
let name = name.into();
let span = traces
.spans()
.into_iter()
.find(|s| s.name == name)
.unwrap_or_else(|| panic!("tracing span {name} not found"));
assert_eq!(
span.status, status,
"span status does not match expected value"
);
}
#[tokio::test]
async fn test_ok() {
let stream: PartitionStream = PartitionStream::new(futures::stream::iter([]));
let mock = MockQueryExec::default().with_result(Ok(QueryResponse::new(stream)));
let traces: Arc<dyn TraceCollector> = Arc::new(RingBufferTraceCollector::new(5));
let span = SpanContext::new(Arc::clone(&traces));
// Drive the trace wrapper
let _ = QueryExecTracing::new(mock, "bananas")
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
Some(span.child("root span")),
)
.await
.expect("wrapper should not modify result");
// Assert the trace showed up.
assert_trace("bananas", SpanStatus::Ok, &*traces);
}
#[tokio::test]
async fn test_err() {
let mock = MockQueryExec::default()
.with_result(Err(QueryError::NamespaceNotFound(NamespaceId::new(42))));
let traces: Arc<dyn TraceCollector> = Arc::new(RingBufferTraceCollector::new(5));
let span = SpanContext::new(Arc::clone(&traces));
// Drive the trace wrapper
let got = QueryExecTracing::new(mock, "bananas")
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
Some(span.child("root span")),
)
.await
.expect_err("wrapper should not modify result");
assert_matches!(got, QueryError::NamespaceNotFound(ns) => {
assert_eq!(ns, NamespaceId::new(42));
});
// Assert the trace showed up.
assert_trace("bananas", SpanStatus::Err, &*traces);
}
}
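To make the decorator layering concrete, a hedged sketch of wrapping an inner QueryExec in the tracing and instrumentation decorators before issuing a query; all names are placeholders and the wiring is illustrative only:
async fn layered_query<Q>(
    inner: Q,
    metrics: &metric::Registry,
    namespace_id: NamespaceId,
    table_id: TableId,
) -> Result<Q::Response, QueryError>
where
    Q: QueryExec,
{
    // Tracing sits closest to the inner implementation; instrumentation then
    // measures the whole call, including the span bookkeeping.
    let layered =
        QueryExecInstrumentation::new(QueryExecTracing::new(inner, "buffer query"), metrics);
    layered.query_exec(namespace_id, table_id, vec![], None).await
}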

View File

@ -1,49 +0,0 @@
use std::{fmt::Debug, ops::Deref, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use thiserror::Error;
use trace::span::Span;
#[derive(Debug, Error)]
#[allow(missing_copy_implementations)]
pub(crate) enum QueryError {
#[error("namespace id {0} not found")]
NamespaceNotFound(NamespaceId),
#[error("table id {1} not found in namespace id {0}")]
TableNotFound(NamespaceId, TableId),
}
#[async_trait]
pub(crate) trait QueryExec: Send + Sync + Debug {
type Response: Send + Debug;
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError>;
}
#[async_trait]
impl<T> QueryExec for Arc<T>
where
T: QueryExec,
{
type Response = T::Response;
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
self.deref()
.query_exec(namespace_id, table_id, columns, span)
.await
}
}

View File

@ -1,208 +0,0 @@
//! An adaptor over a set of [`RecordBatch`] allowing them to be used as an IOx
//! [`QueryChunk`].
use std::{any::Any, sync::Arc};
use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary};
use datafusion::error::DataFusionError;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::{compute_timenanosecond_min_max, create_basic_summary},
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use once_cell::sync::OnceCell;
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, sort::SortKey, Projection, Schema};
/// A queryable wrapper over a set of ordered [`RecordBatch`]
///
/// It is an invariant that a [`QueryAdaptor`] MUST always contain at least one
/// row. This frees the caller from having to reason about empty [`QueryAdaptor`]
/// instances yielding empty [`RecordBatch`].
#[derive(Debug, PartialEq, Clone)]
pub struct QueryAdaptor {
/// The snapshot data from a partition.
///
/// This MUST be non-pub(crate) / closed for modification / immutable to support
/// interning the merged schema in [`Self::schema()`].
data: Vec<Arc<RecordBatch>>,
/// The catalog ID of the partition this data is part of.
partition_id: PartitionId,
/// Chunk ID.
id: ChunkId,
/// An interned schema for all [`RecordBatch`] in data.
schema: OnceCell<Arc<Schema>>,
/// An interned table summary.
summary: OnceCell<Arc<TableSummary>>,
}
impl QueryAdaptor {
/// Construct a [`QueryAdaptor`].
///
/// # Panics
///
/// This constructor panics if `data` contains no [`RecordBatch`], or all
/// [`RecordBatch`] are empty.
pub(crate) fn new(partition_id: PartitionId, data: Vec<Arc<RecordBatch>>) -> Self {
// There must always be at least one record batch and one row.
//
// This upholds an invariant that simplifies dealing with empty
// partitions - if there is a QueryAdaptor, it contains data.
assert!(data.iter().map(|b| b.num_rows()).sum::<usize>() > 0);
Self {
data,
partition_id,
// To aid debugging and stay consistent with the ChunkId created in the Compactor, use a Uuid here.
// The UUID is drawn during chunk generation so that it is stable for the whole query process.
id: ChunkId::new(),
schema: OnceCell::default(),
summary: OnceCell::default(),
}
}
pub(crate) fn project_selection(&self, selection: Projection<'_>) -> Vec<RecordBatch> {
// Project the column selection across all RecordBatch
self.data
.iter()
.map(|data| {
let batch = data.as_ref();
let schema = batch.schema();
// Apply selection to in-memory batch
match selection {
Projection::All => batch.clone(),
Projection::Some(columns) => {
let projection = columns
.iter()
.flat_map(|&column_name| {
// ignore non-existing columns
schema.index_of(column_name).ok()
})
.collect::<Vec<_>>();
batch.project(&projection).expect("bug in projection")
}
}
})
.collect()
}
/// Returns the [`RecordBatch`] instances in this [`QueryAdaptor`].
pub(crate) fn record_batches(&self) -> &[Arc<RecordBatch>] {
self.data.as_ref()
}
/// Returns the partition ID from which the data in this [`QueryAdaptor`] was
/// sourced.
pub(crate) fn partition_id(&self) -> PartitionId {
self.partition_id
}
}
impl QueryChunkMeta for QueryAdaptor {
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(self.summary.get_or_init(|| {
let ts_min_max = compute_timenanosecond_min_max(self.data.iter().map(|b| b.as_ref()))
.expect("Should have time range");
Arc::new(create_basic_summary(
self.data.iter().map(|b| b.num_rows()).sum::<usize>() as u64,
self.schema(),
ts_min_max,
))
}))
}
fn schema(&self) -> &Schema {
self.schema
.get_or_init(|| merge_record_batch_schemas(&self.data).into())
.as_ref()
}
fn partition_sort_key(&self) -> Option<&SortKey> {
None // Ingester data has not persisted yet and should not be attached to any partition
}
fn partition_id(&self) -> PartitionId {
self.partition_id
}
fn sort_key(&self) -> Option<&SortKey> {
None // Ingester data is not sorted
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
&[]
}
}
impl QueryChunk for QueryAdaptor {
fn id(&self) -> ChunkId {
self.id
}
/// Returns true if the chunk may contain a duplicate "primary key" within
/// itself
fn may_contain_pk_duplicates(&self) -> bool {
// always true because the rows across record batches have not been
// de-duplicated.
true
}
/// Returns a set of Strings with column names from the specified
/// table that have at least one row that matches `predicate`, if
/// the predicate can be evaluated entirely on the metadata of
/// this Chunk. Returns `None` otherwise
fn column_names(
&self,
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
/// Return a set of Strings containing the distinct values in the
/// specified columns, if the predicate can be evaluated entirely
/// on the metadata of this Chunk. Returns `None` otherwise.
///
/// The requested columns must all have String type.
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
fn data(&self) -> QueryChunkData {
let schema = self.schema().as_arrow();
QueryChunkData::RecordBatches(
self.data
.iter()
.map(|b| ensure_schema(&schema, b).expect("schema handling broken"))
.collect(),
)
}
/// Returns chunk type
fn chunk_type(&self) -> &str {
"QueryAdaptor"
}
fn order(&self) -> ChunkOrder {
unimplemented!()
}
fn as_any(&self) -> &dyn Any {
self
}
}
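A brief, hedged sketch of using project_selection() above to pull a single column out of the buffered batches (the function and column names are placeholders):
// Project only the "time" column out of every buffered RecordBatch; columns
// missing from a batch's schema are simply skipped by project_selection().
fn time_only(adaptor: &QueryAdaptor) -> Vec<RecordBatch> {
    adaptor.project_selection(Projection::Some(&["time"]))
}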

View File

@ -35,7 +35,7 @@ parking_lot = "0.12.1"
parquet_file = { version = "0.1.0", path = "../parquet_file" }
pin-project = "1.0.12"
predicate = { version = "0.1.0", path = "../predicate" }
prost = { version = "0.11.6", default-features = false, features = ["std"] }
prost = { version = "0.11.9", default-features = false, features = ["std"] }
rand = "0.8.5"
schema = { version = "0.1.0", path = "../schema" }
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
@ -44,7 +44,7 @@ test_helpers = { path = "../test_helpers", features = ["future_timeout"], option
thiserror = "1.0.40"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tokio-util = "0.7.7"
tonic = "0.8.3"
tonic = { workspace = true }
trace = { version = "0.1.0", path = "../trace" }
uuid = "1.3.1"
wal = { version = "0.1.0", path = "../wal" }

View File

@ -167,7 +167,7 @@ where
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
// Use the cached PartitionKey instead of the caller's partition_key,
// preferring to reuse the already-shared Arc<str> in the cache.
@ -188,7 +188,7 @@ where
// Use the returned partition key instead of the callers - this
// allows the backing str memory to be reused across all partitions
// using the same key!
return PartitionData::new(
return Arc::new(Mutex::new(PartitionData::new(
partition_id,
key,
namespace_id,
@ -197,7 +197,7 @@ where
table_name,
SortKeyState::Deferred(Arc::new(sort_key_resolver)),
transition_shard_id,
);
)));
}
debug!(%table_id, %partition_key, "partition cache miss");
@ -218,6 +218,9 @@ where
#[cfg(test)]
mod tests {
// Harmless in tests - saves a bunch of extra vars.
#![allow(clippy::await_holding_lock)]
use data_types::ShardId;
use iox_catalog::mem::MemCatalog;
@ -282,10 +285,10 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(&**got.namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
assert_eq!(&**got.lock().namespace_name().get().await, NAMESPACE_NAME);
assert!(cache.inner.is_empty());
}
@ -322,11 +325,14 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(&**got.namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(*got.partition_key(), PartitionKey::from(PARTITION_KEY));
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
assert_eq!(&**got.lock().namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(
*got.lock().partition_key(),
PartitionKey::from(PARTITION_KEY)
);
// The cache should have been cleaned up as it was consumed.
assert!(cache.entries.lock().is_empty());
@ -334,10 +340,10 @@ mod tests {
// Assert the partition key from the cache was used for the lifetime of
// the partition, so that it is shared with the cache + other partitions
// that share the same partition key across all tables.
assert!(got.partition_key().ptr_eq(&stored_partition_key));
assert!(got.lock().partition_key().ptr_eq(&stored_partition_key));
// It does not use the short-lived caller's partition key (derived from
// the DML op it is processing).
assert!(!got.partition_key().ptr_eq(&callers_partition_key));
assert!(!got.lock().partition_key().ptr_eq(&callers_partition_key));
}
#[tokio::test]
@ -385,9 +391,9 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), other_key_id);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(got.lock().partition_id(), other_key_id);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
}
#[tokio::test]
@ -434,8 +440,8 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), other_table);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), other_table);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
}
}

View File

@ -8,6 +8,7 @@ use backoff::{Backoff, BackoffConfig};
use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use super::r#trait::PartitionProvider;
use crate::{
@ -63,7 +64,7 @@ impl PartitionProvider for CatalogPartitionResolver {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
debug!(
%partition_key,
%table_id,
@ -78,7 +79,7 @@ impl PartitionProvider for CatalogPartitionResolver {
.await
.expect("retry forever");
PartitionData::new(
Arc::new(Mutex::new(PartitionData::new(
p.id,
// Use the caller's partition key instance, as it MAY be shared with
// other instances, but the instance returned from the catalog
@ -90,12 +91,15 @@ impl PartitionProvider for CatalogPartitionResolver {
table_name,
SortKeyState::Provided(p.sort_key()),
transition_shard_id,
)
)))
}
}
#[cfg(test)]
mod tests {
// Harmless in tests - saves a bunch of extra vars.
#![allow(clippy::await_holding_lock)]
use std::{sync::Arc, time::Duration};
use assert_matches::assert_matches;
@ -157,18 +161,18 @@ mod tests {
.await;
// Ensure the table name is available.
let _ = got.table_name().get().await;
let _ = got.lock().table_name().get().await;
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(got.table_name().to_string(), table_name.to_string());
assert_matches!(got.sort_key(), SortKeyState::Provided(None));
assert!(got.partition_key.ptr_eq(&callers_partition_key));
assert_eq!(got.lock().namespace_id(), namespace_id);
assert_eq!(got.lock().table_name().to_string(), table_name.to_string());
assert_matches!(got.lock().sort_key(), SortKeyState::Provided(None));
assert!(got.lock().partition_key.ptr_eq(&callers_partition_key));
let got = catalog
.repositories()
.await
.partitions()
.get_by_id(got.partition_id)
.get_by_id(got.lock().partition_id)
.await
.unwrap()
.expect("partition not created");

View File

@ -0,0 +1,423 @@
use std::{
pin::Pin,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use arrow::compute::kernels::partition;
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use futures::{future::Shared, FutureExt};
use hashbrown::{hash_map::Entry, HashMap};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
deferred_load::DeferredLoad,
};
use super::PartitionProvider;
/// A helper alias for a boxed, dynamically dispatched future that resolves to an
/// Arc/Mutex-wrapped [`PartitionData`].
type BoxedResolveFuture =
Pin<Box<dyn std::future::Future<Output = Arc<Mutex<PartitionData>>> + Send>>;
/// A compound key of `(namespace, table, partition_key)` which uniquely
/// identifies a single partition.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Key {
namespace_id: NamespaceId,
table_id: TableId,
partition_key: PartitionKey,
}
/// The state of the resolver.
///
/// The [`Shared`] requires more space than the simple ref-pointer to the
/// [`PartitionData`], so resolving callers replace the shared handle with the
/// resolved result where possible.
#[derive(Debug)]
enum State {
/// A resolve task is ongoing, and the caller can await the [`Shared`]
/// future to obtain the result.
///
/// If the atomic bool is false, no thread is changing this [`State`] to
/// [`State::Resolved`] for the resolved partition. If true, a thread is in
/// the process of setting (or already has set) the state to
/// [`State::Resolved`].
Resolving(Shared<BoxedResolveFuture>, Arc<AtomicBool>),
/// A prior call resolved this partition.
Resolved(Arc<Mutex<PartitionData>>),
}
/// A coalescing [`PartitionProvider`] reducing N partition fetch requests into
/// a single call to `T` on a per-partition basis.
///
/// This type solves a concurrency problem, where a series of concurrent cache
/// misses "above" this type causes a series of concurrent lookups against the
/// inner resolver "below" this type for a single partition. This is wasteful,
/// as only one result is retained by the callers (a single [`PartitionData`] is
/// used to reference a partition of data).
///
/// This type is typically used to coalesce requests against the
/// [`CatalogPartitionResolver`]:
///
/// ```text
/// ┌─────────────────────────────┐
/// │ Cache │
/// └─────────────────────────────┘
/// │ │ │
/// ▼ ▼ ▼
/// ┌─────────────────────────────┐
/// │ CoalescePartitionResolver │
/// └─────────────────────────────┘
/// │
/// ▼
/// ┌─────────────────────────────┐
/// │ CatalogPartitionResolver │
/// └─────────────────────────────┘
/// ```
///
/// Imagine the following concurrent requests without this type:
///
/// * T1: check cache for partition A, miss
/// * T2: check cache for partition A, miss
/// * T1: inner.get_partition(A)
/// * T2: inner.get_partition(A)
/// * T1: cache put partition A
/// * T2: cache put partition A
///
/// With this type, the concurrent requests for a single partition (A) are
/// coalesced into a single request against the inner resolver:
///
/// * T1: check cache for partition A, miss
/// * T2: check cache for partition A, miss
/// * T1: CoalescePartitionResolver::get_partition(A)
/// * T2: CoalescePartitionResolver::get_partition(A)
/// * inner.get_partition() **(a single call to inner is made)**
/// * T1: cache put partition A
/// * T2: cache put partition A
///
/// # Memory Overhead
///
/// This type makes a best-effort attempt to minimise the memory overhead of
/// memoising partition fetches. Callers drop the intermediate resolving state
/// upon success, leaving only a ref-counted pointer to the shared
/// [`PartitionData`] (a single [`Arc`] ref overhead).
///
/// # Cancellation Safety
///
/// This type is cancellation safe - calls to
/// [`CoalescePartitionResolver::get_partition()`] are safe to abort at any
/// point.
///
/// [`CatalogPartitionResolver`]: super::CatalogPartitionResolver
#[derive(Debug)]
pub struct CoalescePartitionResolver<T> {
/// The inner resolver the actual partition fetch is delegated to.
inner: Arc<T>,
/// A map of handles to ongoing resolve futures.
ongoing: Mutex<HashMap<Key, State>>,
}
impl<T> CoalescePartitionResolver<T> {
pub fn new(inner: Arc<T>) -> Self {
Self {
inner,
ongoing: Mutex::new(HashMap::default()),
}
}
}
#[async_trait]
impl<T> PartitionProvider for CoalescePartitionResolver<T>
where
T: PartitionProvider + 'static,
{
async fn get_partition(
&self,
partition_key: PartitionKey,
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> Arc<Mutex<PartitionData>> {
let key = Key {
namespace_id,
table_id,
partition_key: partition_key.clone(), // Ref-counted anyway!
};
// Check if there's an ongoing (or recently completed) resolve.
let (shared, done) = match self.ongoing.lock().entry(key.clone()) {
Entry::Occupied(v) => match v.get() {
State::Resolving(fut, done) => (fut.clone(), Arc::clone(done)),
State::Resolved(v) => return Arc::clone(v),
},
Entry::Vacant(v) => {
// Spawn a future to resolve the partition, and retain a handle
// to it.
let inner = Arc::clone(&self.inner);
let fut: BoxedResolveFuture = Box::pin(async move {
inner
.get_partition(
partition_key,
namespace_id,
namespace_name,
table_id,
table_name,
transition_shard_id,
)
.await
});
// Make the future poll-able by many callers, all of which
// resolve to the same output PartitionData instance.
let fut = fut.shared();
let done = Arc::new(AtomicBool::new(false));
// Allow future callers to obtain this shared handle, instead of
// resolving the partition themselves.
v.insert(State::Resolving(fut.clone(), Arc::clone(&done)));
(fut, done)
}
};
// Wait for the resolve to complete.
//
// If this caller future is dropped before this resolve future
// completes, then it remains unpolled until the next caller obtains a
// shared handle and continues the process.
let res = shared.await;
// As an optimisation, select exactly one thread to acquire the lock and
// change the state instead of every caller trying to set the state to
// "resolved", which involves contending on the lock for all concurrent
// callers for all concurrent partition fetches.
//
// Any caller that has been awaiting the shared future above is a
// candidate to perform this state change, but only one thread will
// attempt to. In the presence of aborted callers waiting on the shared
// future, each caller that completes the await will attempt to change state
// (cancellation safe).
if done
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
{
// This task should drop the Shared, swapping it for the resolved
// state.
//
// This thread SHOULD NOT fail to perform this action as no other
// thread will attempt it now that the bool has been toggled.
let old = self
.ongoing
.lock()
.insert(key, State::Resolved(Arc::clone(&res)));
// Invariant: the resolve future must exist in the map, and the
// state may only be changed by the thread that won the CAS.
assert!(matches!(old, Some(State::Resolving(..))));
}
res
}
}
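
The coalescing technique above can be reduced to a small standalone sketch: a map of `Shared` futures keyed by the lookup key, where the first caller installs the future and every later caller clones and awaits the same handle. The sketch below is illustrative only — `Coalescer` and `slow_lookup` are made-up names, it uses `std::collections::HashMap` rather than hashbrown, and it omits the CAS-guarded swap to `State::Resolved` that the real resolver performs to reclaim memory. It assumes the `futures`, `parking_lot` and `tokio` (rt, macros, time) crates.

```rust
use std::{collections::HashMap, sync::Arc, time::Duration};

use futures::{
    future::{BoxFuture, Shared},
    FutureExt,
};
use parking_lot::Mutex;

/// One shared, cloneable handle per in-flight lookup.
type SharedLookup = Shared<BoxFuture<'static, Arc<String>>>;

#[derive(Default)]
struct Coalescer {
    ongoing: Mutex<HashMap<String, SharedLookup>>,
}

impl Coalescer {
    /// All concurrent callers for the same key share a single `slow_lookup`
    /// call and receive the same ref-counted result.
    async fn get(&self, key: String) -> Arc<String> {
        let fut = {
            let mut map = self.ongoing.lock();
            map.entry(key.clone())
                .or_insert_with(|| slow_lookup(key).boxed().shared())
                .clone()
        };
        // The lock is released before awaiting the shared future.
        fut.await
    }
}

/// Stand-in for an expensive catalog round trip.
async fn slow_lookup(key: String) -> Arc<String> {
    tokio::time::sleep(Duration::from_millis(10)).await;
    Arc::new(format!("partition for {key}"))
}

#[tokio::main]
async fn main() {
    let c = Coalescer::default();
    let (a, b) = tokio::join!(c.get("bananas".into()), c.get("bananas".into()));
    // Both callers observe the same ref-counted instance.
    assert!(Arc::ptr_eq(&a, &b));
}
```
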
#[cfg(test)]
mod tests {
use std::{
future,
sync::Arc,
task::{Context, Poll},
time::Duration,
};
use assert_matches::assert_matches;
use data_types::{PartitionId, TRANSITION_SHARD_ID};
use futures::{stream::FuturesUnordered, StreamExt};
use test_helpers::timeout::FutureTimeout;
use crate::buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState};
use super::*;
const PARTITION_KEY: &str = "bananas";
#[tokio::test]
async fn test_coalesce() {
const MAX_TASKS: usize = 50;
let namespace_id = NamespaceId::new(1234);
let namespace_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
NamespaceName::from("ns-platanos")
}));
let table_id = TableId::new(24);
let table_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
TableName::from("platanos")
}));
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
SortKeyState::Provided(None),
TRANSITION_SHARD_ID,
);
// Add a single instance of the partition - if more than one call is
// made, this will cause a panic.
let inner = Arc::new(MockPartitionProvider::default().with_partition(data));
let layer = Arc::new(CoalescePartitionResolver::new(Arc::clone(&inner)));
let results = (0..MAX_TASKS)
.map(|_| {
let namespace_name = Arc::clone(&namespace_name);
let table_name = Arc::clone(&table_name);
layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
namespace_name,
table_id,
table_name,
TRANSITION_SHARD_ID,
)
})
.collect::<FuturesUnordered<_>>()
.collect::<Vec<_>>()
.await;
// All the resulting instances of PartitionData MUST be the same
// ref-counted instance.
results.as_slice().windows(2).for_each(|v| {
assert!(Arc::ptr_eq(&v[0], &v[1]));
});
// The state should have been set to "resolved" to reclaim memory
assert_matches!(
layer.ongoing.lock().values().next(),
Some(State::Resolved(..))
);
}
// A resolver that blocks forever when resolving PARTITION_KEY but instantly
// finishes all others.
#[derive(Debug)]
struct BlockingResolver {
p: Arc<Mutex<PartitionData>>,
}
impl PartitionProvider for BlockingResolver {
fn get_partition<'life0, 'async_trait>(
&'life0 self,
partition_key: PartitionKey,
_namespace_id: NamespaceId,
_namespace_name: Arc<DeferredLoad<NamespaceName>>,
_table_id: TableId,
_table_name: Arc<DeferredLoad<TableName>>,
_transition_shard_id: ShardId,
) -> core::pin::Pin<
Box<
dyn core::future::Future<Output = Arc<Mutex<PartitionData>>>
+ core::marker::Send
+ 'async_trait,
>,
>
where
'life0: 'async_trait,
Self: 'async_trait,
{
if partition_key == PartitionKey::from(PARTITION_KEY) {
return future::pending().boxed();
}
future::ready(Arc::clone(&self.p)).boxed()
}
}
#[tokio::test]
async fn test_disjoint_parallelised() {
use futures::Future;
let namespace_id = NamespaceId::new(1234);
let namespace_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
NamespaceName::from("ns-platanos")
}));
let table_id = TableId::new(24);
let table_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
TableName::from("platanos")
}));
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
SortKeyState::Provided(None),
TRANSITION_SHARD_ID,
);
// Add a single instance of the partition - if more than one call is
// made to the mock, it will panic.
let inner = Arc::new(BlockingResolver {
p: Arc::new(Mutex::new(data)),
});
let layer = Arc::new(CoalescePartitionResolver::new(inner));
// The following two partitions are for the same (blocked) partition and
// neither resolve.
let pa_1 = layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
TRANSITION_SHARD_ID,
);
let pa_2 = layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
TRANSITION_SHARD_ID,
);
let waker = futures::task::noop_waker();
let mut cx = Context::from_waker(&waker);
futures::pin_mut!(pa_1);
futures::pin_mut!(pa_2);
// Neither make progress
assert_matches!(Pin::new(&mut pa_1).poll(&mut cx), Poll::Pending);
assert_matches!(Pin::new(&mut pa_2).poll(&mut cx), Poll::Pending);
// But a non-blocked partition is resolved without issue.
let _ = layer
.get_partition(
PartitionKey::from("platanos"),
namespace_id,
namespace_name,
table_id,
table_name,
TRANSITION_SHARD_ID,
)
.with_timeout_panic(Duration::from_secs(5))
.await;
// While the original requests are still blocked.
assert_matches!(Pin::new(&mut pa_1).poll(&mut cx), Poll::Pending);
assert_matches!(Pin::new(&mut pa_2).poll(&mut cx), Poll::Pending);
}
}

View File

@ -55,7 +55,7 @@ impl PartitionProvider for MockPartitionProvider {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
_transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
let p = self
.partitions
.lock()
@ -67,6 +67,6 @@ impl PartitionProvider for MockPartitionProvider {
assert_eq!(p.namespace_id(), namespace_id);
assert_eq!(p.namespace_name().to_string(), namespace_name.to_string());
assert_eq!(p.table_name().to_string(), table_name.to_string());
p
Arc::new(Mutex::new(p))
}
}

View File

@ -16,5 +16,8 @@ pub(crate) use catalog::*;
mod sort_key;
pub(crate) use sort_key::*;
mod coalesce;
pub(crate) use coalesce::*;
#[cfg(test)]
pub(crate) mod mock;

View File

@ -2,6 +2,7 @@ use std::{fmt::Debug, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
@ -25,7 +26,7 @@ pub(crate) trait PartitionProvider: Send + Sync + Debug {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData;
) -> Arc<Mutex<PartitionData>>;
}
#[async_trait]
@ -41,7 +42,7 @@ where
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
(**self)
.get_partition(
partition_key,
@ -101,9 +102,12 @@ mod tests {
TRANSITION_SHARD_ID,
)
.await;
assert_eq!(got.partition_id(), partition);
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(got.namespace_name().to_string(), namespace_name.to_string());
assert_eq!(got.table_name().to_string(), table_name.to_string());
assert_eq!(got.lock().partition_id(), partition);
assert_eq!(got.lock().namespace_id(), namespace_id);
assert_eq!(
got.lock().namespace_name().to_string(),
namespace_name.to_string()
);
assert_eq!(got.lock().table_name().to_string(), table_name.to_string());
}
}

View File

@ -183,8 +183,7 @@ where
//
// This MAY return a different instance than `p` if another
// thread has already initialised the partition.
self.partition_data
.get_or_insert_with(&partition_key, || Arc::new(Mutex::new(p)))
self.partition_data.get_or_insert_with(&partition_key, || p)
}
};
@ -223,8 +222,9 @@ where
);
// Gather the partition data from all of the partitions in this table.
let span = SpanRecorder::new(span);
let partitions = self.partitions().into_iter().map(move |p| {
let mut span = SpanRecorder::new(span.clone().map(|s| s.child("partition read")));
let mut span = span.child("partition read");
let (id, completed_persistence_count, data) = {
let mut p = p.lock();

View File

@ -26,7 +26,9 @@ use wal::Wal;
use crate::{
buffer_tree::{
namespace::name_resolver::{NamespaceNameProvider, NamespaceNameResolver},
partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider},
partition::resolver::{
CatalogPartitionResolver, CoalescePartitionResolver, PartitionCache, PartitionProvider,
},
table::name_resolver::{TableNameProvider, TableNameResolver},
BufferTree,
},
@ -281,8 +283,10 @@ where
.await
.map_err(InitError::PreWarmPartitions)?;
// Build the partition provider, wrapped in the partition cache.
// Build the partition provider, wrapped in the partition cache and request
// coalescer.
let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog));
let partition_provider = CoalescePartitionResolver::new(Arc::new(partition_provider));
let partition_provider = PartitionCache::new(
partition_provider,
recent_partitions,

View File

@ -43,12 +43,11 @@ where
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let span = span.map(|s| s.child(self.name.clone()));
let mut recorder = SpanRecorder::new(span.clone());
let mut recorder = SpanRecorder::new(span).child(self.name.clone());
match self
.inner
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, columns, recorder.span().cloned())
.await
{
Ok(v) => {
@ -89,7 +88,7 @@ mod tests {
.spans()
.into_iter()
.find(|s| s.name == name)
.unwrap_or_else(|| panic!("tracing span {name} not found"));
.unwrap_or_else(|| panic!("tracing span {name} not found in\n{traces:#?}"));
assert_eq!(
span.status, status,

View File

@ -146,6 +146,7 @@ where
request: Request<Ticket>,
) -> Result<Response<Self::DoGetStream>, tonic::Status> {
let span_ctx: Option<SpanContext> = request.extensions().get().cloned();
let span = span_ctx.child_span("ingester query");
// Acquire and hold a permit for the duration of this request, or return
// an error if the existing requests have already exhausted the
@ -178,12 +179,7 @@ where
let response = match self
.query_handler
.query_exec(
namespace_id,
table_id,
request.columns,
span_ctx.child_span("ingester query"),
)
.query_exec(namespace_id, table_id, request.columns, span)
.await
{
Ok(v) => v,

View File

@ -25,11 +25,11 @@ mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" }
object_store = "0.5.6"
observability_deps = { version = "0.1.0", path = "../observability_deps" }
parquet_file = { version = "0.1.0", path = "../parquet_file" }
prost = { version = "0.11.6", default-features = false, features = ["std"] }
prost = { version = "0.11.9", default-features = false, features = ["std"] }
tempfile = { version = "3.5.0" }
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tokio-util = "0.7.7"
tonic = "0.8.3"
tonic = { workspace = true }
wal = { version = "0.1.0", path = "../wal" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -24,7 +24,7 @@ rand = { version = "0.8.3", features = ["small_rng"] }
regex = "1.7"
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
toml = "0.7.3"

View File

@ -29,6 +29,7 @@ indexmap = { version = "1.9", features = ["std"] }
itertools = "0.10.5"
object_store = "0.5.6"
observability_deps = { path = "../observability_deps" }
once_cell = "1"
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }
query_functions = { path = "../query_functions"}

View File

@ -45,16 +45,19 @@ use super::{params::GapFillParams, FillStrategy};
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ 2 ║ ║ │ │ ║ │ │ ║
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ 3 ║ ║ │ │ ║ │ │ ║
/// │ . . .
/// output_batch_size . . .
/// │ . . .
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ n - 1 ║ ║ │ │ ║ │ │ ║
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// ┴──── n ║ ║ │ │ ║ │ │ ║
/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// trailing row n + 1 ║ ║ │ │ ║ │ │ ║
/// ╙────╨───┴───┴─────────────╨───┴───┴─────────────╜
/// trailing row(s) n + 1 ║ ║ │ │ ║ │ │ ║
/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// . . .
/// . . .
/// . . .
/// ```
///
/// Just before generating output, the cursor will generally point at offset 1
@ -69,13 +72,19 @@ use super::{params::GapFillParams, FillStrategy};
/// (using the [`take`](take::take) kernel) when we are generating trailing gaps, i.e.,
/// when all of the input rows have been output for a series in the previous batch,
/// but there still remains missing rows to produce at the end.
/// - Having one additional _trailing row_ at the end ensures that `GapFiller` can
/// - Having at least one additional _trailing row_ at the end ensures that `GapFiller` can
/// infer whether there are trailing gaps to produce at the beginning of the
/// next batch, since it can discover if the last row starts a new series.
/// - If there are columns that have a fill strategy of [`LinearInterpolate`], then more
/// trailing rows may be necessary to find the next non-null value for the column.
///
/// [`LinearInterpolate`]: FillStrategy::LinearInterpolate
#[derive(Debug)]
pub(super) struct GapFiller {
/// The static parameters of gap-filling: time range start, end and the stride.
params: GapFillParams,
/// The number of rows to produce in each output batch.
batch_size: usize,
/// The current state of gap-filling, including the next timestamp,
/// the offset of the next input row, and remaining space in output batch.
cursor: Cursor,
@ -83,9 +92,25 @@ pub(super) struct GapFiller {
impl GapFiller {
/// Initialize a [GapFiller] at the beginning of an input record batch.
pub fn new(params: GapFillParams) -> Self {
pub fn new(params: GapFillParams, batch_size: usize) -> Self {
let cursor = Cursor::new(&params);
Self { params, cursor }
Self {
params,
batch_size,
cursor,
}
}
/// Given that the cursor points at the input row that will be
/// the first row in the next output batch, return the offset
/// of the last input row that could possibly be in the output.
///
/// This offset is used by [`BufferedInput`] to determine how many
/// rows need to be buffered.
///
/// [`BufferedInput`]: super::BufferedInput
pub(super) fn last_output_row_offset(&self) -> usize {
self.cursor.next_input_offset + self.batch_size - 1
}
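
A small worked instance of this arithmetic, with assumed values that mirror the `batch_size = 3` used by the buffering tests later in this diff: with the cursor at input offset 0, the last row that can land in the next output batch is offset 2, and the buffering logic waits for at least 4 rows (one extra to detect a series change or trailing gaps), or more if interpolation needs a later non-null value.

```rust
fn main() {
    // Assumed values, mirroring the tests: cursor at the start of the
    // input, output batches of 3 rows.
    let next_input_offset = 0usize;
    let batch_size = 3usize;

    // GapFiller::last_output_row_offset()
    let last_output_row_offset = next_input_offset + batch_size - 1;
    assert_eq!(last_output_row_offset, 2);

    // BufferedInput::need_more(): everything up to and including that row,
    // plus one more row, must be buffered before producing output.
    let min_buffered_rows = last_output_row_offset + 2;
    assert_eq!(min_buffered_rows, 4);
}
```
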
/// Returns true if there are no more output rows to produce given
@ -100,14 +125,13 @@ impl GapFiller {
/// schema at member `0`.
pub fn build_gapfilled_output(
&mut self,
batch_size: usize,
schema: SchemaRef,
input_time_array: (usize, &TimestampNanosecondArray),
group_arrays: &[(usize, ArrayRef)],
aggr_arrays: &[(usize, ArrayRef)],
) -> Result<RecordBatch> {
let series_ends = self.plan_output_batch(batch_size, input_time_array.1, group_arrays)?;
self.cursor.remaining_output_batch_size = batch_size;
let series_ends = self.plan_output_batch(input_time_array.1, group_arrays)?;
self.cursor.remaining_output_batch_size = self.batch_size;
self.build_output(
schema,
input_time_array,
@ -139,7 +163,6 @@ impl GapFiller {
/// to partition input rows into series.
fn plan_output_batch(
&mut self,
batch_size: usize,
input_time_array: &TimestampNanosecondArray,
group_arr: &[(usize, ArrayRef)],
) -> Result<Vec<usize>> {
@ -165,7 +188,7 @@ impl GapFiller {
let start_offset = cursor.next_input_offset;
assert!(start_offset <= 1, "input is sliced after it is consumed");
while output_row_count < batch_size {
while output_row_count < self.batch_size {
match ranges.next() {
Some(Range { end, .. }) => {
assert!(

View File

@ -90,7 +90,6 @@ impl Cursor {
.map(|seg| Segment::<T::Native>::try_from(seg.clone()))
.transpose()?;
let mut builder = InterpolateBuilder {
params,
values: Vec::with_capacity(self.remaining_output_batch_size),
segment,
input_time_array,
@ -173,7 +172,6 @@ impl_from_segment_scalar_value!(f64);
/// Implements [`VecBuilder`] for build aggregate columns whose gaps
/// are being filled using linear interpolation.
pub(super) struct InterpolateBuilder<'a, T: ArrowPrimitiveType> {
pub params: &'a GapFillParams,
pub values: Vec<Option<T::Native>>,
pub segment: Option<Segment<T::Native>>,
pub input_time_array: &'a TimestampNanosecondArray,
@ -193,27 +191,25 @@ where
offset,
series_end_offset,
} => {
// If
// we are not at the last point
// and the distance to the next point is greater than the stride
// and both this point and the next are not null
// then create a segment that will be used to fill in the missing rows.
if offset + 1 < series_end_offset
&& self.input_time_array.value(offset + 1) > ts + self.params.stride
&& self.input_aggr_array.is_valid(offset)
&& self.input_aggr_array.is_valid(offset + 1)
{
self.segment = Some(Segment {
if self.input_aggr_array.is_valid(offset) {
let end_offset = self.find_end_offset(offset, series_end_offset);
// Find the next non-null value in this column for the series.
// If there is one, start a new segment at the current value.
self.segment = end_offset.map(|end_offset| Segment {
start_point: (ts, self.input_aggr_array.value(offset)),
end_point: (
self.input_time_array.value(offset + 1),
self.input_aggr_array.value(offset + 1),
self.input_time_array.value(end_offset),
self.input_aggr_array.value(end_offset),
),
})
});
self.copy_point(offset);
} else {
self.segment = None;
self.values.push(
self.segment
.as_ref()
.map(|seg| T::Native::interpolate(seg, ts)),
);
}
self.copy_point(offset);
}
RowStatus::Missing { ts, .. } => self.values.push(
self.segment
@ -243,6 +239,17 @@ where
.then_some(self.input_aggr_array.value(offset));
self.values.push(v)
}
/// Scan forward to find the endpoint for a segment that starts at `start_offset`.
/// Skip over any null values.
///
/// We are guaranteed to have buffered enough input to find the next non-null point for this series,
/// if there is one, by the logic in [`BufferedInput`].
///
/// [`BufferedInput`]: super::super::buffered_input::BufferedInput
fn find_end_offset(&self, start_offset: usize, series_end_offset: usize) -> Option<usize> {
((start_offset + 1)..series_end_offset).find(|&i| self.input_aggr_array.is_valid(i))
}
}
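
The arithmetic that a `Segment` stands for can be checked in isolation. The sketch below is a hand-rolled, truncating integer variant (`interpolate_i64` is an illustrative name, not the trait method defined below); with the points used in the snapshot tests that follow — value 200 at t = 1400 and value 1000 at t = 1700 — the gap rows at 1500 and 1600 come out as 466 and 733, matching the updated expected output.

```rust
/// Truncating integer interpolation between two known (timestamp, value) points.
fn interpolate_i64(start: (i64, i64), end: (i64, i64), ts: i64) -> i64 {
    let (t0, v0) = start;
    let (t1, v1) = end;
    // Widen to i128 so the intermediate product cannot overflow.
    (v0 as i128 + (v1 - v0) as i128 * (ts - t0) as i128 / (t1 - t0) as i128) as i64
}

fn main() {
    let start = (1400, 200);
    let end = (1700, 1000);
    assert_eq!(interpolate_i64(start, end, 1500), 466);
    assert_eq!(interpolate_i64(start, end, 1600), 733);
}
```
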
/// A trait for the native numeric types that can be interpolated
@ -375,8 +382,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 133 |"
- "| 1970-01-01T00:00:00.000001300Z | 166 |"
- "| 1970-01-01T00:00:00.000001400Z | 200 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 466 |"
- "| 1970-01-01T00:00:00.000001600Z | 733 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000 |"
- "| 1970-01-01T00:00:00.000001800Z | 500 |"
- "| 1970-01-01T00:00:00.000001900Z | 0 |"
@ -447,8 +454,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 133 |"
- "| 1970-01-01T00:00:00.000001300Z | 166 |"
- "| 1970-01-01T00:00:00.000001400Z | 200 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 466 |"
- "| 1970-01-01T00:00:00.000001600Z | 733 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000 |"
- "| 1970-01-01T00:00:00.000001800Z | 500 |"
- "| 1970-01-01T00:00:00.000001900Z | 0 |"
@ -519,8 +526,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 200.0 |"
- "| 1970-01-01T00:00:00.000001300Z | 300.0 |"
- "| 1970-01-01T00:00:00.000001400Z | 400.0 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 600.0 |"
- "| 1970-01-01T00:00:00.000001600Z | 800.0 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000.0 |"
- "| 1970-01-01T00:00:00.000001800Z | 500.0 |"
- "| 1970-01-01T00:00:00.000001900Z | 0.0 |"

View File

@ -0,0 +1,405 @@
//! Logic for buffering record batches for gap filling.
use std::sync::Arc;
use arrow::{
array::ArrayRef,
record_batch::RecordBatch,
row::{RowConverter, Rows, SortField},
};
use datafusion::error::{DataFusionError, Result};
use hashbrown::HashSet;
use super::{params::GapFillParams, FillStrategy};
/// Encapsulate the logic around how to buffer input records.
///
/// If there are no columns with [`FillStrategy::LinearInterpolate`], then
/// we need to buffer up to the last input row that might appear in the output, plus
/// one additional row.
///
/// However, if there are columns filled via interpolation, then we need
/// to ensure that we read ahead far enough to a non-null value, or a change
/// of group columns, in the columns being interpolated.
///
/// [`FillStrategy::LinearInterpolate`]: super::FillStrategy::LinearInterpolate
/// [`GapFillStream`]: super::stream::GapFillStream
pub(super) struct BufferedInput {
/// Indexes of group columns in the schema (not including time).
group_cols: Vec<usize>,
/// Indexes of aggregate columns filled via interpolation.
interpolate_cols: Vec<usize>,
/// Buffered records from the input stream.
batches: Vec<RecordBatch>,
/// When gap filling with interpolated values, this row converter
/// is used to compare rows to see if group columns have changed.
row_converter: Option<RowConverter>,
/// When gap filling with interpolated values, cache a row-oriented
/// representation of the last row that may appear in the output so
/// it doesn't need to be computed more than once.
last_output_row: Option<Rows>,
}
impl BufferedInput {
pub(super) fn new(params: &GapFillParams, group_cols: Vec<usize>) -> Self {
let interpolate_cols = params
.fill_strategy
.iter()
.filter_map(|(col_offset, fs)| {
(fs == &FillStrategy::LinearInterpolate).then_some(*col_offset)
})
.collect::<Vec<usize>>();
Self {
group_cols,
interpolate_cols,
batches: vec![],
row_converter: None,
last_output_row: None,
}
}
/// Add a new batch of buffered records from the input stream.
pub(super) fn push(&mut self, batch: RecordBatch) {
self.batches.push(batch);
}
/// Transfer ownership of the buffered record batches to the caller for
/// processing.
pub(super) fn take(&mut self) -> Vec<RecordBatch> {
self.last_output_row = None;
std::mem::take(&mut self.batches)
}
/// Determine if we need more input before we start processing.
pub(super) fn need_more(&mut self, last_output_row_offset: usize) -> Result<bool> {
let record_count: usize = self.batches.iter().map(|rb| rb.num_rows()).sum();
// The minimum number of rows needed is the number of rows up to and including
// the last row that may appear in the output, plus one more row.
let min_needed = last_output_row_offset + 2;
if record_count < min_needed {
return Ok(true);
} else if self.interpolate_cols.is_empty() {
return Ok(false);
}
// Check to see if the last row that might appear in the output
// has different group column values than the last buffered row.
// If they are different, then we have enough input to start.
let (last_output_batch_offset, last_output_row_offset) = self
.find_row_idx(last_output_row_offset)
.expect("checked record count");
if self.group_columns_changed((last_output_batch_offset, last_output_row_offset))? {
return Ok(false);
}
// Now check if there are non-null values in the columns being interpolated.
// We skip over the batches that come before the one that contains the last
// possible output row. We start with the last buffered batch, so we can avoid
// having to slice unless necessary.
let mut cols_that_need_more =
HashSet::<usize>::from_iter(self.interpolate_cols.iter().cloned());
let mut to_remove = vec![];
for (i, batch) in self
.batches
.iter()
.enumerate()
.skip(last_output_batch_offset)
.rev()
{
for col_offset in cols_that_need_more.clone() {
// If this is the batch containing the last possible output row, slice the
// array so we are just looking at that value and the ones after.
let array = batch.column(col_offset);
let array = if i == last_output_batch_offset {
let length = array.len() - last_output_row_offset;
batch
.column(col_offset)
.slice(last_output_row_offset, length)
} else {
Arc::clone(array)
};
if array.null_count() < array.len() {
to_remove.push(col_offset);
}
}
to_remove.drain(..).for_each(|c| {
cols_that_need_more.remove(&c);
});
if cols_that_need_more.is_empty() {
break;
}
}
Ok(!cols_that_need_more.is_empty())
}
/// Check to see if the group column values have changed between the last row
/// that may be in the output and the last buffered input row.
///
/// This method uses the row-oriented representation of Arrow data from [`arrow::row`] to
/// compare rows in different record batches.
///
/// [`arrow::row`]: https://docs.rs/arrow-row/36.0.0/arrow_row/index.html
fn group_columns_changed(&mut self, last_output_row_idx: (usize, usize)) -> Result<bool> {
if self.group_cols.is_empty() {
return Ok(false);
}
let last_buffered_row_idx = self.last_buffered_row_idx();
if last_output_row_idx == last_buffered_row_idx {
// the output row is also the last buffered row,
// so there is nothing to compare.
return Ok(false);
}
let last_input_rows = self.convert_row(self.last_buffered_row_idx())?;
let last_row_in_output = self.last_output_row(last_output_row_idx)?;
Ok(last_row_in_output.row(0) != last_input_rows.row(0))
}
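
A minimal sketch of the row-format comparison used here, stripped of the buffering machinery. The arrays are made up for illustration and a single Utf8 group column is assumed; it relies only on the `arrow::row` API already referenced above.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, StringArray};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<(), ArrowError> {
    // One converter per set of group columns; it must match their data types.
    let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)])?;

    // Single-row slices standing in for the "last possible output row" and
    // the "last buffered row", taken from two different record batches.
    let last_output: ArrayRef = Arc::new(StringArray::from(vec!["b"]));
    let last_buffered: ArrayRef = Arc::new(StringArray::from(vec!["c"]));

    let a = converter.convert_columns(&[last_output])?;
    let b = converter.convert_columns(&[last_buffered])?;

    // Different group values => the series changes before the end of the
    // buffered input, so no further read-ahead is required.
    assert!(a.row(0) != b.row(0));
    Ok(())
}
```
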
/// Get a row converter for comparing records. Keep it in [`Self::row_converter`]
/// to avoid creating it multiple times.
fn get_row_converter(&mut self) -> Result<&mut RowConverter> {
if self.row_converter.is_none() {
let batch = self.batches.first().expect("at least one batch");
let sort_fields = self
.group_cols
.iter()
.map(|c| SortField::new(batch.column(*c).data_type().clone()))
.collect();
let row_converter =
RowConverter::new(sort_fields).map_err(DataFusionError::ArrowError)?;
self.row_converter = Some(row_converter);
}
Ok(self.row_converter.as_mut().expect("cannot be none"))
}
/// Convert a row to row-oriented format for easy comparison.
fn convert_row(&mut self, row_idxs: (usize, usize)) -> Result<Rows> {
let batch = &self.batches[row_idxs.0];
let columns: Vec<ArrayRef> = self
.group_cols
.iter()
.map(|col_idx| batch.column(*col_idx).slice(row_idxs.1, 1))
.collect();
self.get_row_converter()?
.convert_columns(&columns)
.map_err(DataFusionError::ArrowError)
}
/// Returns the row-oriented representation of the last buffered row that may appear in the next
/// output batch. Since this row may be needed more than once, it is cached in `self`
/// to avoid recomputing it.
fn last_output_row(&mut self, idxs: (usize, usize)) -> Result<&Rows> {
if self.last_output_row.is_none() {
let rows = self.convert_row(idxs)?;
self.last_output_row = Some(rows);
}
Ok(self.last_output_row.as_ref().expect("cannot be none"))
}
/// Return the `(batch_idx, row_idx)` of the last buffered row.
fn last_buffered_row_idx(&self) -> (usize, usize) {
let last_batch_len = self.batches.last().unwrap().num_rows();
(self.batches.len() - 1, last_batch_len - 1)
}
/// Return the `(batch_idx, row_idx)` of the `nth` row.
fn find_row_idx(&self, mut nth: usize) -> Option<(usize, usize)> {
let mut idx = None;
for (i, batch) in self.batches.iter().enumerate() {
if nth >= batch.num_rows() {
nth -= batch.num_rows()
} else {
idx = Some((i, nth));
break;
}
}
idx
}
}
#[cfg(test)]
mod tests {
use std::collections::VecDeque;
use arrow_util::test_util::batches_to_lines;
use super::*;
use crate::exec::gapfill::exec_tests::TestRecords;
fn test_records(batch_size: usize) -> VecDeque<RecordBatch> {
let records = TestRecords {
group_cols: vec![
std::iter::repeat(Some("a")).take(12).collect(),
std::iter::repeat(Some("b"))
.take(6)
.chain(std::iter::repeat(Some("c")).take(6))
.collect(),
],
time_col: (0..12).map(|i| Some(1000 + i * 5)).take(12).collect(),
agg_cols: vec![
vec![
Some(1),
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
Some(10),
],
vec![
Some(2),
None,
None,
None,
None,
None,
None,
None,
Some(20),
None,
None,
None,
],
(0..12).map(Some).collect(),
],
input_batch_size: batch_size,
};
TryInto::<Vec<RecordBatch>>::try_into(records)
.unwrap()
.into()
}
fn test_params() -> GapFillParams {
GapFillParams {
stride: 50_000_000,
first_ts: Some(1_000_000_000),
last_ts: 1_055_000_000,
fill_strategy: [
(3, FillStrategy::LinearInterpolate),
(4, FillStrategy::LinearInterpolate),
]
.into(),
}
}
// This test is just here so it's clear what the
// test data is
#[test]
fn test_test_records() {
let batch = test_records(1000).pop_front().unwrap();
let actual = batches_to_lines(&[batch]);
insta::assert_yaml_snapshot!(actual, @r###"
---
- +----+----+--------------------------+----+----+----+
- "| g0 | g1 | time | a0 | a1 | a2 |"
- +----+----+--------------------------+----+----+----+
- "| a | b | 1970-01-01T00:00:01Z | 1 | 2 | 0 |"
- "| a | b | 1970-01-01T00:00:01.005Z | | | 1 |"
- "| a | b | 1970-01-01T00:00:01.010Z | | | 2 |"
- "| a | b | 1970-01-01T00:00:01.015Z | | | 3 |"
- "| a | b | 1970-01-01T00:00:01.020Z | | | 4 |"
- "| a | b | 1970-01-01T00:00:01.025Z | | | 5 |"
- "| a | c | 1970-01-01T00:00:01.030Z | | | 6 |"
- "| a | c | 1970-01-01T00:00:01.035Z | | | 7 |"
- "| a | c | 1970-01-01T00:00:01.040Z | | 20 | 8 |"
- "| a | c | 1970-01-01T00:00:01.045Z | | | 9 |"
- "| a | c | 1970-01-01T00:00:01.050Z | | | 10 |"
- "| a | c | 1970-01-01T00:00:01.055Z | 10 | | 11 |"
- +----+----+--------------------------+----+----+----+
"###);
}
#[test]
fn no_group_no_interpolate() {
let batch_size = 3;
let mut params = test_params();
params.fill_strategy = [].into();
let mut buffered_input = BufferedInput::new(&params, vec![]);
let mut batches = test_records(batch_size);
// There are no rows buffered yet, which is less than the batch size,
// so it needs more.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 3 rows, still less than batch_size + 1,
// so it needs more.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// We now have batch_size * 2 records, which is enough.
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
#[test]
fn no_group() {
let batch_size = 3;
let params = test_params();
let mut buffered_input = BufferedInput::new(&params, vec![]);
let mut batches = test_records(batch_size);
// There are no rows buffered yet, which is less than the batch size,
// so it needs more.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 3 rows, still less than batch_size + 1,
// so it needs more.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 6 rows; if we were not interpolating,
// this would be enough.
buffered_input.push(batches.pop_front().unwrap());
// Since we are interpolating and there are no non-null values
// at offset 5, more input is needed.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// Push more rows, now totaling 9.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// Column `a1` has a non-null value at offset 8.
// If that were the only column being interpolated, we would have enough.
// 12 rows, with non-null values in both columns being interpolated.
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
#[test]
fn with_group() {
let params = test_params();
let group_cols = vec![0, 1];
let mut buffered_input = BufferedInput::new(&params, group_cols);
let batch_size = 3;
let mut batches = test_records(batch_size);
// no rows
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 3 rows
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 6 rows
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 9 rows (series changes here)
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
}

View File

@ -775,6 +775,7 @@ fn test_gapfill_fill_interpolate() {
Some("b"),
Some("b"),
Some("b"),
Some("b"),
]],
time_col: vec![
None,
@ -788,7 +789,7 @@ fn test_gapfill_fill_interpolate() {
// --- new series
None,
Some(975),
// 1000
Some(1000),
Some(1025),
// 1050
Some(1075),
@ -807,7 +808,7 @@ fn test_gapfill_fill_interpolate() {
// --- new series
Some(-10),
Some(1100), // 975
// 1200 1000
None, // 1200 1000 (this null value will be filled)
Some(1300), // 1025
// 1325 1050
Some(1350), // 1075
@ -979,13 +980,13 @@ fn assert_batch_count(actual_batches: &[RecordBatch], batch_size: usize) {
type ExprVec = Vec<Arc<dyn PhysicalExpr>>;
struct TestRecords {
group_cols: Vec<Vec<Option<&'static str>>>,
pub(super) struct TestRecords {
pub group_cols: Vec<Vec<Option<&'static str>>>,
// Stored as milliseconds since intervals use millis,
// to let test cases be consistent and easier to read.
time_col: Vec<Option<i64>>,
agg_cols: Vec<Vec<Option<i64>>>,
input_batch_size: usize,
pub time_col: Vec<Option<i64>>,
pub agg_cols: Vec<Vec<Option<i64>>>,
pub input_batch_size: usize,
}
impl TestRecords {
@ -1174,14 +1175,16 @@ fn phys_fill_strategies(
fn get_params_ms_with_fill_strategy(
batch: &TestRecords,
stride: i64,
stride_ms: i64,
start: Option<i64>,
end: i64,
fill_strategy: FillStrategy,
) -> GapFillExecParams {
// stride is in ms
let stride = ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000);
GapFillExecParams {
// interval day time is milliseconds in the low 32-bit word
stride: phys_lit(ScalarValue::IntervalDayTime(Some(stride))), // milliseconds
stride: phys_lit(stride),
time_column: Column::new("t", batch.group_cols.len()),
origin: phys_lit(ScalarValue::TimestampNanosecond(Some(0), None)),
// timestamps are nanos, so scale them accordingly

View File

@ -2,6 +2,7 @@
//! a gap-filling extension to DataFusion
mod algo;
mod buffered_input;
#[cfg(test)]
mod exec_tests;
mod params;
@ -31,7 +32,6 @@ use datafusion::{
},
prelude::Expr,
};
use datafusion_util::sort_exprs::requirements_from_sort_exprs;
use self::stream::GapFillStream;
@ -475,7 +475,9 @@ impl ExecutionPlan for GapFillExec {
}
fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> {
vec![Some(requirements_from_sort_exprs(&self.sort_expr))]
vec![Some(PhysicalSortRequirement::from_sort_exprs(
&self.sort_expr,
))]
}
fn maintains_input_order(&self) -> Vec<bool> {
@ -740,11 +742,11 @@ mod test {
explain,
@r###"
---
- " ProjectionExec: expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, AVG(temps.temp)@1 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], aggr_expr=[AVG(temps.temp)@1], stride=60000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]"
- " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[datebin(60000, time@0, 0) as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, AVG(temps.temp)@1 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], aggr_expr=[AVG(temps.temp)@1], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]"
- " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[datebin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " EmptyExec: produce_one_row=false"
"###
);
@ -770,11 +772,11 @@ mod test {
explain,
@r###"
---
- " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, AVG(temps.temp)@3 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[AVG(temps.temp)@3], stride=60000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]"
- " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[loc@1 as loc, datebin(60000, time@0, 0) as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, AVG(temps.temp)@3 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[AVG(temps.temp)@3], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]"
- " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[loc@1 as loc, datebin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " EmptyExec: produce_one_row=false"
"###
);

View File

@ -2,7 +2,7 @@
use std::ops::Bound;
use arrow::{
datatypes::{IntervalDayTimeType, SchemaRef},
datatypes::{IntervalMonthDayNanoType, SchemaRef},
record_batch::RecordBatch,
};
use chrono::Duration;
@ -133,10 +133,17 @@ fn extract_timestamp_nanos(cv: &ColumnarValue) -> Result<i64> {
fn extract_interval_nanos(cv: &ColumnarValue) -> Result<i64> {
match cv {
ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => {
let (days, ms) = IntervalDayTimeType::to_parts(*v);
ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(v))) => {
let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(*v);
if months != 0 {
return Err(DataFusionError::Execution(
"gap filling does not support month intervals".to_string(),
));
}
let nanos =
(Duration::days(days as i64) + Duration::milliseconds(ms as i64)).num_nanoseconds();
(Duration::days(days as i64) + Duration::nanoseconds(nanos)).num_nanoseconds();
nanos.ok_or_else(|| {
DataFusionError::Execution("gap filling argument is too large".to_string())
})
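
A minimal standalone check of this extraction, using only the arrow and chrono APIs referenced above; the interval literal (0 months, 1 day, 500 ns) is arbitrary.

```rust
use arrow::datatypes::IntervalMonthDayNanoType;
use chrono::Duration;

fn main() {
    // 0 months, 1 day, 500 ns packed into the month-day-nano representation.
    let v = IntervalMonthDayNanoType::make_value(0, 1, 500);
    let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(v);

    // Month components are rejected above because a month has no fixed
    // length in nanoseconds.
    assert_eq!(months, 0);

    let total = (Duration::days(days as i64) + Duration::nanoseconds(nanos))
        .num_nanoseconds()
        .expect("fits in i64");
    assert_eq!(total, 86_400_000_000_000 + 500);
}
```
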
@ -261,9 +268,7 @@ mod tests {
}
fn interval(ns: i64) -> Arc<dyn PhysicalExpr> {
Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(
ns / 1_000_000,
))))
Arc::new(Literal::new(ScalarValue::new_interval_mdn(0, 0, ns)))
}
fn timestamp(ns: i64) -> Arc<dyn PhysicalExpr> {

View File

@ -22,9 +22,16 @@ use datafusion::{
};
use futures::{ready, Stream, StreamExt};
use super::{algo::GapFiller, params::GapFillParams, GapFillExec};
use super::{algo::GapFiller, buffered_input::BufferedInput, params::GapFillParams, GapFillExec};
/// An implementation of a gap-filling operator that uses the [Stream] trait.
///
/// This type takes responsibility for:
/// - Reading input record batches
/// - Accounting for memory
/// - Extracting arrays for processing by [`GapFiller`]
/// - Recording metrics
/// - Sending record batches to the next operator (by implementing [`Self::poll_next`])
#[allow(dead_code)]
pub(super) struct GapFillStream {
/// The schema of the input and output.
@ -38,12 +45,10 @@ pub(super) struct GapFillStream {
group_expr: Vec<Arc<dyn PhysicalExpr>>,
/// The aggregate columns from the select list of the original query.
aggr_expr: Vec<Arc<dyn PhysicalExpr>>,
/// The number of rows to produce in each output batch.
batch_size: usize,
/// The producer of the input record batches.
input: SendableRecordBatchStream,
/// Input that has been read from the input stream.
buffered_input_batches: Vec<RecordBatch>,
buffered_input: BufferedInput,
/// The thing that does the gap filling.
gap_filler: GapFiller,
/// This is true as long as there are more input record batches to read from `input`.
@ -83,16 +88,19 @@ impl GapFillStream {
.collect::<Vec<_>>();
let aggr_expr = aggr_expr.to_owned();
let time_expr = group_expr.split_off(group_expr.len() - 1).pop().unwrap();
let group_cols = group_expr.iter().map(expr_to_index).collect::<Vec<_>>();
let params = GapFillParams::try_new(Arc::clone(&schema), params)?;
let gap_filler = GapFiller::new(params);
let buffered_input = BufferedInput::new(&params, group_cols);
let gap_filler = GapFiller::new(params, batch_size);
Ok(Self {
schema,
time_expr,
group_expr,
aggr_expr,
batch_size,
input,
buffered_input_batches: vec![],
buffered_input,
gap_filler,
more_input: true,
reservation,
@ -112,28 +120,17 @@ impl Stream for GapFillStream {
/// Produces a gap-filled record batch from its input stream.
///
/// This method starts off by reading input until it has buffered `batch_size` + 2 rows,
/// or until there is no more input. Having at least `batch_size` rows ensures that we
/// can produce at least one full output batch. We need two additional rows so that we have
/// 1) an input row that corresponds to the row before the current output batch. This is
/// needed for the case where we are producing trailing gaps, and we need to use the
/// `take` kernel to build the group columns. There must be at least one row from the
/// corresponding series in the input to take from.
/// 2) an input row that corresponds to the next input row that will be read after the
/// current output batch. This tells us if we have processed all of our input for a series
/// but may be in "trailing gaps" mode.
///
/// Once input rows have been buffered, it will produce a gap-filled [RecordBatch] with `self.batch_size`
/// rows (or fewer, if there is no more input).
/// For details on implementation, see [`GapFiller`].
fn poll_next(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<RecordBatch>>> {
while self.more_input && self.buffered_input_row_count() < self.batch_size + 2 {
let last_output_row_offset = self.gap_filler.last_output_row_offset();
while self.more_input && self.buffered_input.need_more(last_output_row_offset)? {
match ready!(self.input.poll_next_unpin(cx)) {
Some(Ok(batch)) => {
self.reservation.try_grow(batch.get_array_memory_size())?;
self.buffered_input_batches.push(batch);
self.buffered_input.push(batch);
}
Some(Err(e)) => {
return Poll::Ready(Some(Err(e)));
@ -162,8 +159,7 @@ impl Stream for GapFillStream {
match self.process(input_batch) {
Ok((output_batch, remaining_input_batch)) => {
self.buffered_input_batches.push(remaining_input_batch);
assert_eq!(1, self.buffered_input_batches.len());
self.buffered_input.push(remaining_input_batch);
self.reservation
.shrink(output_batch.get_array_memory_size());
@ -175,30 +171,21 @@ impl Stream for GapFillStream {
}
impl GapFillStream {
/// Count of input rows that are currently buffered.
fn buffered_input_row_count(&self) -> usize {
self.buffered_input_batches
.iter()
.map(|rb| rb.num_rows())
.sum()
}
/// If any buffered input batches are present, concatenates them all together
/// and returns an owned batch to the caller, leaving `self.buffered_input_batches` empty.
fn take_buffered_input(&mut self) -> Result<Option<RecordBatch>> {
if self.buffered_input_batches.is_empty() {
let batches = self.buffered_input.take();
if batches.is_empty() {
return Ok(None);
}
let mut v = vec![];
std::mem::swap(&mut v, &mut self.buffered_input_batches);
let old_size = v.iter().map(|rb| rb.get_array_memory_size()).sum();
let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum();
let mut batch = arrow::compute::concat_batches(&self.schema, &v)
let mut batch = arrow::compute::concat_batches(&self.schema, &batches)
.map_err(DataFusionError::ArrowError)?;
self.reservation.try_grow(batch.get_array_memory_size())?;
if v.len() > 1 {
if batches.len() > 1 {
// Optimize the dictionaries. The output of this operator uses the take kernel to produce
// its output. Since the input batches will usually be smaller than the output, it should
// be less work to optimize here vs optimizing the output.
@ -234,7 +221,6 @@ impl GapFillStream {
let output_batch = self
.gap_filler
.build_gapfilled_output(
self.batch_size,
Arc::clone(&self.schema),
input_time_array,
&group_arrays,

View File

@ -4,7 +4,7 @@
use arrow::{
self,
array::{Array, BooleanArray, DictionaryArray, StringArray},
array::{downcast_array, Array, BooleanArray, DictionaryArray, StringArray},
compute,
datatypes::{DataType, Int32Type, SchemaRef},
record_batch::RecordBatch,
@ -188,9 +188,7 @@ impl SeriesSetConverter {
])
.expect("concat");
// until https://github.com/apache/arrow-rs/issues/2901 is done, use a workaround
// to get a `BooleanArray`
BooleanArray::from(arr.data().clone())
downcast_array(&arr)
}
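
For reference, a minimal sketch of what `downcast_array` does in this context: it clones the underlying `ArrayData` of a dynamically typed array into a concrete array type, replacing the old `BooleanArray::from(arr.data().clone())` workaround. The array contents below are made up.

```rust
use std::sync::Arc;

use arrow::array::{downcast_array, ArrayRef, BooleanArray};

fn main() {
    let dynamic: ArrayRef = Arc::new(BooleanArray::from(vec![true, false, true]));

    // Clone the dynamically typed array's data into a strongly typed
    // BooleanArray; this panics if the data type does not match.
    let concrete: BooleanArray = downcast_array(dynamic.as_ref());
    assert_eq!(concrete.len(), 3);
    assert!(concrete.value(0));
}
```
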
/// Creates (column_name, column_value) pairs for each column

View File

@ -73,9 +73,7 @@ use datafusion::{
scalar::ScalarValue,
};
use datafusion_util::{
sort_exprs::requirements_from_sort_exprs, watch::WatchedTask, AdapterStream,
};
use datafusion_util::{watch::WatchedTask, AdapterStream};
use futures::StreamExt;
use observability_deps::tracing::*;
use parking_lot::Mutex;
@ -215,7 +213,7 @@ impl ExecutionPlan for StreamSplitExec {
let requirement = self
.input
.output_ordering()
.map(requirements_from_sort_exprs);
.map(PhysicalSortRequirement::from_sort_exprs);
vec![requirement]
}

View File

@ -20,6 +20,7 @@ use datafusion::{error::DataFusionError, prelude::SessionContext};
use exec::{stringset::StringSet, IOxSessionContext};
use hashbrown::HashMap;
use observability_deps::tracing::{debug, trace};
use once_cell::sync::Lazy;
use parquet_file::storage::ParquetExecInput;
use predicate::{rpc_predicate::QueryNamespaceMeta, Predicate, PredicateMatch};
use schema::{
@ -45,9 +46,12 @@ pub use query_functions::group_by::{Aggregate, WindowDuration};
/// The name of the virtual column that represents the chunk order.
pub const CHUNK_ORDER_COLUMN_NAME: &str = "__chunk_order";
static CHUNK_ORDER_FIELD: Lazy<Arc<Field>> =
Lazy::new(|| Arc::new(Field::new(CHUNK_ORDER_COLUMN_NAME, DataType::Int64, false)));
/// Generate [`Field`] for [chunk order column](CHUNK_ORDER_COLUMN_NAME).
pub fn chunk_order_field() -> Field {
Field::new(CHUNK_ORDER_COLUMN_NAME, DataType::Int64, false)
pub fn chunk_order_field() -> Arc<Field> {
Arc::clone(&CHUNK_ORDER_FIELD)
}
/// Trait for an object (designed to be a Chunk) which can provide

View File

@ -14,7 +14,7 @@ use datafusion::{
optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule},
prelude::{col, Expr},
};
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, LOCF_UDF_NAME};
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME};
use std::{
collections::HashSet,
ops::{Bound, Range},
@ -349,6 +349,14 @@ impl TreeNodeRewriter for DateBinGapfillRewriter {
}
}
fn udf_to_fill_strategy(name: &str) -> Option<FillStrategy> {
match name {
LOCF_UDF_NAME => Some(FillStrategy::PrevNullAsMissing),
INTERPOLATE_UDF_NAME => Some(FillStrategy::LinearInterpolate),
_ => None,
}
}
fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
let Projection {
input,
@ -365,12 +373,16 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
return Ok(None)
};
let fill_cols: Vec<(&Expr, FillStrategy)> = proj_exprs
let fill_cols: Vec<(&Expr, FillStrategy, &str)> = proj_exprs
.iter()
.filter_map(|e| match e {
Expr::ScalarUDF { fun, args } if fun.name == LOCF_UDF_NAME => {
let col = &args[0];
Some((col, FillStrategy::PrevNullAsMissing))
Expr::ScalarUDF { fun, args } => {
if let Some(strategy) = udf_to_fill_strategy(&fun.name) {
let col = &args[0];
Some((col, strategy, fun.name.as_str()))
} else {
None
}
}
_ => None,
})
@ -383,12 +395,12 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
// Clone the existing GapFill node, then modify it in place
// to reflect the new fill strategy.
let mut new_gapfill = child_gapfill.clone();
for (e, col) in fill_cols {
if new_gapfill.replace_fill_strategy(e, col).is_none() {
// There was a gap filling function called on an aggregate column.
return Err(DataFusionError::Plan(
"LOCF must be called on an aggregate column in a gap-filling query".to_string(),
));
for (e, fs, fn_name) in fill_cols {
if new_gapfill.replace_fill_strategy(e, fs).is_none() {
// There was a gap filling function called on a non-aggregate column.
return Err(DataFusionError::Plan(format!(
"{fn_name} must be called on an aggregate column in a gap-filling query"
)));
}
}
@ -397,7 +409,9 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
.iter()
.cloned()
.map(|e| match e {
Expr::ScalarUDF { fun, mut args } if fun.name == LOCF_UDF_NAME => args.remove(0),
Expr::ScalarUDF { fun, mut args } if udf_to_fill_strategy(&fun.name).is_some() => {
args.remove(0)
}
_ => e,
})
.collect();
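
In other words, once the GapFill node has recorded a column's fill strategy, the projection drops the wrapping UDF call and keeps only its argument. A small sketch of that rewrite (illustrative, not from this commit; `is_fill_udf` stands in for `udf_to_fill_strategy(..).is_some()`):

use datafusion::logical_expr::Expr;

/// Replace `locf(col)` / `interpolate(col)` with the bare `col`; leave everything
/// else untouched.
fn unwrap_fill_udf(e: Expr, is_fill_udf: impl Fn(&str) -> bool) -> Expr {
    match e {
        Expr::ScalarUDF { fun, mut args } if is_fill_udf(fun.name.as_str()) => args.remove(0),
        other => other,
    }
}
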
@ -433,16 +447,19 @@ fn check_node(node: &LogicalPlan) -> Result<()> {
node.expressions().iter().try_for_each(|expr| {
let dbg_count = count_udf(expr, DATE_BIN_GAPFILL_UDF_NAME)?;
if dbg_count > 0 {
Err(DataFusionError::Plan(format!(
return Err(DataFusionError::Plan(format!(
"{DATE_BIN_GAPFILL_UDF_NAME} may only be used as a GROUP BY expression"
)))
} else if count_udf(expr, LOCF_UDF_NAME)? > 0 {
Err(DataFusionError::Plan(format!(
"{LOCF_UDF_NAME} may only be used in the SELECT list of a gap-filling query"
)))
} else {
Ok(())
)));
}
for fn_name in [LOCF_UDF_NAME, INTERPOLATE_UDF_NAME] {
if count_udf(expr, fn_name)? > 0 {
return Err(DataFusionError::Plan(format!(
"{fn_name} may only be used in the SELECT list of a gap-filling query"
)));
}
}
Ok(())
})
}
@ -459,7 +476,9 @@ mod test {
use datafusion::optimizer::OptimizerContext;
use datafusion::prelude::{avg, case, col, lit, lit_timestamp_nano, min, Expr};
use datafusion::scalar::ScalarValue;
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, LOCF_UDF_NAME};
use query_functions::gapfill::{
DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME,
};
fn table_scan() -> Result<LogicalPlan> {
let schema = Schema::new(vec![
@ -497,6 +516,13 @@ mod test {
})
}
fn interpolate(arg: Expr) -> Result<Expr> {
Ok(Expr::ScalarUDF {
fun: query_functions::registry().udf(INTERPOLATE_UDF_NAME)?,
args: vec![arg],
})
}
fn optimize(plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
let optimizer = Optimizer::with_rules(vec![Arc::new(HandleGapFill::default())]);
optimizer.optimize_recursively(
@ -581,6 +607,20 @@ mod test {
Ok(())
}
/// calling INTERPOLATE in a WHERE predicate is not valid
#[test]
fn misplaced_interpolate_err() -> Result<()> {
// interpolate used in a filter should produce an error
let scan = table_scan()?;
let plan = LogicalPlanBuilder::from(scan)
.filter(interpolate(col("temp"))?.gt(lit(100.0)))?
.build()?;
assert_optimizer_err(
&plan,
"Error during planning: interpolate may only be used in the SELECT list of a gap-filling query",
);
Ok(())
}
/// calling LOCF on the SELECT list but not on an aggregate column is not valid.
#[test]
fn misplaced_locf_non_agg_err() -> Result<()> {
@ -607,7 +647,7 @@ mod test {
.build()?;
assert_optimizer_err(
&plan,
"LOCF must be called on an aggregate column in a gap-filling query",
"locf must be called on an aggregate column in a gap-filling query",
);
Ok(())
}
@ -852,4 +892,37 @@ mod test {
assert_optimized_plan_eq(&plan, &expected)?;
Ok(())
}
#[test]
fn with_interpolate() -> Result<()> {
let dbg_args = "IntervalDayTime(\"60000\"),temps.time,TimestampNanosecond(0, None)";
let plan = LogicalPlanBuilder::from(table_scan()?)
.filter(
col("time")
.gt_eq(lit_timestamp_nano(1000))
.and(col("time").lt(lit_timestamp_nano(2000))),
)?
.aggregate(
vec![date_bin_gapfill(
lit(ScalarValue::IntervalDayTime(Some(60_000))),
col("time"),
)?],
vec![avg(col("temp")), min(col("temp"))],
)?
.project(vec![
col(format!("date_bin_gapfill({dbg_args})")),
interpolate(col("AVG(temps.temp)"))?,
interpolate(col("MIN(temps.temp)"))?,
])?
.build()?;
let expected = format!(
"Projection: date_bin_gapfill({dbg_args}), AVG(temps.temp), MIN(temps.temp)\
\n GapFill: groupBy=[[date_bin_gapfill({dbg_args})]], aggr=[[INTERPOLATE(AVG(temps.temp)), INTERPOLATE(MIN(temps.temp))]], time_column=date_bin_gapfill({dbg_args}), stride=IntervalDayTime(\"60000\"), range=Included(TimestampNanosecond(1000, None))..Excluded(TimestampNanosecond(2000, None))\
\n Aggregate: groupBy=[[datebin(IntervalDayTime(\"60000\"), temps.time, TimestampNanosecond(0, None))]], aggr=[[AVG(temps.temp), MIN(temps.temp)]]\
\n Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)\
\n TableScan: temps");
assert_optimized_plan_eq(&plan, &expected)?;
Ok(())
}
}


@ -1,6 +1,6 @@
use std::sync::Arc;
use arrow::datatypes::Schema as ArrowSchema;
use arrow::datatypes::{Fields, Schema as ArrowSchema};
use datafusion::physical_plan::ExecutionPlan;
use schema::Schema;
@ -40,7 +40,7 @@ fn dedup_plan_impl(
.iter()
.cloned()
.chain(std::iter::once(chunk_order_field()))
.collect(),
.collect::<Fields>(),
))
} else {
schema.as_arrow()
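
With arrow 37 a schema's field list is the dedicated `Fields` type (of `Arc<Field>` entries) rather than a `Vec<Field>`, so appending the chunk-order column means collecting into `Fields`. A minimal sketch (illustrative, not from this commit; the helper name is an assumption):

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Fields, Schema};

/// Append one extra column to an existing schema.
fn schema_with_chunk_order(schema: &Schema) -> Schema {
    let extra = Arc::new(Field::new("__chunk_order", DataType::Int64, false));
    let fields: Fields = schema
        .fields()
        .iter()
        .cloned()
        .chain(std::iter::once(extra))
        .collect();
    Schema::new(fields)
}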


@ -169,12 +169,14 @@ impl PhysicalOptimizerRule for ProjectionPushdown {
&column_names,
Arc::clone(child_sort.input()),
|plan| {
Ok(Arc::new(SortExec::new_with_partitioning(
reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?,
plan,
child_sort.preserve_partitioning(),
child_sort.fetch(),
)))
Ok(Arc::new(
SortExec::new(
reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?,
plan,
)
.with_preserve_partitioning(child_sort.preserve_partitioning())
.with_fetch(child_sort.fetch()),
))
},
)?;
@ -930,7 +932,7 @@ mod tests {
ProjectionExec::try_new(
vec![(expr_col("tag1", &schema), String::from("tag1"))],
Arc::new(
SortExec::try_new(
SortExec::new(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
@ -939,9 +941,8 @@ mod tests {
},
}],
Arc::new(TestExec::new(schema)),
Some(42),
)
.unwrap(),
.with_fetch(Some(42)),
),
)
.unwrap(),
@ -971,18 +972,20 @@ mod tests {
let plan = Arc::new(
ProjectionExec::try_new(
vec![(expr_col("tag1", &schema), String::from("tag1"))],
Arc::new(SortExec::new_with_partitioning(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
descending: true,
..Default::default()
},
}],
Arc::new(TestExec::new_with_partitions(schema, 2)),
true,
Some(42),
)),
Arc::new(
SortExec::new(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
descending: true,
..Default::default()
},
}],
Arc::new(TestExec::new_with_partitions(schema, 2)),
)
.with_preserve_partitioning(true)
.with_fetch(Some(42)),
),
)
.unwrap(),
);
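
The SortExec changes reflect DataFusion's move from constructor variants (`try_new`, `new_with_partitioning`) to a builder style where optional settings are chained. A minimal sketch (illustrative, not from this commit; the helper name, the "time" column, and the import paths are assumptions for this DataFusion revision):

use std::sync::Arc;

use arrow::compute::SortOptions;
use datafusion::error::Result;
use datafusion::physical_expr::PhysicalSortExpr;
use datafusion::physical_plan::{expressions::col, sorts::sort::SortExec, ExecutionPlan};

/// Sort `input` by "time" descending, keep its partitioning, and fetch at most 42 rows.
fn sorted_top_42(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
    let schema = input.schema();
    let sort_exprs = vec![PhysicalSortExpr {
        expr: col("time", &schema)?,
        options: SortOptions {
            descending: true,
            ..Default::default()
        },
    }];
    Ok(Arc::new(
        SortExec::new(sort_exprs, input)
            .with_preserve_partitioning(true)
            .with_fetch(Some(42)),
    ))
}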

Some files were not shown because too many files have changed in this diff.