Merge branch 'main' into ntran/dedup_compare_cols_order

pull/24376/head
kodiakhq[bot] 2021-07-21 15:42:30 +00:00 committed by GitHub
commit 18dd108ba6
49 changed files with 3323 additions and 2286 deletions

Cargo.lock generated

@ -769,9 +769,9 @@ dependencies = [
[[package]]
name = "crypto-mac"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6"
checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a"
dependencies = [
"generic-array",
"subtle",
@ -826,6 +826,7 @@ dependencies = [
"influxdb_line_protocol",
"num_cpus",
"observability_deps",
"once_cell",
"percent-encoding",
"regex",
"serde",
@ -843,7 +844,7 @@ dependencies = [
[[package]]
name = "datafusion"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3"
dependencies = [
"ahash 0.7.4",
"arrow",
@ -4330,9 +4331,9 @@ dependencies = [
[[package]]
name = "tinyvec"
version = "1.2.0"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342"
checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0"
dependencies = [
"tinyvec_macros",
]
@ -4345,9 +4346,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.8.1"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985"
checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207"
dependencies = [
"autocfg",
"bytes",
@ -4984,9 +4985,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
[[package]]
name = "zeroize"
version = "1.4.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18"
checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd"
[[package]]
name = "zstd"


@ -15,6 +15,7 @@ regex = "1.4"
serde = { version = "1.0", features = ["rc", "derive"] }
snafu = "0.6"
observability_deps = { path = "../observability_deps" }
once_cell = { version = "1.4.0", features = ["parking_lot"] }
[dev-dependencies] # In alphabetical order
test_helpers = { path = "../test_helpers" }


@ -166,6 +166,10 @@ pub struct LifecycleRules {
/// Maximum number of rows to buffer in a MUB chunk before compacting it
pub mub_row_threshold: NonZeroUsize,
/// Use up to this amount of space in bytes for caching Parquet files. None
/// will disable Parquet file caching.
pub parquet_cache_limit: Option<NonZeroU64>,
}
impl LifecycleRules {
@ -195,6 +199,7 @@ impl Default for LifecycleRules {
persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS)
.unwrap(),
mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(),
parquet_cache_limit: None,
}
}
}
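
As a quick illustration of the new field (a minimal sketch, assuming the data_types::database_rules::LifecycleRules path and relying on the Default impl above for the remaining fields):

use std::num::NonZeroU64;
use data_types::database_rules::LifecycleRules;

fn main() {
    // Hypothetical configuration: cap the Parquet cache at 1 GiB; all other
    // lifecycle rules keep the defaults from `impl Default` above.
    let rules = LifecycleRules {
        parquet_cache_limit: NonZeroU64::new(1024 * 1024 * 1024),
        ..Default::default()
    };

    // `None` (the default) disables Parquet file caching.
    assert_eq!(rules.parquet_cache_limit.map(|v| v.get()), Some(1_073_741_824));
}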

data_types/src/instant.rs Normal file

@ -0,0 +1,53 @@
use chrono::{DateTime, Utc};
use once_cell::sync::OnceCell;
use std::time::Instant;
/// Stores an Instant and a DateTime<Utc> captured as close together as possible
static INSTANCE: OnceCell<(DateTime<Utc>, Instant)> = OnceCell::new();
/// Provides a conversion from Instant to DateTime<Utc> for display purposes
///
/// It is an approximation: if the system clock changes, the returned DateTime will not be
/// the same as the DateTime that would have been recorded at the time the Instant was created.
///
/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger
/// Instant will have a larger returned DateTime.
///
/// This should ONLY be used for display purposes; the results should not be used to
/// drive logic, nor persisted
pub fn to_approximate_datetime(instant: Instant) -> DateTime<Utc> {
let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now()));
if ref_instant > instant {
ref_date
- chrono::Duration::from_std(ref_instant.duration_since(instant))
.expect("date overflow")
} else {
ref_date
+ chrono::Duration::from_std(instant.duration_since(ref_instant))
.expect("date overflow")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_to_datetime() {
// Seed global state
to_approximate_datetime(Instant::now());
let (ref_date, ref_instant) = *INSTANCE.get().unwrap();
assert_eq!(
to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)),
ref_date + chrono::Duration::nanoseconds(78)
);
assert_eq!(
to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)),
ref_date - chrono::Duration::nanoseconds(23)
);
}
}
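
A minimal usage sketch of the contract described above (assuming the data_types::instant path for the new module): the absolute wall-clock values are approximate, but ordering is preserved.

use std::time::{Duration, Instant};
use data_types::instant::to_approximate_datetime;

fn main() {
    // Display-only conversion: a later Instant always maps to a later (or equal) DateTime.
    let earlier = Instant::now();
    let later = earlier + Duration::from_millis(5);
    assert!(to_approximate_datetime(earlier) <= to_approximate_datetime(later));
}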


@ -13,12 +13,14 @@
pub mod chunk_metadata;
pub mod consistent_hasher;
mod database_name;
pub use database_name::*;
pub mod database_rules;
pub mod database_state;
pub mod error;
pub mod instant;
pub mod job;
pub mod names;
pub mod partition_metadata;
pub mod server_id;
pub mod timestamp;
pub mod write_summary;
pub use database_name::*;


@ -0,0 +1,20 @@
use chrono::{DateTime, Utc};
/// A description of a set of writes
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct WriteSummary {
/// The wall clock timestamp of the first write in this summary
pub time_of_first_write: DateTime<Utc>,
/// The wall clock timestamp of the last write in this summary
pub time_of_last_write: DateTime<Utc>,
/// The minimum row timestamp for data in this summary
pub min_timestamp: DateTime<Utc>,
/// The maximum row timestamp value for data in this summary
pub max_timestamp: DateTime<Utc>,
/// The number of rows in this summary
pub row_count: usize,
}


@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version"
# Rename to work around doctest bug
# Turn off optional datafusion features (function packages)
upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" }
upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" }


@ -82,6 +82,10 @@ message LifecycleRules {
// If 0, compactions are limited to the default number.
// See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS
uint32 max_active_compactions = 16;
// Use up to this amount of space in bytes for caching Parquet files.
// A value of 0 disables Parquet caching
uint64 parquet_cache_limit = 17;
}
message DatabaseRules {


@ -35,6 +35,10 @@ impl From<LifecycleRules> for management::LifecycleRules {
persist_row_threshold: config.persist_row_threshold.get() as u64,
persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(),
mub_row_threshold: config.mub_row_threshold.get() as u64,
parquet_cache_limit: config
.parquet_cache_limit
.map(|v| v.get())
.unwrap_or_default(),
}
}
}
@ -69,6 +73,7 @@ impl TryFrom<management::LifecycleRules> for LifecycleRules {
.unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()),
mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize)
.unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()),
parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit),
})
}
}
@ -93,6 +98,7 @@ mod tests {
persist_row_threshold: 57,
persist_age_threshold_seconds: 23,
mub_row_threshold: 3454,
parquet_cache_limit: 10,
};
let config: LifecycleRules = protobuf.clone().try_into().unwrap();
@ -125,6 +131,11 @@ mod tests {
protobuf.persist_age_threshold_seconds
);
assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold);
assert_eq!(
config.parquet_cache_limit.unwrap().get(),
protobuf.parquet_cache_limit
);
assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit);
}
#[test]

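The conversions above lean on 0 being the protobuf default: NonZeroU64::new maps 0 to None, and unwrap_or_default() maps None back to 0. A standard-library-only sketch of that round trip:

use std::num::NonZeroU64;

fn main() {
    // Protobuf -> Rust: 0 ("caching disabled") becomes None with no special case.
    assert_eq!(NonZeroU64::new(0), None);
    assert_eq!(NonZeroU64::new(10).map(|v| v.get()), Some(10));

    // Rust -> protobuf: None becomes 0 again.
    let disabled: Option<NonZeroU64> = None;
    assert_eq!(disabled.map(|v| v.get()).unwrap_or_default(), 0u64);
}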

@ -11,7 +11,7 @@ use arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
SchemaRef as ArrowSchemaRef, TimeUnit,
};
use snafu::Snafu;
use snafu::{OptionExt, Snafu};
use crate::{
schema::sort::{ColumnSort, SortKey},
@ -395,11 +395,9 @@ impl Schema {
pub fn compute_select_indicies(&self, columns: &[&str]) -> Result<Vec<usize>> {
columns
.iter()
.map(|column_name| {
.map(|&column_name| {
self.find_index_of(column_name)
.ok_or_else(|| Error::ColumnNotFound {
column_name: column_name.to_string(),
})
.context(ColumnNotFound { column_name })
})
.collect()
}
@ -788,12 +786,12 @@ macro_rules! assert_column_eq {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use InfluxColumnType::*;
use InfluxFieldType::*;
use super::{builder::SchemaBuilder, *};
use crate::schema::merge::SchemaMerger;
use crate::schema::sort::SortOptions;
fn make_field(
name: &str,

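The compute_select_indicies change above swaps a manual ok_or_else for snafu's OptionExt::context. A self-contained sketch of that pattern with snafu 0.6 (the error type and lookup below are illustrative, not IOx's actual definitions):

use snafu::{OptionExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Column '{}' not found", column_name))]
    ColumnNotFound { column_name: String },
}

// OptionExt::context turns an Option into a Result, building the error from the
// generated context selector; the &str converts into the String field via Into.
fn find_index(columns: &[&str], column_name: &str) -> Result<usize, Error> {
    columns
        .iter()
        .position(|&c| c == column_name)
        .context(ColumnNotFound { column_name })
}

fn main() {
    let columns = ["time", "state", "city"];
    assert!(find_index(&columns, "state").is_ok());
    assert!(find_index(&columns, "area").is_err());
}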

@ -1,5 +1,6 @@
use std::{fmt::Display, str::FromStr};
use arrow::compute::SortOptions;
use indexmap::{map::Iter, IndexMap};
use itertools::Itertools;
use snafu::Snafu;
@ -23,24 +24,6 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Temporary - <https://github.com/apache/arrow-rs/pull/425>
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct SortOptions {
/// Whether to sort in descending order
pub descending: bool,
/// Whether to sort nulls first
pub nulls_first: bool,
}
impl Default for SortOptions {
fn default() -> Self {
Self {
descending: false,
nulls_first: true,
}
}
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct ColumnSort {
/// Position of this column in the sort key


@ -1399,6 +1399,7 @@ mod tests {
let rules = LifecycleRules {
late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
max_active_compactions: NonZeroU32::new(10).unwrap(),
..Default::default()
};
@ -1538,6 +1539,7 @@ mod tests {
persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(),
max_active_compactions: NonZeroU32::new(10).unwrap(),
..Default::default()
};
let now = Instant::now();


@ -7,12 +7,13 @@ use std::{
use chrono::{DateTime, TimeZone, Utc};
use data_types::partition_metadata::PartitionAddr;
use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary};
use entry::Sequence;
use internal_types::guard::{ReadGuard, ReadLock};
use crate::checkpoint::PartitionCheckpoint;
use crate::min_max_sequence::MinMaxSequence;
use data_types::instant::to_approximate_datetime;
const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30);
@ -45,6 +46,9 @@ pub struct PersistenceWindows {
late_arrival_period: Duration,
closed_window_period: Duration,
/// The instant this PersistenceWindows was created
created_at: Instant,
/// The last instant passed to PersistenceWindows::add_range
last_instant: Instant,
@ -106,6 +110,8 @@ impl PersistenceWindows {
let closed_window_count = late_arrival_seconds / closed_window_seconds;
let created_at_instant = Instant::now();
Self {
persistable: ReadLock::new(None),
closed: VecDeque::with_capacity(closed_window_count as usize),
@ -113,11 +119,18 @@ impl PersistenceWindows {
addr,
late_arrival_period,
closed_window_period,
last_instant: Instant::now(),
created_at: created_at_instant,
last_instant: created_at_instant,
max_sequence_numbers: Default::default(),
}
}
/// Updates the late arrival period of this `PersistenceWindows` instance
pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) {
self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD);
self.late_arrival_period = late_arrival_period;
}
/// Updates the windows with the information from a batch of rows from a single sequencer
/// to the same partition. The min and max times are the times on the row data. The `received_at`
/// Instant is when the data was received. Taking it in this function is really just about
@ -165,7 +178,7 @@ impl PersistenceWindows {
self.rotate(received_at);
match self.open.as_mut() {
Some(w) => w.add_range(sequence, row_count, min_time, max_time),
Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at),
None => {
self.open = Some(Window::new(
received_at,
@ -335,6 +348,21 @@ impl PersistenceWindows {
self.windows().next()
}
/// Returns approximate summaries of the unpersisted writes recorded
/// by this PersistenceWindows instance
///
/// These are approximate because persistence may partially flush a window, which will
/// update the min row timestamp but not the row count
pub fn summaries(&self) -> impl Iterator<Item = WriteSummary> + '_ {
self.windows().map(move |window| WriteSummary {
time_of_first_write: to_approximate_datetime(window.created_at),
time_of_last_write: to_approximate_datetime(window.last_instant),
min_timestamp: window.min_time,
max_timestamp: window.max_time,
row_count: window.row_count,
})
}
/// Returns true if this PersistenceWindows instance is empty
pub fn is_empty(&self) -> bool {
self.minimum_window().is_none()
@ -374,9 +402,14 @@ struct Window {
/// The server time when this window was created. Used to determine how long data in this
/// window has been sitting in memory.
created_at: Instant,
/// The server time of the last write to this window
last_instant: Instant,
/// The number of rows in the window
row_count: usize,
min_time: DateTime<Utc>, // min time value for data in the window
max_time: DateTime<Utc>, // max time value for data in the window
/// min time value for data in the window
min_time: DateTime<Utc>,
/// max time value for data in the window
max_time: DateTime<Utc>,
/// maps sequencer_id to the minimum and maximum sequence numbers seen
sequencer_numbers: BTreeMap<u32, MinMaxSequence>,
}
@ -399,6 +432,7 @@ impl Window {
Self {
created_at,
last_instant: created_at,
row_count,
min_time,
max_time,
@ -414,7 +448,11 @@ impl Window {
row_count: usize,
min_time: DateTime<Utc>,
max_time: DateTime<Utc>,
instant: Instant,
) {
assert!(self.created_at <= instant);
self.last_instant = instant;
self.row_count += row_count;
if self.min_time > min_time {
self.min_time = min_time;
@ -440,6 +478,10 @@ impl Window {
/// Add one window to another. Used to collapse closed windows into persisted.
fn add_window(&mut self, other: Self) {
assert!(self.last_instant <= other.created_at);
assert!(self.last_instant <= other.last_instant);
self.last_instant = other.last_instant;
self.row_count += other.row_count;
if self.min_time > other.min_time {
self.min_time = other.min_time;
@ -1265,4 +1307,119 @@ mod tests {
assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2));
assert_eq!(w.closed[1].row_count, 11);
}
#[test]
fn test_summaries() {
let late_arrival_period = Duration::from_secs(100);
let mut w = make_windows(late_arrival_period);
let instant = w.created_at;
let created_at_time = to_approximate_datetime(w.created_at);
// Window 1
w.add_range(
Some(&Sequence { id: 1, number: 1 }),
11,
Utc.timestamp_nanos(10),
Utc.timestamp_nanos(11),
instant + Duration::from_millis(1),
);
w.add_range(
Some(&Sequence { id: 1, number: 2 }),
4,
Utc.timestamp_nanos(10),
Utc.timestamp_nanos(340),
instant + Duration::from_millis(30),
);
w.add_range(
Some(&Sequence { id: 1, number: 3 }),
6,
Utc.timestamp_nanos(1),
Utc.timestamp_nanos(5),
instant + Duration::from_millis(50),
);
// More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2
w.add_range(
Some(&Sequence { id: 1, number: 4 }),
3,
Utc.timestamp_nanos(89),
Utc.timestamp_nanos(90),
instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1),
);
// More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3
w.add_range(
Some(&Sequence { id: 1, number: 5 }),
8,
Utc.timestamp_nanos(3),
Utc.timestamp_nanos(4),
instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3,
);
let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap();
let summaries: Vec<_> = w.summaries().collect();
assert_eq!(summaries.len(), 3);
assert_eq!(
summaries,
vec![
WriteSummary {
time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time + chrono::Duration::milliseconds(50),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(340),
row_count: 21
},
WriteSummary {
time_of_first_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
min_timestamp: Utc.timestamp_nanos(89),
max_timestamp: Utc.timestamp_nanos(90),
row_count: 3
},
WriteSummary {
time_of_first_write: created_at_time + closed_duration * 3,
time_of_last_write: created_at_time + closed_duration * 3,
min_timestamp: Utc.timestamp_nanos(3),
max_timestamp: Utc.timestamp_nanos(4),
row_count: 8
},
]
);
// Rotate first and second windows into persistable
w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2);
let summaries: Vec<_> = w.summaries().collect();
assert_eq!(summaries.len(), 2);
assert_eq!(
summaries,
vec![
WriteSummary {
time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(340),
row_count: 24
},
WriteSummary {
time_of_first_write: created_at_time + closed_duration * 3,
time_of_last_write: created_at_time + closed_duration * 3,
min_timestamp: Utc.timestamp_nanos(3),
max_timestamp: Utc.timestamp_nanos(4),
row_count: 8
},
]
);
}
}
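
For consumers of the new summaries() API, a hypothetical logging helper (it assumes only the WriteSummary fields added in this PR; callers would pass windows.summaries()):

use data_types::write_summary::WriteSummary;

// Prints one line per unpersisted window, using the approximate write summaries.
fn log_unpersisted_writes(summaries: impl Iterator<Item = WriteSummary>) {
    for s in summaries {
        println!(
            "{} rows, row times [{}, {}], written between {} and {}",
            s.row_count, s.min_timestamp, s.max_timestamp, s.time_of_first_write, s.time_of_last_write,
        );
    }
}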


@ -39,6 +39,7 @@ use crate::plan::{
};
use self::{
context::IOxExecutionConfig,
split::StreamSplitNode,
task::{DedicatedExecutor, Error as ExecutorError},
};
@ -111,6 +112,9 @@ pub struct Executor {
/// Executor for running system/reorganization tasks such as
/// compact
reorg_exec: DedicatedExecutor,
/// The default configuration options with which to create contexts
config: IOxExecutionConfig,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@ -128,12 +132,25 @@ impl Executor {
let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads);
let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads);
let config = IOxExecutionConfig::new();
Self {
query_exec,
reorg_exec,
config,
}
}
/// returns the config of this executor
pub fn config(&self) -> &IOxExecutionConfig {
&self.config
}
/// returns a mutable reference to this executor's config
pub fn config_mut(&mut self) -> &mut IOxExecutionConfig {
&mut self.config
}
/// Executes this plan on the query pool, and returns the
/// resulting set of strings
pub async fn to_string_set(&self, plan: StringSetPlan) -> Result<StringSetRef> {
@ -289,7 +306,7 @@ impl Executor {
pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext {
let executor = self.executor(executor_type).clone();
IOxExecutionContext::new(executor)
IOxExecutionContext::new(executor, self.config.clone())
}
/// Return the execution pool of the specified type

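The new config plumbing lets callers tune DataFusion settings before any contexts are created. A minimal sketch, assuming the query::exec paths and an Executor::new(num_threads) constructor as suggested by the diff above:

use query::exec::{Executor, ExecutorType};

fn main() {
    // Assumed constructor; the diff shows DedicatedExecutor::new(name, num_threads)
    // being built inside Executor::new.
    let mut executor = Executor::new(4);

    // Tune DataFusion concurrency via the new mutable config accessor.
    executor.config_mut().set_concurrency(4);

    // Contexts created afterwards pick up the configured options.
    let _ctx = executor.new_context(ExecutorType::Query);
}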

@ -5,6 +5,7 @@ use std::{fmt, sync::Arc};
use arrow::record_batch::RecordBatch;
use datafusion::{
catalog::catalog::CatalogProvider,
execution::context::{ExecutionContextState, QueryPlanner},
logical_plan::{LogicalPlan, UserDefinedLogicalNode},
physical_plan::{
@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner {
}
}
/// Configuration for an IOx execution context
#[derive(Clone)]
pub struct IOxExecutionConfig {
/// Configuration options to pass to DataFusion
inner: ExecutionConfig,
}
impl Default for IOxExecutionConfig {
fn default() -> Self {
const BATCH_SIZE: usize = 1000;
// Setup default configuration
let inner = ExecutionConfig::new()
.with_batch_size(BATCH_SIZE)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
.with_query_planner(Arc::new(IOxQueryPlanner {}));
Self { inner }
}
}
impl fmt::Debug for IOxExecutionConfig {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "IOxExecutionConfig ...")
}
}
impl IOxExecutionConfig {
pub fn new() -> Self {
Default::default()
}
/// Set execution concurrency
pub fn set_concurrency(&mut self, concurrency: usize) {
self.inner.concurrency = concurrency;
}
}
/// This is an execution context for planning in IOx. It wraps a
/// DataFusion execution context with the information needed for planning.
///
@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext {
impl IOxExecutionContext {
/// Create an ExecutionContext suitable for executing DataFusion plans
///
/// The config is created with a default catalog and schema, but this
/// can be overridden at a later date
pub fn new(exec: DedicatedExecutor) -> Self {
const BATCH_SIZE: usize = 1000;
// TBD: Should we be reusing an execution context across all executions?
let config = ExecutionConfig::new()
.with_batch_size(BATCH_SIZE)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
.with_query_planner(Arc::new(IOxQueryPlanner {}));
let inner = ExecutionContext::with_config(config);
pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self {
let inner = ExecutionContext::with_config(config.inner);
Self { inner, exec }
}
@ -160,11 +188,13 @@ impl IOxExecutionContext {
&self.inner
}
/// returns a mutable reference to the inner datafusion execution context
pub fn inner_mut(&mut self) -> &mut ExecutionContext {
&mut self.inner
/// registers a catalog with the inner context
pub fn register_catalog(&mut self, name: impl Into<String>, catalog: Arc<dyn CatalogProvider>) {
self.inner.register_catalog(name, catalog);
}
///
/// Prepare a SQL statement for execution. This assumes that any
/// tables referenced in the SQL have been registered with this context
pub fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {


@ -268,8 +268,9 @@ struct ScanPlan<C: QueryChunk + 'static> {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use arrow_util::assert_batches_eq;
use internal_types::schema::{merge::SchemaMerger, sort::SortOptions};
use internal_types::schema::merge::SchemaMerger;
use crate::{
exec::{Executor, ExecutorType},


@ -87,7 +87,7 @@ impl SqlQueryPlanner {
executor: &Executor,
) -> Result<Arc<dyn ExecutionPlan>> {
let mut ctx = executor.new_context(ExecutorType::Query);
ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database);
ctx.register_catalog(DEFAULT_CATALOG, database);
ctx.prepare_sql(query).context(Preparing)
}
}


@ -366,21 +366,12 @@ impl RecordBatchDeduplicator {
}
/// Create a new record batch from offset --> len
///
/// <https://github.com/apache/arrow-rs/issues/460> for adding this upstream
fn slice_record_batch(
batch: &RecordBatch,
offset: usize,
len: usize,
) -> ArrowResult<RecordBatch> {
let schema = batch.schema();
let new_columns: Vec<_> = batch
.columns()
.iter()
.map(|old_column| old_column.slice(offset, len))
.collect();
let batch = RecordBatch::try_new(schema, new_columns)?;
let batch = batch.slice(offset, len);
// At time of writing, `concat_batches` concatenates the
// contents of dictionaries as well; Do a post pass to remove the

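The deduplicator change above replaces hand-rolled per-column slicing with arrow's built-in RecordBatch::slice, which returns a zero-copy view over the same buffers. A standalone sketch of that API:

use std::sync::Arc;
use arrow::array::{ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let column: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5]));
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Zero-copy slice of rows 1..4 (offset = 1, length = 3).
    let sliced = batch.slice(1, 3);
    assert_eq!(sliced.num_rows(), 3);
}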

@ -1,25 +1,27 @@
-- Test Setup: OneMeasurementAllChunksDropped
-- SQL: SELECT * from information_schema.tables;
+---------------+--------------------+---------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------+------------+
+---------------+--------------------+---------------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | system | persistence_windows | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------------+------------+
-- SQL: SHOW TABLES;
+---------------+--------------------+---------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------+------------+
+---------------+--------------------+---------------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | system | persistence_windows | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------------+------------+


@ -1,86 +1,87 @@
-- Test Setup: OneMeasurementThreeChunksWithDuplicates
-- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o;
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
+-----------------------------------------+-------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-------------------------------------------------------------------------------+
| logical_plan | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=None |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| logical_plan after simplify_expressions | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| physical_plan | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@1 as state] |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | ProjectionExec: expr=[city@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[city@0 as city] |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-------------------------------------------------------------------------------+
-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
| | CoalescePartitionsExec |
| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+---------------+-----------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------------------------------------+
| logical_plan | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| physical_plan | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@1 as state] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | ProjectionExec: expr=[city@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[city@0 as city] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+-----------------------------------------------------------------------------------+


@ -2,11 +2,11 @@
-- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates
-- Plan with order by
explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
-- plan without order by
explain verbose select time, state, city, min_temp, max_temp, area from h2o;
EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
-- Union plan
EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;


@ -1,218 +1,167 @@
-- Test Setup: TwoMeasurementsPredicatePushDown
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Float64) > 200 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 4 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 4 AND system@1 < 7 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND system@1 < 7 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant;
+---------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+---------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200;
+---------------+--------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] |
+---------------+--------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0;
+---------------+----------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Float64) > 200 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] |
+---------------+----------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0;
+---------------+---------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 4 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] |
+---------------+---------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 4 AND system@1 < 7 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND system@1 < 7 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

View File

@ -2,44 +2,44 @@
-- IOX_SETUP: TwoMeasurementsPredicatePushDown
-- Test 1: Select everything
EXPLAIN VERBOSE SELECT * from restaurant;
EXPLAIN SELECT * from restaurant;
-- Test 2: One push-down expression: count > 200
-- TODO: Make push-down predicates show up in EXPLAIN VERBOSE. Ticket #1538
EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
EXPLAIN SELECT * from restaurant where count > 200;
-- Test 2.2: One push-down expression: count > 200.0
EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
EXPLAIN SELECT * from restaurant where count > 200.0;
-- Test 2.3: One push-down expression: system > 4.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
EXPLAIN SELECT * from restaurant where system > 4.0;
-- Test 3: Two push-down expressions: count > 200 and town != 'tewsbury'
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';
-- Test 4: Still two push-down expressions: count > 200 and town != 'tewsbury'
-- even though the results are different
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
-- Test 5: three push-down expressions: count > 200 and town != 'tewsbury' and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
-- Test 6: two push-down expressions: count > 200 and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;
-- Test 7: two push-down expressions on float: system > 4.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;
-- Test 8: two push-down expressions on float: system > 5.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;
-- Test 9: three push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
-- Test 10: three push-down expressions: system > 5.0 and 'tewsbury' != town and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
-- Test 11: four push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0 and
-- time > to_timestamp('1970-01-01T00:00:00.000000130+00:00') rewritten to time GT INT(130)
EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');

View File

@ -4,12 +4,16 @@ mod parse;
mod setup;
use arrow::record_batch::RecordBatch;
use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner};
use query::{
exec::{Executor, ExecutorType},
frontend::sql::SqlQueryPlanner,
};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
io::LineWriter,
io::Write,
path::{Path, PathBuf},
sync::Arc,
};
use self::{parse::TestQueries, setup::TestSetup};
@ -261,7 +265,13 @@ impl<W: Write> Runner<W> {
writeln!(self.log, "Running scenario '{}'", scenario_name)?;
writeln!(self.log, "SQL: '{:#?}'", sql)?;
let planner = SqlQueryPlanner::default();
let executor = db.executor();
let num_threads = 1;
let mut executor = Executor::new(num_threads);
// hardcode concurrency in tests as by default it is the
// number of cores, which varies across machines
executor.config_mut().set_concurrency(4);
let executor = Arc::new(executor);
let physical_plan = planner
.query(db, &sql, executor.as_ref())

View File

@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() {
// validate we have access to information schema for listing table
// names
let expected = vec![
"+---------------+--------------------+---------------+------------+",
"| table_catalog | table_schema | table_name | table_type |",
"+---------------+--------------------+---------------+------------+",
"| public | information_schema | columns | VIEW |",
"| public | information_schema | tables | VIEW |",
"| public | iox | h2o | BASE TABLE |",
"| public | iox | o2 | BASE TABLE |",
"| public | system | chunk_columns | BASE TABLE |",
"| public | system | chunks | BASE TABLE |",
"| public | system | columns | BASE TABLE |",
"| public | system | operations | BASE TABLE |",
"+---------------+--------------------+---------------+------------+",
"+---------------+--------------------+---------------------+------------+",
"| table_catalog | table_schema | table_name | table_type |",
"+---------------+--------------------+---------------------+------------+",
"| public | information_schema | columns | VIEW |",
"| public | information_schema | tables | VIEW |",
"| public | iox | h2o | BASE TABLE |",
"| public | iox | o2 | BASE TABLE |",
"| public | system | chunk_columns | BASE TABLE |",
"| public | system | chunks | BASE TABLE |",
"| public | system | columns | BASE TABLE |",
"| public | system | operations | BASE TABLE |",
"| public | system | persistence_windows | BASE TABLE |",
"+---------------+--------------------+---------------------+------------+",
];
run_sql_test_case!(
TwoMeasurementsManyFields {},

View File

@ -8,7 +8,7 @@ use data_types::{
DatabaseName,
};
use metrics::MetricRegistry;
use object_store::{path::ObjectStorePath, ObjectStore};
use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi};
use parquet_file::catalog::PreservedCatalog;
use query::exec::Executor;
use write_buffer::config::WriteBufferConfig;
@ -16,9 +16,13 @@ use write_buffer::config::WriteBufferConfig;
/// This module contains code for managing the configuration of the server.
use crate::{
db::{catalog::Catalog, DatabaseToCommit, Db},
Error, JobRegistry, Result,
DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error,
InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch,
ServerShuttingDown,
};
use object_store::path::Path;
use observability_deps::tracing::{self, error, info, warn, Instrument};
use snafu::{ensure, OptionExt};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
@ -34,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb";
/// run to completion if the tokio runtime is dropped
#[derive(Debug)]
pub(crate) struct Config {
shutdown: CancellationToken,
jobs: Arc<JobRegistry>,
state: RwLock<ConfigState>,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
metric_registry: Arc<MetricRegistry>,
shutdown: CancellationToken,
state: RwLock<ConfigState>,
}
pub(crate) enum UpdateError<E> {
@ -55,14 +63,20 @@ impl Config {
/// Create new empty config.
pub(crate) fn new(
jobs: Arc<JobRegistry>,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
metric_registry: Arc<MetricRegistry>,
remote_template: Option<RemoteTemplate>,
) -> Self {
Self {
jobs,
object_store,
exec,
server_id,
metric_registry,
shutdown: Default::default(),
state: RwLock::new(ConfigState::new(remote_template)),
jobs,
metric_registry,
}
}
@ -77,33 +91,20 @@ impl Config {
/// This only works if the database is not yet known. To recover a database out of an uninitialized state, see
/// [`recover_db`](Self::recover_db). To do maintenance work on data linked to the database (e.g. the catalog)
/// without initializing it, see [`block_db`](Self::block_db).
pub(crate) fn create_db(
&self,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
db_name: DatabaseName<'static>,
) -> Result<DatabaseHandle<'_>> {
pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
if state.databases.contains_key(&db_name) {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
ensure!(
!state.databases.contains_key(&db_name),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
Ok(DatabaseHandle {
state: Some(Arc::new(DatabaseState::Known {
object_store,
exec,
server_id,
db_name,
})),
state: Some(Arc::new(DatabaseState::Known { db_name })),
config: &self,
})
}
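The rewrites in this file lean on snafu's generated context selectors (`DatabaseReserved { db_name }`, `DatabaseNotFound { .. }`, and so on) together with `ensure!`, `context`, and `.fail()`. A minimal sketch of that pattern, assuming snafu 0.6-style derives; the names and function below are illustrative and not part of the diff:

use snafu::{ensure, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("database ({}) already reserved", db_name))]
    DatabaseReserved { db_name: String },
}

fn reserve(already_reserved: bool, db_name: &str) -> Result<(), Error> {
    // Equivalent to the removed manual pattern:
    //     if already_reserved { return Err(Error::DatabaseReserved { db_name: db_name.to_string() }); }
    ensure!(!already_reserved, DatabaseReserved { db_name });
    Ok(())
}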
@ -115,32 +116,27 @@ impl Config {
/// While the handle is held, no other operations for the given database can be executed.
///
/// This only works if the database is known but is uninitialized. To create a new database that is not yet known,
/// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog)
/// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog)
/// without initializing it, see [`block_db`](Self::block_db).
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
let db_state =
state
.databases
.get(&db_name)
.cloned()
.ok_or_else(|| Error::DatabaseNotFound {
db_name: db_name.to_string(),
})?;
let db_state = state
.databases
.get(&db_name)
.cloned()
.context(DatabaseNotFound { db_name: &db_name })?;
if db_state.is_initialized() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!db_state.is_initialized(),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
state.reservations.insert(db_name);
Ok(DatabaseHandle {
state: Some(db_state),
config: &self,
@ -159,16 +155,14 @@ impl Config {
db_name: DatabaseName<'static>,
) -> Result<BlockDatabaseGuard<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
if state.databases.contains_key(&db_name) {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
ensure!(
!state.databases.contains_key(&db_name),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
Ok(BlockDatabaseGuard {
@ -228,11 +222,9 @@ impl Config {
// TODO: implement for non-initialized databases
let db = self
.db_initialized(db_name)
.ok_or_else(|| Error::DatabaseNotFound {
db_name: db_name.to_string(),
})?;
.context(DatabaseNotFound { db_name })?;
db.update_db_rules(update).map_err(UpdateError::Closure)
db.update_rules(update).map_err(UpdateError::Closure)
}
/// Get all registered remote servers.
@ -311,6 +303,24 @@ impl Config {
pub fn metrics_registry(&self) -> Arc<MetricRegistry> {
Arc::clone(&self.metric_registry)
}
/// Returns the object store of this server
pub fn object_store(&self) -> Arc<ObjectStore> {
Arc::clone(&self.object_store)
}
/// Returns the server id of this server
pub fn server_id(&self) -> ServerId {
self.server_id
}
/// Base location in object store for this server.
pub fn root_path(&self) -> Path {
let id = self.server_id.get();
let mut path = self.object_store.new_path();
path.push_dir(format!("{}", id));
path
}
}
/// Get object store path for the database config under the given root (= path under which the server with the current ID
@ -373,41 +383,14 @@ impl RemoteTemplate {
}
/// Internal representation of the different database states.
///
/// # Shared Data During Transitions
/// The following elements can safely be shared between states because they won't be poisoned by any half-done
/// transition (e.g. starting a transition and then failing due to an IO error):
/// - `object_store`
/// - `exec`
///
/// The following elements can trivially be copied from one state to the next:
/// - `server_id`
/// - `db_name`
///
/// The following elements MUST be copied from one state to the next because partial modifications are not allowed:
/// - `rules`
///
/// Exceptions to the above rules are the following states:
/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm
/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum DatabaseState {
/// Database is known but nothing is loaded.
Known {
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
db_name: DatabaseName<'static>,
},
Known { db_name: DatabaseName<'static> },
/// Rules are loaded
RulesLoaded {
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
rules: Arc<DatabaseRules>,
},
RulesLoaded { rules: Arc<DatabaseRules> },
/// Catalog is loaded but data from sequencers / write buffers is not yet replayed.
Replay { db: Arc<Db> },
@ -465,24 +448,6 @@ impl DatabaseState {
}
}
fn object_store(&self) -> Arc<ObjectStore> {
match self {
DatabaseState::Known { object_store, .. } => Arc::clone(object_store),
DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store),
DatabaseState::Replay { db, .. } => Arc::clone(&db.store),
DatabaseState::Initialized { db, .. } => Arc::clone(&db.store),
}
}
fn server_id(&self) -> ServerId {
match self {
DatabaseState::Known { server_id, .. } => *server_id,
DatabaseState::RulesLoaded { server_id, .. } => *server_id,
DatabaseState::Replay { db, .. } => db.server_id,
DatabaseState::Initialized { db, .. } => db.server_id,
}
}
fn rules(&self) -> Option<Arc<DatabaseRules>> {
match self {
DatabaseState::Known { .. } => None,
@ -548,12 +513,12 @@ impl<'a> DatabaseHandle<'a> {
/// Get object store.
pub fn object_store(&self) -> Arc<ObjectStore> {
self.state().object_store()
Arc::clone(&self.config.object_store)
}
/// Get server ID.
pub fn server_id(&self) -> ServerId {
self.state().server_id()
self.config.server_id
}
/// Get metrics registry.
@ -592,32 +557,26 @@ impl<'a> DatabaseHandle<'a> {
/// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded).
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> {
match self.state().as_ref() {
DatabaseState::Known {
object_store,
exec,
server_id,
db_name,
} => {
if db_name != &rules.name {
return Err(Error::RulesDatabaseNameMismatch {
actual: rules.name.to_string(),
expected: db_name.to_string(),
});
}
DatabaseState::Known { db_name } => {
ensure!(
db_name == &rules.name,
RulesDatabaseNameMismatch {
actual: rules.name,
expected: db_name,
}
);
self.state = Some(Arc::new(DatabaseState::RulesLoaded {
object_store: Arc::clone(&object_store),
exec: Arc::clone(&exec),
server_id: *server_id,
rules: Arc::new(rules),
}));
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::Known,
}),
}
.fail(),
}
}
@ -629,16 +588,11 @@ impl<'a> DatabaseHandle<'a> {
write_buffer: Option<WriteBufferConfig>,
) -> Result<()> {
match self.state().as_ref() {
DatabaseState::RulesLoaded {
object_store,
exec,
server_id,
rules,
} => {
DatabaseState::RulesLoaded { rules } => {
let database_to_commit = DatabaseToCommit {
server_id: *server_id,
object_store: Arc::clone(&object_store),
exec: Arc::clone(&exec),
server_id: self.config.server_id,
object_store: Arc::clone(&self.config.object_store),
exec: Arc::clone(&self.config.exec),
preserved_catalog,
catalog,
rules: Arc::clone(&rules),
@ -650,10 +604,11 @@ impl<'a> DatabaseHandle<'a> {
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::RulesLoaded,
}),
}
.fail(),
}
}
@ -663,7 +618,7 @@ impl<'a> DatabaseHandle<'a> {
DatabaseState::Replay { db } => {
if self.config.shutdown.is_cancelled() {
error!("server is shutting down");
return Err(Error::ServerShuttingDown);
return ServerShuttingDown.fail();
}
let shutdown = self.config.shutdown.child_token();
@ -686,10 +641,11 @@ impl<'a> DatabaseHandle<'a> {
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::Replay,
}),
}
.fail(),
}
}
}
@ -730,40 +686,32 @@ mod test {
use super::*;
use std::num::NonZeroU32;
fn make_config(remote_template: Option<RemoteTemplate>) -> Config {
let store = Arc::new(ObjectStore::new_in_memory());
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&store),
Arc::new(Executor::new(1)),
server_id,
Arc::clone(&metric_registry),
remote_template,
)
}
#[tokio::test]
async fn create_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
// getting handle while DB is reserved => fails
{
let _db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let _db_reservation = config.create_db(name.clone()).unwrap();
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseReserved { .. }));
let err = config.block_db(name.clone()).unwrap_err();
@ -775,14 +723,7 @@ mod test {
// name in rules must match reserved name
{
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
DatabaseName::new("bar").unwrap(),
)
.unwrap();
let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
let err = db_reservation
.advance_rules_loaded(rules.clone())
@ -795,14 +736,7 @@ mod test {
// handle.abort just works (aka does not mess up the transaction afterwards)
{
let db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
DatabaseName::new("bar").unwrap(),
)
.unwrap();
let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
db_reservation.abort();
}
@ -812,21 +746,14 @@ mod test {
// create DB successfully
{
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let mut db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.advance_rules_loaded(rules).unwrap();
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -866,14 +793,7 @@ mod test {
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// create DB as second time => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// block fully initialized DB => fail
@ -888,40 +808,18 @@ mod test {
async fn recover_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
// create DB but don't continue with rules loaded (e.g. because the rules file is broken)
{
let db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.commit();
}
assert!(config.has_uninitialized_database(&name));
// create DB while it is uninitialized => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// recover an unknown DB => fail
@ -935,19 +833,19 @@ mod test {
let mut db_reservation = config.recover_db(name.clone()).unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_none());
db_reservation.advance_rules_loaded(rules).unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -958,13 +856,13 @@ mod test {
.unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
db_reservation.advance_init().unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
db_reservation.commit();
@ -978,14 +876,7 @@ mod test {
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// create recovered DB => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// block recovered DB => fail
@ -1000,28 +891,13 @@ mod test {
async fn block_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
// block DB
let handle = config.block_db(name.clone()).unwrap();
// create while blocked => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseReserved { .. }));
// recover while blocked => fail
@ -1034,14 +910,7 @@ mod test {
// unblock => DB can be created
drop(handle);
config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
config.create_db(name.clone()).unwrap();
// cleanup
config.drain().await
@ -1051,20 +920,12 @@ mod test {
async fn test_db_drop() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -1072,14 +933,7 @@ mod test {
.unwrap();
// create DB
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let mut db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.advance_rules_loaded(rules).unwrap();
db_reservation
.advance_replay(preserved_catalog, catalog, None)
@ -1126,12 +980,7 @@ mod test {
#[test]
fn resolve_remote() {
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
Some(RemoteTemplate::new("http://iox-query-{id}:8082")),
);
let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082")));
let server_id = ServerId::new(NonZeroU32::new(42).unwrap());
let remote = config.resolve_remote(server_id);

View File

@ -50,7 +50,7 @@ use std::{
time::{Duration, Instant},
};
use write_buffer::config::WriteBufferConfig;
use write_buffer::core::WriteBufferError;
use write_buffer::core::{FetchHighWatermark, WriteBufferError};
pub mod access;
pub mod catalog;
@ -144,6 +144,91 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Metrics for data ingest via write buffer.
#[derive(Debug)]
struct WriteBufferIngestMetrics {
/// Metrics domain
domain: Arc<metrics::Domain>,
}
impl WriteBufferIngestMetrics {
fn new(domain: Arc<metrics::Domain>) -> Self {
Self { domain }
}
fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics {
let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())];
let red = self
.domain
.register_red_metric_with_labels(Some("ingest"), labels.clone());
let bytes_read = self.domain.register_counter_metric_with_labels(
"read",
Some("bytes"),
"Bytes read from sequencer",
labels.clone(),
);
let last_sequence_number = self.domain.register_gauge_metric_with_labels(
"last_sequence_number",
None,
"Last consumed sequence number (e.g. Kafka offset)",
&labels,
);
let sequence_number_lag = self.domain.register_gauge_metric_with_labels(
"sequence_number_lag",
None,
"The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed sequence number",
&labels,
);
let last_min_ts = self.domain.register_gauge_metric_with_labels(
"last_min_ts",
None,
"Minimum timestamp of last write as unix timestamp in nanoseconds",
&labels,
);
let last_max_ts = self.domain.register_gauge_metric_with_labels(
"last_max_ts",
None,
"Maximum timestamp of last write as unix timestamp in nanoseconds",
&labels,
);
SequencerMetrics {
red,
bytes_read,
last_sequence_number,
sequence_number_lag,
last_min_ts,
last_max_ts,
}
}
}
/// Metrics for a single sequencer.
#[derive(Debug)]
struct SequencerMetrics {
/// Metrics for tracking ingest.
red: metrics::RedMetric,
/// Bytes read from sequencer.
///
/// This metric is independent of the success / error state of the entries.
bytes_read: metrics::Counter,
/// Last consumed sequence number (e.g. Kafka offset).
last_sequence_number: metrics::Gauge,
// The difference between the last sequence number available (e.g. Kafka offset) and (= minus) the last consumed
// sequence number.
sequence_number_lag: metrics::Gauge,
/// Minimum timestamp of last write as unix timestamp in nanoseconds.
last_min_ts: metrics::Gauge,
/// Maximum timestamp of last write as unix timestamp in nanoseconds.
last_max_ts: metrics::Gauge,
}
/// This is the main IOx Database object. It is the root object of any
/// specific InfluxDB IOx instance
///
@ -203,10 +288,10 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
pub struct Db {
rules: RwLock<Arc<DatabaseRules>>,
pub server_id: ServerId, // this is also the Query Server ID
server_id: ServerId, // this is also the Query Server ID
/// Interface to use for persistence
pub store: Arc<ObjectStore>,
store: Arc<ObjectStore>,
/// Executor for running queries
exec: Arc<Executor>,
@ -248,8 +333,8 @@ pub struct Db {
/// Metric labels
metric_labels: Vec<KeyValue>,
/// Metrics for tracking the number of errors that occur while ingesting data
ingest_errors: metrics::Counter,
/// Ingest metrics
ingest_metrics: WriteBufferIngestMetrics,
/// Optionally connect to a write buffer for either buffering writes or reading buffered writes
write_buffer: Option<WriteBufferConfig>,
@ -285,9 +370,8 @@ impl Db {
let metric_labels = database_to_commit.catalog.metric_labels.clone();
let ingest_domain =
metrics_registry.register_domain_with_labels("ingest", metric_labels.clone());
let ingest_errors =
ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest");
metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone());
let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain));
let catalog = Arc::new(database_to_commit.catalog);
@ -316,7 +400,7 @@ impl Db {
worker_iterations_lifecycle: AtomicUsize::new(0),
worker_iterations_cleanup: AtomicUsize::new(0),
metric_labels,
ingest_errors,
ingest_metrics,
write_buffer: database_to_commit.write_buffer,
cleanup_lock: Default::default(),
}
@ -333,13 +417,40 @@ impl Db {
}
/// Updates the database rules
pub fn update_db_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
pub fn update_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
where
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E>,
{
let mut rules = self.rules.write();
let new_rules = Arc::new(update(rules.as_ref().clone())?);
*rules = Arc::clone(&new_rules);
let (late_arrive_window_updated, new_rules) = {
let mut rules = self.rules.write();
info!(db_name=%rules.name, "updating rules for database");
let new_rules = Arc::new(update(rules.as_ref().clone())?);
let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds
!= new_rules.lifecycle_rules.late_arrive_window_seconds;
*rules = Arc::clone(&new_rules);
(late_arrive_window_updated, new_rules)
};
if late_arrive_window_updated {
// Hold a read lock to prevent concurrent modification and
// use values from re-acquired read guard
let current = self.rules.read();
// Update windows
let partitions = self.catalog.partitions();
for partition in &partitions {
let mut partition = partition.write();
let addr = partition.addr().clone();
if let Some(windows) = partition.persistence_windows_mut() {
info!(partition=%addr, "updating persistence windows");
windows.set_late_arrival_period(Duration::from_secs(
current.lifecycle_rules.late_arrive_window_seconds.get() as u64,
))
}
}
}
Ok(new_rules)
}
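For reference, a minimal caller-side sketch of the renamed `update_rules` API. This is illustrative only: the `db: &Db` handle, the 600-second window, the `NonZeroU32` field type, and the use of `Infallible` are assumptions layered on top of the diff:

use std::{convert::Infallible, num::NonZeroU32};

// Widen the late-arrival window; as implemented above, `update_rules` then
// pushes the new period into every partition's persistence windows.
let new_rules = db
    .update_rules::<_, Infallible>(|mut rules| {
        rules.lifecycle_rules.late_arrive_window_seconds = NonZeroU32::new(600).unwrap();
        Ok(rules)
    })
    .expect("closure cannot fail");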
@ -656,9 +767,17 @@ impl Db {
// streaming from the write buffer loop
async {
if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer {
let mut write_buffer = write_buffer
.try_lock()
.expect("no streams should exist at this point");
let mut futures = vec![];
for (_sequencer_id, stream) in write_buffer.streams() {
let fut = self.stream_in_sequenced_entries(stream);
for (sequencer_id, stream) in write_buffer.streams() {
let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id);
let fut = self.stream_in_sequenced_entries(
stream.stream,
stream.fetch_high_watermark,
metrics,
);
futures.push(fut);
}
@ -675,32 +794,122 @@ impl Db {
/// This is used to take entries from a `Stream` and put them in the mutable buffer, such as
/// streaming entries from a write buffer.
async fn stream_in_sequenced_entries(
&self,
stream: BoxStream<'_, Result<SequencedEntry, WriteBufferError>>,
async fn stream_in_sequenced_entries<'a>(
&'a self,
mut stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
f_mark: FetchHighWatermark<'a>,
mut metrics: SequencerMetrics,
) {
stream
.for_each(|sequenced_entry_result| async {
let sequenced_entry = match sequenced_entry_result {
Ok(sequenced_entry) => sequenced_entry,
Err(e) => {
debug!(?e, "Error converting write buffer data to SequencedEntry");
self.ingest_errors.add(1);
return;
}
};
let mut watermark_last_updated: Option<Instant> = None;
let mut watermark = 0;
let sequenced_entry = Arc::new(sequenced_entry);
while let Some(sequenced_entry_result) = stream.next().await {
let red_observation = metrics.red.observation();
if let Err(e) = self.store_sequenced_entry(sequenced_entry) {
// get entry from sequencer
let sequenced_entry = match sequenced_entry_result {
Ok(sequenced_entry) => sequenced_entry,
Err(e) => {
debug!(?e, "Error converting write buffer data to SequencedEntry");
red_observation.client_error();
continue;
}
};
let sequenced_entry = Arc::new(sequenced_entry);
// store entry
match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) {
Ok(_) => {
red_observation.ok();
}
Err(e) => {
debug!(
?e,
"Error storing SequencedEntry from write buffer in database"
);
self.ingest_errors.add(1);
red_observation.error();
}
})
.await
}
// maybe update sequencer watermark
// We are not updating this watermark every round because asking the sequencer for that watermark can be
// quite expensive.
if watermark_last_updated
.map(|ts| ts.elapsed() > Duration::from_secs(10))
.unwrap_or(true)
{
match f_mark().await {
Ok(w) => {
watermark = w;
}
Err(e) => {
debug!(%e, "Error while reading sequencer watermark")
}
}
watermark_last_updated = Some(Instant::now());
}
// update:
// - bytes read
// - last sequence number
// - lag
// - min ts
// - max ts
let sequence = sequenced_entry
.sequence()
.expect("entry from write buffer must be sequenced");
let entry = sequenced_entry.entry();
metrics.bytes_read.add(entry.data().len() as u64);
metrics
.last_sequence_number
.set(sequence.number as usize, &[]);
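// Note: the trailing `saturating_sub(1)` below assumes the watermark reported
// by the sequencer is the *next* sequence number to be assigned, so a reader
// that has consumed everything reports a lag of 0.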
metrics.sequence_number_lag.set(
watermark.saturating_sub(sequence.number).saturating_sub(1) as usize,
&[],
);
if let Some(min_ts) = entry
.partition_writes()
.map(|partition_writes| {
partition_writes
.iter()
.filter_map(|partition_write| {
partition_write
.table_batches()
.iter()
.filter_map(|table_batch| table_batch.min_max_time().ok())
.map(|(min, _max)| min)
.max()
})
.min()
})
.flatten()
{
metrics
.last_min_ts
.set(min_ts.timestamp_nanos() as usize, &[]);
}
if let Some(max_ts) = entry
.partition_writes()
.map(|partition_writes| {
partition_writes
.iter()
.filter_map(|partition_write| {
partition_write
.table_batches()
.iter()
.filter_map(|table_batch| table_batch.min_max_time().ok())
.map(|(_min, max)| max)
.max()
})
.max()
})
.flatten()
{
metrics
.last_max_ts
.set(max_ts.timestamp_nanos() as usize, &[]);
}
}
}
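The throttling pattern above (refresh an expensive value at most every 10 seconds, tracked via an `Option<Instant>`) can be shown in isolation. Below is a self-contained sketch with a hypothetical `fetch_watermark` helper standing in for `f_mark`; it only restates the pattern, not the production code:

    use std::time::{Duration, Instant};

    // Hypothetical stand-in for the expensive sequencer call.
    async fn fetch_watermark() -> Result<u64, String> {
        Ok(42)
    }

    // Refresh `watermark` only if it was never fetched or the last fetch is
    // older than 10 seconds; always record the attempt so errors do not lead
    // to hammering the sequencer.
    async fn maybe_refresh(last: &mut Option<Instant>, watermark: &mut u64) {
        if last
            .map(|ts| ts.elapsed() > Duration::from_secs(10))
            .unwrap_or(true)
        {
            if let Ok(w) = fetch_watermark().await {
                *watermark = w;
            }
            *last = Some(Instant::now());
        }
    }

In the actual loop this state lives in locals of `stream_in_sequenced_entries`; factoring it into a helper here is purely for illustration.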
async fn cleanup_unreferenced_parquet_files(
@ -1208,17 +1417,27 @@ mod tests {
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
let entry = lp_to_entry("cpu bar=1 10");
let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1);
write_buffer_state
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap());
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
write_buffer_state.push_entry(
SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10"))
.unwrap(),
);
write_buffer_state.push_entry(
SequencedEntry::new_from_sequence(
Sequence::new(0, 7),
lp_to_entry("cpu bar=2 20\ncpu bar=3 30"),
)
.unwrap(),
);
let write_buffer = MockBufferForReading::new(write_buffer_state);
let db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
let test_db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await
.db;
.await;
let db = test_db.db;
// do: start background task loop
let shutdown: CancellationToken = Default::default();
@ -1247,18 +1466,84 @@ mod tests {
tokio::time::sleep(Duration::from_millis(100)).await;
}
// check: metrics
// We need to do that BEFORE shutting down the background loop because the gauges would be dropped and reset otherwise
let metrics = test_db.metric_registry;
metrics
.has_metric_family("write_buffer_ingest_requests_total")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
("status", "ok"),
])
.counter()
.eq(2.0)
.unwrap();
metrics
.has_metric_family("write_buffer_read_bytes_total")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.counter()
.eq(528.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_sequence_number")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(7.0)
.unwrap();
metrics
.has_metric_family("write_buffer_sequence_number_lag")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(0.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_min_ts")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(20.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_max_ts")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(30.0)
.unwrap();
// do: stop background task loop
shutdown.cancel();
join_handle.await.unwrap();
// check: the expected results should be there
let batches = run_query(db, "select * from cpu").await;
let batches = run_query(db, "select * from cpu order by time").await;
let expected = vec![
"+-----+-------------------------------+",
"| bar | time |",
"+-----+-------------------------------+",
"| 1 | 1970-01-01 00:00:00.000000010 |",
"| 2 | 1970-01-01 00:00:00.000000020 |",
"| 3 | 1970-01-01 00:00:00.000000030 |",
"+-----+-------------------------------+",
];
assert_batches_eq!(expected, &batches);
@ -1271,10 +1556,12 @@ mod tests {
String::from("Something bad happened on the way to creating a SequencedEntry").into(),
0,
);
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
let write_buffer = MockBufferForReading::new(write_buffer_state);
let test_db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await;
@ -1291,11 +1578,16 @@ mod tests {
// check: after a while the error should be reported in the database's metrics
let t_0 = Instant::now();
loop {
let family = metrics.try_has_metric_family("ingest_errors_total");
let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total");
if let Ok(metric) = family {
if metric
.with_labels(&[("db_name", "placeholder"), ("svr_id", "1")])
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
("status", "client_error"),
])
.counter()
.eq(1.0)
.is_ok()
@ -2259,10 +2551,12 @@ mod tests {
);
write_buffer_state
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap());
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
let write_buffer = MockBufferForReading::new(write_buffer_state);
let db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await
.db;

View File

@ -6,15 +6,16 @@ use hashbrown::{HashMap, HashSet};
use data_types::chunk_metadata::ChunkSummary;
use data_types::chunk_metadata::DetailedChunkSummary;
use data_types::partition_metadata::{PartitionSummary, TableSummary};
use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary};
use internal_types::schema::Schema;
use snafu::Snafu;
use snafu::{OptionExt, Snafu};
use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
use self::chunk::CatalogChunk;
use self::metrics::CatalogMetrics;
use self::partition::Partition;
use self::table::Table;
use data_types::write_summary::WriteSummary;
pub mod chunk;
mod metrics;
@ -135,11 +136,8 @@ impl Catalog {
/// Get a specific table by name, returning `None` if there is no such table
pub fn table(&self, table_name: impl AsRef<str>) -> Result<MappedRwLockReadGuard<'_, Table>> {
let table_name = table_name.as_ref();
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err(
|_| Error::TableNotFound {
table: table_name.to_string(),
},
)
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name))
.map_err(|_| TableNotFound { table: table_name }.build())
}
/// Get a specific partition by name, returning an error if it can't be found
@ -154,9 +152,9 @@ impl Catalog {
self.table(table_name)?
.partition(partition_key)
.cloned()
.ok_or_else(|| Error::PartitionNotFound {
partition: partition_key.to_string(),
table: table_name.to_string(),
.context(PartitionNotFound {
partition: partition_key,
table: table_name,
})
}
@ -174,9 +172,9 @@ impl Catalog {
.read()
.chunk(chunk_id)
.cloned()
.ok_or_else(|| Error::ChunkNotFound {
partition: partition_key.to_string(),
table: table_name.to_string(),
.context(ChunkNotFound {
partition: partition_key,
table: table_name,
chunk_id,
})
}
@ -228,6 +226,23 @@ impl Catalog {
.collect()
}
/// Returns a list of persistence window summaries for each partition
pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> {
let mut summaries = Vec::new();
let tables = self.tables.read();
for table in tables.values() {
for partition in table.partitions() {
let partition = partition.read();
if let Some(w) = partition.persistence_windows() {
for summary in w.summaries() {
summaries.push((partition.addr().clone(), summary))
}
}
}
}
summaries
}
pub fn chunk_summaries(&self) -> Vec<ChunkSummary> {
let partition_key = None;
let table_names = TableNameFilter::AllTables;

View File

@ -5,7 +5,7 @@ use std::{
use data_types::partition_metadata;
use partition_metadata::TableSummary;
use snafu::{ResultExt, Snafu};
use snafu::{OptionExt, ResultExt, Snafu};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
@ -417,7 +417,7 @@ impl QueryChunk for DbChunk {
// column out to get the set of values.
let values = values
.remove(column_name)
.ok_or_else(|| Error::ReadBufferError {
.with_context(|| ReadBufferError {
chunk_id: self.id(),
msg: format!(
"failed to find column_name {:?} in results of tag_values",

View File

@ -7,38 +7,31 @@
//!
//! For example `SELECT * FROM system.chunks`
use std::convert::AsRef;
use std::any::Any;
use std::sync::Arc;
use std::{any::Any, collections::HashMap};
use chrono::{DateTime, Utc};
use arrow::{
array::{
ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray,
UInt32Array, UInt32Builder, UInt64Array, UInt64Builder,
},
datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit},
datatypes::{Field, Schema, SchemaRef},
error::Result,
record_batch::RecordBatch,
};
use data_types::{
chunk_metadata::{ChunkSummary, DetailedChunkSummary},
error::ErrorLogger,
job::Job,
partition_metadata::PartitionSummary,
};
use chrono::{DateTime, Utc};
use datafusion::{
catalog::schema::SchemaProvider,
datasource::{datasource::Statistics, TableProvider},
error::{DataFusionError, Result as DataFusionResult},
physical_plan::{memory::MemoryExec, ExecutionPlan},
};
use tracker::TaskTracker;
use crate::JobRegistry;
use super::catalog::Catalog;
use crate::JobRegistry;
use data_types::partition_metadata::TableSummary;
mod chunks;
mod columns;
mod operations;
mod persistence;
// The IOx system schema
pub const SYSTEM_SCHEMA: &str = "system";
@ -47,12 +40,14 @@ const CHUNKS: &str = "chunks";
const COLUMNS: &str = "columns";
const CHUNK_COLUMNS: &str = "chunk_columns";
const OPERATIONS: &str = "operations";
const PERSISTENCE_WINDOWS: &str = "persistence_windows";
pub struct SystemSchemaProvider {
chunks: Arc<dyn TableProvider>,
columns: Arc<dyn TableProvider>,
chunk_columns: Arc<dyn TableProvider>,
operations: Arc<dyn TableProvider>,
persistence_windows: Arc<dyn TableProvider>,
}
impl std::fmt::Debug for SystemSchemaProvider {
@ -67,22 +62,26 @@ impl SystemSchemaProvider {
pub fn new(db_name: impl Into<String>, catalog: Arc<Catalog>, jobs: Arc<JobRegistry>) -> Self {
let db_name = db_name.into();
let chunks = Arc::new(SystemTableProvider {
inner: ChunksTable::new(Arc::clone(&catalog)),
inner: chunks::ChunksTable::new(Arc::clone(&catalog)),
});
let columns = Arc::new(SystemTableProvider {
inner: ColumnsTable::new(Arc::clone(&catalog)),
inner: columns::ColumnsTable::new(Arc::clone(&catalog)),
});
let chunk_columns = Arc::new(SystemTableProvider {
inner: ChunkColumnsTable::new(catalog),
inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)),
});
let operations = Arc::new(SystemTableProvider {
inner: OperationsTable::new(db_name, jobs),
inner: operations::OperationsTable::new(db_name, jobs),
});
let persistence_windows = Arc::new(SystemTableProvider {
inner: persistence::PersistenceWindowsTable::new(catalog),
});
Self {
chunks,
columns,
chunk_columns,
operations,
persistence_windows,
}
}
}
@ -98,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider {
COLUMNS.to_string(),
CHUNK_COLUMNS.to_string(),
OPERATIONS.to_string(),
PERSISTENCE_WINDOWS.to_string(),
]
}
@ -107,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider {
COLUMNS => Some(Arc::clone(&self.columns)),
CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)),
OPERATIONS => Some(Arc::clone(&self.operations)),
PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)),
_ => None,
}
}
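Once wired into the schema provider as above, the new table should be reachable through the same SQL path as the other system tables. A hedged example using the `run_query` test helper seen earlier in this diff (the `db` handle and helper are assumed to be set up as in the write buffer tests):

    let batches = run_query(db, "select * from system.persistence_windows").await;
    // Expected columns, per `persistence_windows_schema()` further down:
    // partition_key, table_name, row_count, time_of_first_write,
    // time_of_last_write, min_timestamp, max_timestamp.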
@ -162,407 +163,6 @@ fn time_to_ts(time: Option<DateTime<Utc>>) -> Option<i64> {
time.map(|ts| ts.timestamp_nanos())
}
/// Implementation of system.chunks table
#[derive(Debug)]
struct ChunksTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunksTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunksTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
.log_if_error("system.chunks table")
}
}
fn chunk_summaries_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("id", DataType::UInt32, false),
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("lifecycle_action", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, false),
Field::new("object_store_bytes", DataType::UInt64, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), true),
Field::new("time_of_last_write", ts.clone(), true),
Field::new("time_closed", ts, true),
]))
}
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
let partition_key = chunks
.iter()
.map(|c| Some(c.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|c| Some(c.table_name.as_ref()))
.collect::<StringArray>();
let storage = chunks
.iter()
.map(|c| Some(c.storage.as_str()))
.collect::<StringArray>();
let lifecycle_action = chunks
.iter()
.map(|c| c.lifecycle_action.map(|a| a.name()))
.collect::<StringArray>();
let memory_bytes = chunks
.iter()
.map(|c| Some(c.memory_bytes as u64))
.collect::<UInt64Array>();
let object_store_bytes = chunks
.iter()
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
.collect::<UInt64Array>();
let row_counts = chunks
.iter()
.map(|c| Some(c.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|c| c.time_of_first_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|c| c.time_of_last_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_closed = chunks
.iter()
.map(|c| c.time_closed)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(id),
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(storage),
Arc::new(lifecycle_action),
Arc::new(memory_bytes),
Arc::new(object_store_bytes),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(time_closed),
],
)
}
/// Implementation of `system.columns` system table
#[derive(Debug)]
struct ColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ColumnsTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: partition_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
.log_if_error("system.columns table")
}
}
fn partition_summaries_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("column_type", DataType::Utf8, false),
Field::new("influxdb_type", DataType::Utf8, true),
]))
}
fn from_partition_summaries(
schema: SchemaRef,
partitions: Vec<PartitionSummary>,
) -> Result<RecordBatch> {
// Assume each partition has roughly 5 tables with 5 columns
let row_estimate = partitions.len() * 25;
let mut partition_key = StringBuilder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut column_type = StringBuilder::new(row_estimate);
let mut influxdb_type = StringBuilder::new(row_estimate);
// Note no rows are produced for partitions with no tables, or
// tables with no columns: There are other tables to list tables
// and columns
for partition in partitions {
let table = partition.table;
for column in table.columns {
partition_key.append_value(&partition.key)?;
table_name.append_value(&table.name)?;
column_name.append_value(&column.name)?;
column_type.append_value(column.type_name())?;
if let Some(t) = &column.influxdb_type {
influxdb_type.append_value(t.as_str())?;
} else {
influxdb_type.append_null()?;
}
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(column_type.finish()),
Arc::new(influxdb_type.finish()),
],
)
}
/// Implementation of system.column_chunks table
#[derive(Debug)]
struct ChunkColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunkColumnsTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_columns_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunkColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
.log_if_error("system.column_chunks table")
}
}
fn chunk_columns_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("chunk_id", DataType::UInt32, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, true),
Field::new("min_value", DataType::Utf8, true),
Field::new("max_value", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, true),
]))
}
fn assemble_chunk_columns(
schema: SchemaRef,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
) -> Result<RecordBatch> {
/// Builds an index from column_name -> size
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
summary
.columns
.iter()
.map(|column_summary| {
(
column_summary.name.as_ref(),
column_summary.memory_bytes as u64,
)
})
.collect()
}
// Assume each chunk has roughly 5 columns
let row_estimate = chunk_summaries.len() * 5;
let mut partition_key = StringBuilder::new(row_estimate);
let mut chunk_id = UInt32Builder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut storage = StringBuilder::new(row_estimate);
let mut row_count = UInt64Builder::new(row_estimate);
let mut min_values = StringBuilder::new(row_estimate);
let mut max_values = StringBuilder::new(row_estimate);
let mut memory_bytes = UInt64Builder::new(row_estimate);
// Note no rows are produced for partitions with no chunks, or
// tables with no partitions: There are other tables to list tables
// and columns
for (table_summary, chunk_summary) in chunk_summaries {
let mut column_index = make_column_index(&chunk_summary);
let storage_value = chunk_summary.inner.storage.as_str();
for column in &table_summary.columns {
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
chunk_id.append_value(chunk_summary.inner.id)?;
table_name.append_value(&chunk_summary.inner.table_name)?;
column_name.append_value(&column.name)?;
storage.append_value(storage_value)?;
row_count.append_value(column.count())?;
if let Some(v) = column.stats.min_as_str() {
min_values.append_value(v)?;
} else {
min_values.append(false)?;
}
if let Some(v) = column.stats.max_as_str() {
max_values.append_value(v)?;
} else {
max_values.append(false)?;
}
let size = column_index.remove(column.name.as_str());
memory_bytes.append_option(size)?;
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(chunk_id.finish()),
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(storage.finish()),
Arc::new(row_count.finish()),
Arc::new(min_values.finish()),
Arc::new(max_values.finish()),
Arc::new(memory_bytes.finish()),
],
)
}
/// Implementation of system.operations table
#[derive(Debug)]
struct OperationsTable {
schema: SchemaRef,
db_name: String,
jobs: Arc<JobRegistry>,
}
impl OperationsTable {
fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
Self {
schema: operations_schema(),
db_name,
jobs,
}
}
}
impl IoxSystemTable for OperationsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
.log_if_error("system.operations table")
}
}
fn operations_schema() -> SchemaRef {
let ts = DataType::Time64(TimeUnit::Nanosecond);
Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("status", DataType::Utf8, true),
Field::new("cpu_time_used", ts.clone(), true),
Field::new("wall_time_used", ts, true),
Field::new("partition_key", DataType::Utf8, true),
Field::new("chunk_id", DataType::UInt32, true),
Field::new("description", DataType::Utf8, true),
]))
}
fn from_task_trackers(
schema: SchemaRef,
db_name: &str,
jobs: Vec<TaskTracker<Job>>,
) -> Result<RecordBatch> {
let jobs = jobs
.into_iter()
.filter(|job| job.metadata().db_name() == Some(db_name))
.collect::<Vec<_>>();
let ids = jobs
.iter()
.map(|job| Some(job.id().to_string()))
.collect::<StringArray>();
let statuses = jobs
.iter()
.map(|job| Some(job.get_status().name()))
.collect::<StringArray>();
let cpu_time_used = jobs
.iter()
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let wall_time_used = jobs
.iter()
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let partition_keys = jobs
.iter()
.map(|job| job.metadata().partition_key())
.collect::<StringArray>();
let chunk_ids = jobs
.iter()
.map(|job| job.metadata().chunk_id())
.collect::<UInt32Array>();
let descriptions = jobs
.iter()
.map(|job| Some(job.metadata().description()))
.collect::<StringArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(ids) as ArrayRef,
Arc::new(statuses),
Arc::new(cpu_time_used),
Arc::new(wall_time_used),
Arc::new(partition_keys),
Arc::new(chunk_ids),
Arc::new(descriptions),
],
)
}
/// Creates a DataFusion ExecutionPlan node that scans a single batch
/// of records.
fn scan_batch(
@ -605,141 +205,10 @@ fn scan_batch(
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{ArrayRef, UInt64Array};
use arrow_util::assert_batches_eq;
use chrono::NaiveDateTime;
use data_types::{
chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage},
partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary},
};
#[test]
fn test_from_chunk_summaries() {
let chunks = vec![
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 0,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: None,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(10, 0),
Utc,
)),
time_of_last_write: None,
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 1,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
memory_bytes: 23455,
object_store_bytes: 0,
row_count: 22,
time_of_first_write: None,
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(80, 0),
Utc,
)),
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 2,
storage: ChunkStorage::ObjectStoreOnly,
lifecycle_action: None,
memory_bytes: 1234,
object_store_bytes: 5678,
row_count: 33,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(100, 0),
Utc,
)),
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(200, 0),
Utc,
)),
time_closed: None,
},
];
let expected = vec![
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
];
let schema = chunk_summaries_schema();
let batch = from_chunk_summaries(schema, chunks).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
#[test]
fn test_from_partition_summaries() {
let partitions = vec![
PartitionSummary {
key: "p1".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Tag),
stats: Statistics::I64(StatValues::new_with_value(23)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
ColumnSummary {
name: "c3".to_string(),
influxdb_type: None,
stats: Statistics::String(StatValues::new_with_value(
"foo".to_string(),
)),
},
ColumnSummary {
name: "time".to_string(),
influxdb_type: Some(InfluxDbType::Timestamp),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
],
},
},
PartitionSummary {
key: "p3".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![],
},
},
];
let expected = vec![
"+---------------+------------+-------------+-------------+---------------+",
"| partition_key | table_name | column_name | column_type | influxdb_type |",
"+---------------+------------+-------------+-------------+---------------+",
"| p1 | t1 | c1 | I64 | Tag |",
"| p1 | t1 | c2 | I64 | Field |",
"| p1 | t1 | c3 | String | |",
"| p1 | t1 | time | I64 | Timestamp |",
"+---------------+------------+-------------+-------------+---------------+",
];
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
use super::*;
fn seq_array(start: u64, end: u64) -> ArrayRef {
Arc::new(UInt64Array::from_iter_values(start..end))
@ -820,130 +289,4 @@ mod tests {
err_string
);
}
#[test]
fn test_assemble_chunk_columns() {
let lifecycle_action = None;
let summaries = vec![
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::String(StatValues::new(
Some("bar".to_string()),
Some("foo".to_string()),
55,
)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
},
],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p1".into(),
table_name: "t1".into(),
id: 42,
storage: ChunkStorage::ReadBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![
ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 11,
},
ChunkColumnSummary {
name: "c2".into(),
memory_bytes: 12,
},
],
},
),
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t1".into(),
id: 43,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 100,
}],
},
),
(
Arc::new(TableSummary {
name: "t2".to_string(),
columns: vec![ColumnSummary {
name: "c3".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t2".into(),
id: 44,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c3".into(),
memory_bytes: 200,
}],
},
),
];
let expected = vec![
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
];
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,201 @@
use std::sync::Arc;
use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::chunk_metadata::ChunkSummary;
use data_types::error::ErrorLogger;
use crate::db::catalog::Catalog;
use crate::db::system_tables::{time_to_ts, IoxSystemTable};
/// Implementation of system.chunks table
#[derive(Debug)]
pub(super) struct ChunksTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunksTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunksTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
.log_if_error("system.chunks table")
}
}
fn chunk_summaries_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("id", DataType::UInt32, false),
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("lifecycle_action", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, false),
Field::new("object_store_bytes", DataType::UInt64, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), true),
Field::new("time_of_last_write", ts.clone(), true),
Field::new("time_closed", ts, true),
]))
}
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
let partition_key = chunks
.iter()
.map(|c| Some(c.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|c| Some(c.table_name.as_ref()))
.collect::<StringArray>();
let storage = chunks
.iter()
.map(|c| Some(c.storage.as_str()))
.collect::<StringArray>();
let lifecycle_action = chunks
.iter()
.map(|c| c.lifecycle_action.map(|a| a.name()))
.collect::<StringArray>();
let memory_bytes = chunks
.iter()
.map(|c| Some(c.memory_bytes as u64))
.collect::<UInt64Array>();
let object_store_bytes = chunks
.iter()
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
.collect::<UInt64Array>();
let row_counts = chunks
.iter()
.map(|c| Some(c.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|c| c.time_of_first_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|c| c.time_of_last_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_closed = chunks
.iter()
.map(|c| c.time_closed)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(id),
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(storage),
Arc::new(lifecycle_action),
Arc::new(memory_bytes),
Arc::new(object_store_bytes),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(time_closed),
],
)
}
#[cfg(test)]
mod tests {
use chrono::{DateTime, NaiveDateTime, Utc};
use arrow_util::assert_batches_eq;
use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage};
use super::*;
#[test]
fn test_from_chunk_summaries() {
let chunks = vec![
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 0,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: None,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(10, 0),
Utc,
)),
time_of_last_write: None,
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 1,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
memory_bytes: 23455,
object_store_bytes: 0,
row_count: 22,
time_of_first_write: None,
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(80, 0),
Utc,
)),
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 2,
storage: ChunkStorage::ObjectStoreOnly,
lifecycle_action: None,
memory_bytes: 1234,
object_store_bytes: 5678,
row_count: 33,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(100, 0),
Utc,
)),
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(200, 0),
Utc,
)),
time_closed: None,
},
];
let expected = vec![
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
];
let schema = chunk_summaries_schema();
let batch = from_chunk_summaries(schema, chunks).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,404 @@
use std::collections::HashMap;
use std::sync::Arc;
use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::chunk_metadata::DetailedChunkSummary;
use data_types::error::ErrorLogger;
use data_types::partition_metadata::{PartitionSummary, TableSummary};
use crate::db::catalog::Catalog;
use crate::db::system_tables::IoxSystemTable;
/// Implementation of `system.columns` system table
#[derive(Debug)]
pub(super) struct ColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ColumnsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: partition_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
.log_if_error("system.columns table")
}
}
fn partition_summaries_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("column_type", DataType::Utf8, false),
Field::new("influxdb_type", DataType::Utf8, true),
]))
}
fn from_partition_summaries(
schema: SchemaRef,
partitions: Vec<PartitionSummary>,
) -> Result<RecordBatch> {
// Assume each partition has roughly 5 tables with 5 columns
let row_estimate = partitions.len() * 25;
let mut partition_key = StringBuilder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut column_type = StringBuilder::new(row_estimate);
let mut influxdb_type = StringBuilder::new(row_estimate);
// Note no rows are produced for partitions with no tables, or
// tables with no columns: There are other tables to list tables
// and columns
for partition in partitions {
let table = partition.table;
for column in table.columns {
partition_key.append_value(&partition.key)?;
table_name.append_value(&table.name)?;
column_name.append_value(&column.name)?;
column_type.append_value(column.type_name())?;
if let Some(t) = &column.influxdb_type {
influxdb_type.append_value(t.as_str())?;
} else {
influxdb_type.append_null()?;
}
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(column_type.finish()),
Arc::new(influxdb_type.finish()),
],
)
}
/// Implementation of system.column_chunks table
#[derive(Debug)]
pub(super) struct ChunkColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunkColumnsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_columns_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunkColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
.log_if_error("system.column_chunks table")
}
}
fn chunk_columns_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("chunk_id", DataType::UInt32, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, true),
Field::new("min_value", DataType::Utf8, true),
Field::new("max_value", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, true),
]))
}
fn assemble_chunk_columns(
schema: SchemaRef,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
) -> Result<RecordBatch> {
/// Builds an index from column_name -> size
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
summary
.columns
.iter()
.map(|column_summary| {
(
column_summary.name.as_ref(),
column_summary.memory_bytes as u64,
)
})
.collect()
}
// Assume each chunk has roughly 5 columns
let row_estimate = chunk_summaries.len() * 5;
let mut partition_key = StringBuilder::new(row_estimate);
let mut chunk_id = UInt32Builder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut storage = StringBuilder::new(row_estimate);
let mut row_count = UInt64Builder::new(row_estimate);
let mut min_values = StringBuilder::new(row_estimate);
let mut max_values = StringBuilder::new(row_estimate);
let mut memory_bytes = UInt64Builder::new(row_estimate);
// Note no rows are produced for partitions with no chunks, or
// tables with no partitions: There are other tables to list tables
// and columns
for (table_summary, chunk_summary) in chunk_summaries {
let mut column_index = make_column_index(&chunk_summary);
let storage_value = chunk_summary.inner.storage.as_str();
for column in &table_summary.columns {
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
chunk_id.append_value(chunk_summary.inner.id)?;
table_name.append_value(&chunk_summary.inner.table_name)?;
column_name.append_value(&column.name)?;
storage.append_value(storage_value)?;
row_count.append_value(column.count())?;
if let Some(v) = column.stats.min_as_str() {
min_values.append_value(v)?;
} else {
min_values.append(false)?;
}
if let Some(v) = column.stats.max_as_str() {
max_values.append_value(v)?;
} else {
max_values.append(false)?;
}
let size = column_index.remove(column.name.as_str());
memory_bytes.append_option(size)?;
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(chunk_id.finish()),
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(storage.finish()),
Arc::new(row_count.finish()),
Arc::new(min_values.finish()),
Arc::new(max_values.finish()),
Arc::new(memory_bytes.finish()),
],
)
}
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary};
use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics};
use super::*;
#[test]
fn test_from_partition_summaries() {
let partitions = vec![
PartitionSummary {
key: "p1".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Tag),
stats: Statistics::I64(StatValues::new_with_value(23)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
ColumnSummary {
name: "c3".to_string(),
influxdb_type: None,
stats: Statistics::String(StatValues::new_with_value(
"foo".to_string(),
)),
},
ColumnSummary {
name: "time".to_string(),
influxdb_type: Some(InfluxDbType::Timestamp),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
],
},
},
PartitionSummary {
key: "p3".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![],
},
},
];
let expected = vec![
"+---------------+------------+-------------+-------------+---------------+",
"| partition_key | table_name | column_name | column_type | influxdb_type |",
"+---------------+------------+-------------+-------------+---------------+",
"| p1 | t1 | c1 | I64 | Tag |",
"| p1 | t1 | c2 | I64 | Field |",
"| p1 | t1 | c3 | String | |",
"| p1 | t1 | time | I64 | Timestamp |",
"+---------------+------------+-------------+-------------+---------------+",
];
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
#[test]
fn test_assemble_chunk_columns() {
let lifecycle_action = None;
let summaries = vec![
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::String(StatValues::new(
Some("bar".to_string()),
Some("foo".to_string()),
55,
)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
},
],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p1".into(),
table_name: "t1".into(),
id: 42,
storage: ChunkStorage::ReadBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![
ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 11,
},
ChunkColumnSummary {
name: "c2".into(),
memory_bytes: 12,
},
],
},
),
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t1".into(),
id: 43,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 100,
}],
},
),
(
Arc::new(TableSummary {
name: "t2".to_string(),
columns: vec![ColumnSummary {
name: "c3".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t2".into(),
id: 44,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c3".into(),
memory_bytes: 200,
}],
},
),
];
let expected = vec![
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
];
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,108 @@
use std::sync::Arc;
use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::error::ErrorLogger;
use data_types::job::Job;
use tracker::TaskTracker;
use crate::db::system_tables::IoxSystemTable;
use crate::JobRegistry;
/// Implementation of system.operations table
#[derive(Debug)]
pub(super) struct OperationsTable {
schema: SchemaRef,
db_name: String,
jobs: Arc<JobRegistry>,
}
impl OperationsTable {
pub(super) fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
Self {
schema: operations_schema(),
db_name,
jobs,
}
}
}
impl IoxSystemTable for OperationsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
.log_if_error("system.operations table")
}
}
fn operations_schema() -> SchemaRef {
let ts = DataType::Time64(TimeUnit::Nanosecond);
Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("status", DataType::Utf8, true),
Field::new("cpu_time_used", ts.clone(), true),
Field::new("wall_time_used", ts, true),
Field::new("partition_key", DataType::Utf8, true),
Field::new("chunk_id", DataType::UInt32, true),
Field::new("description", DataType::Utf8, true),
]))
}
fn from_task_trackers(
schema: SchemaRef,
db_name: &str,
jobs: Vec<TaskTracker<Job>>,
) -> Result<RecordBatch> {
let jobs = jobs
.into_iter()
.filter(|job| job.metadata().db_name() == Some(db_name))
.collect::<Vec<_>>();
let ids = jobs
.iter()
.map(|job| Some(job.id().to_string()))
.collect::<StringArray>();
let statuses = jobs
.iter()
.map(|job| Some(job.get_status().name()))
.collect::<StringArray>();
let cpu_time_used = jobs
.iter()
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let wall_time_used = jobs
.iter()
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let partition_keys = jobs
.iter()
.map(|job| job.metadata().partition_key())
.collect::<StringArray>();
let chunk_ids = jobs
.iter()
.map(|job| job.metadata().chunk_id())
.collect::<UInt32Array>();
let descriptions = jobs
.iter()
.map(|job| Some(job.metadata().description()))
.collect::<StringArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(ids) as ArrayRef,
Arc::new(statuses),
Arc::new(cpu_time_used),
Arc::new(wall_time_used),
Arc::new(partition_keys),
Arc::new(chunk_ids),
Arc::new(descriptions),
],
)
}

View File

@ -0,0 +1,154 @@
use std::sync::Arc;
use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::error::ErrorLogger;
use data_types::partition_metadata::PartitionAddr;
use data_types::write_summary::WriteSummary;
use crate::db::catalog::Catalog;
use crate::db::system_tables::IoxSystemTable;
/// Implementation of system.persistence_windows table
#[derive(Debug)]
pub(super) struct PersistenceWindowsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl PersistenceWindowsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: persistence_windows_schema(),
catalog,
}
}
}
impl IoxSystemTable for PersistenceWindowsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_write_summaries(self.schema(), self.catalog.persistence_summaries())
.log_if_error("system.persistence_windows table")
}
}
fn persistence_windows_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), false),
Field::new("time_of_last_write", ts.clone(), false),
Field::new("min_timestamp", ts.clone(), false),
Field::new("max_timestamp", ts, false),
]))
}
fn from_write_summaries(
schema: SchemaRef,
chunks: Vec<(PartitionAddr, WriteSummary)>,
) -> Result<RecordBatch> {
let partition_key = chunks
.iter()
.map(|(addr, _)| Some(addr.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|(addr, _)| Some(addr.table_name.as_ref()))
.collect::<StringArray>();
let row_counts = chunks
.iter()
.map(|(_, w)| Some(w.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let min_timestamp = chunks
.iter()
.map(|(_, w)| Some(w.min_timestamp.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let max_timestamp = chunks
.iter()
.map(|(_, w)| Some(w.max_timestamp.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(min_timestamp),
Arc::new(max_timestamp),
],
)
}
#[cfg(test)]
mod tests {
use chrono::{TimeZone, Utc};
use arrow_util::assert_batches_eq;
use super::*;
#[test]
fn test_from_write_summaries() {
let addr = PartitionAddr {
db_name: Arc::from("db"),
table_name: Arc::from("table"),
partition_key: Arc::from("partition"),
};
let summaries = vec![
(
addr.clone(),
WriteSummary {
time_of_first_write: Utc.timestamp_nanos(0),
time_of_last_write: Utc.timestamp_nanos(20),
min_timestamp: Utc.timestamp_nanos(50),
max_timestamp: Utc.timestamp_nanos(60),
row_count: 320,
},
),
(
addr,
WriteSummary {
time_of_first_write: Utc.timestamp_nanos(6),
time_of_last_write: Utc.timestamp_nanos(21),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(2),
row_count: 2,
},
),
];
let expected = vec![
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
"| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |",
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
"| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |",
"| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |",
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
];
let schema = persistence_windows_schema();
let batch = from_write_summaries(schema, summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -2,29 +2,19 @@
use data_types::{
database_rules::{DatabaseRules, WriteBufferConnection},
database_state::DatabaseStateCode,
server_id::ServerId,
error::ErrorLogger,
DatabaseName,
};
use futures::TryStreamExt;
use generated_types::database_rules::decode_database_rules;
use internal_types::once::OnceNonZeroU32;
use object_store::{
path::{parsed::DirsAndFileName, ObjectStorePath, Path},
ObjectStore, ObjectStoreApi,
};
use observability_deps::tracing::{debug, error, info, warn};
use parking_lot::Mutex;
use observability_deps::tracing::{error, info, warn};
use parquet_file::catalog::PreservedCatalog;
use query::exec::Executor;
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
collections::HashMap,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use tokio::sync::Semaphore;
use snafu::{ResultExt, Snafu};
use std::sync::Arc;
use write_buffer::config::WriteBufferConfig;
use crate::{
@ -45,9 +35,6 @@ pub enum Error {
source: generated_types::database_rules::DecodeError,
},
#[snafu(display("id already set"))]
IdAlreadySet { id: ServerId },
#[snafu(display("unable to use server until id is set"))]
IdNotSet,
@ -97,472 +84,254 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, Default)]
pub struct CurrentServerId(OnceNonZeroU32);
/// Loads the database configurations based on the databases in the
/// object store. Any databases already in the config won't be
/// replaced.
///
/// Returns a Vec containing the results of loading the contained databases
pub(crate) async fn initialize_server(
config: Arc<Config>,
wipe_on_error: bool,
) -> Result<Vec<(DatabaseName<'static>, Result<()>)>> {
let root = config.root_path();
impl CurrentServerId {
pub fn set(&self, id: ServerId) -> Result<()> {
let id = id.get();
// get the database names from the object store prefixes
// TODO: update object store to pull back all common prefixes by
// following the next tokens.
let list_result = config
.object_store()
.list_with_delimiter(&root)
.await
.context(StoreError)?;
match self.0.set(id) {
Ok(()) => {
info!(server_id = id, "server ID set");
Ok(())
}
Err(id) => Err(Error::IdAlreadySet {
id: ServerId::new(id),
}),
}
}
let handles: Vec<_> = list_result
.common_prefixes
.into_iter()
.filter_map(|mut path| {
let config = Arc::clone(&config);
let root = root.clone();
path.set_file_name(DB_RULES_FILE_NAME);
let db_name = db_name_from_rules_path(&path)
.log_if_error("invalid database path")
.ok()?;
pub fn get(&self) -> Result<ServerId> {
self.0.get().map(ServerId::new).context(IdNotSet)
}
}
#[derive(Debug)]
pub struct InitStatus {
pub server_id: CurrentServerId,
/// Flags that databases are loaded and server is ready to read/write data.
initialized: AtomicBool,
/// Semaphore that limits the number of jobs that load DBs when the serverID is set.
///
/// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex,
/// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We
/// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical
/// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore
/// cannot be configured.
initialize_semaphore: Semaphore,
/// Error occurred during generic server init (e.g. listing store content).
error_generic: Mutex<Option<Arc<Error>>>,
/// Errors that occurred during some DB init.
errors_databases: Arc<Mutex<HashMap<String, Arc<Error>>>>,
/// Automatic wipe-on-error recovery
///
/// See <https://github.com/influxdata/influxdb_iox/issues/1522>
pub(crate) wipe_on_error: AtomicBool,
}
impl InitStatus {
/// Create new "not initialized" status.
pub fn new() -> Self {
Self {
server_id: Default::default(),
initialized: AtomicBool::new(false),
// Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`.
initialize_semaphore: Semaphore::new(1),
error_generic: Default::default(),
errors_databases: Default::default(),
wipe_on_error: AtomicBool::new(true),
}
}
/// Base location in object store for this writer.
pub fn root_path(&self, store: &ObjectStore) -> Result<Path> {
let id = self.server_id.get()?;
let mut path = store.new_path();
path.push_dir(format!("{}", id));
Ok(path)
}
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
pub fn initialized(&self) -> bool {
// Need `Acquire` ordering because IF we see a `true` here, this thread will likely also read data that
// `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` flag here ensures that
// every data access AFTER the following line will also stay AFTER this line.
self.initialized.load(Ordering::Acquire)
}
/// Error occurred during generic server init (e.g. listing store content).
pub fn error_generic(&self) -> Option<Arc<Error>> {
let guard = self.error_generic.lock();
guard.clone()
}
/// List all databases with errors in sorted order.
pub fn databases_with_errors(&self) -> Vec<String> {
let guard = self.errors_databases.lock();
let mut names: Vec<_> = guard.keys().cloned().collect();
names.sort();
names
}
/// Error that occurred during initialization of a specific database.
pub fn error_database(&self, db_name: &str) -> Option<Arc<Error>> {
let guard = self.errors_databases.lock();
guard.get(db_name).cloned()
}
/// Loads the database configurations based on the databases in the
/// object store. Any databases already in the config won't be
/// replaced.
///
/// This requires the serverID to be set (will be a no-op otherwise).
///
/// It will be a no-op if the configs are already loaded and the server is ready.
pub(crate) async fn maybe_initialize_server(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
) {
let server_id = match self.server_id.get() {
Ok(id) => id,
Err(e) => {
debug!(%e, "cannot initialize server because cannot get serverID");
return;
}
};
let _guard = self
.initialize_semaphore
.acquire()
.await
.expect("semaphore should not be closed");
// Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread
// that enters this semaphore after we've left actually sees the correct `is_ready` flag.
if self.initialized.load(Ordering::Acquire) {
// already loaded, so do nothing
return;
}
// Check if there was a previous failed attempt
if self.error_generic().is_some() {
return;
}
match self
.maybe_initialize_server_inner(store, config, exec, server_id)
.await
{
Ok(_) => {
// mark as ready (use correct ordering for Acquire-Release)
self.initialized.store(true, Ordering::Release);
info!("loaded databases, server is initalized");
}
Err(e) => {
error!(%e, "error during server init");
let mut guard = self.error_generic.lock();
*guard = Some(Arc::new(e));
}
}
}
async fn maybe_initialize_server_inner(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
server_id: ServerId,
) -> Result<()> {
let root = self.root_path(&store)?;
// get the database names from the object store prefixes
// TODO: update object store to pull back all common prefixes by
// following the next tokens.
let list_result = store.list_with_delimiter(&root).await.context(StoreError)?;
let handles: Vec<_> = list_result
.common_prefixes
.into_iter()
.filter_map(|mut path| {
let store = Arc::clone(&store);
let config = Arc::clone(&config);
let exec = Arc::clone(&exec);
let errors_databases = Arc::clone(&self.errors_databases);
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
let root = root.clone();
path.set_file_name(DB_RULES_FILE_NAME);
match db_name_from_rules_path(&path) {
Ok(db_name) => {
let handle = tokio::task::spawn(async move {
match Self::initialize_database(
server_id,
store,
config,
exec,
root,
db_name.clone(),
wipe_on_error,
)
.await
{
Ok(()) => {
info!(%db_name, "database initialized");
}
Err(e) => {
error!(%e, %db_name, "cannot load database");
let mut guard = errors_databases.lock();
guard.insert(db_name.to_string(), Arc::new(e));
}
}
});
Some(handle)
}
Err(e) => {
error!(%e, "invalid database path");
None
}
}
Some(async move {
let result =
initialize_database(config, root, db_name.clone(), wipe_on_error).await;
(db_name, result)
})
.collect();
})
.collect();
futures::future::join_all(handles).await;
Ok(futures::future::join_all(handles).await)
}
async fn initialize_database(
config: Arc<Config>,
root: Path,
db_name: DatabaseName<'static>,
wipe_on_error: bool,
) -> Result<()> {
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut handle = config
.create_db(db_name)
.map_err(Box::new)
.context(InitDbError)?;
match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await
{
Ok(true) => {
// finished init and keep DB
handle.commit();
Ok(())
}
Ok(false) => {
// finished but do not keep DB
handle.abort();
Ok(())
}
Err(e) => {
// encountered some error, still commit intermediate result
handle.commit();
Err(e)
}
}
}
async fn load_database_rules(store: Arc<ObjectStore>, path: Path) -> Result<Option<DatabaseRules>> {
let serialized_rules = loop {
match get_database_config_bytes(&path, &store).await {
Ok(data) => break data,
Err(e) => {
if let Error::NoDatabaseConfigError { location } = &e {
warn!(?location, "{}", e);
return Ok(None);
}
error!(
"error getting database config {:?} from object store: {}",
path, e
);
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
.await;
}
}
};
let rules = decode_database_rules(serialized_rules.freeze())
.context(ErrorDeserializingRulesProtobuf)?;
Ok(Some(rules))
}
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
config: Arc<Config>,
db_name: &DatabaseName<'static>,
) -> Result<()> {
let store = config.object_store();
if config.has_uninitialized_database(db_name) {
let mut handle = config
.recover_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
if !((handle.state_code() == DatabaseStateCode::Known)
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
{
// cannot wipe because init state is already too far
return Err(Error::DbPartiallyInitialized {
db_name: db_name.to_string(),
});
}
// wipe while holding handle so no other init/wipe process can interact with the catalog
PreservedCatalog::wipe(&store, handle.server_id(), db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
let root = config.root_path();
let result =
try_advance_database_init_process_until_complete(&mut handle, &root, true).await;
// Commit changes even if failed
handle.commit();
result.map(|_| ())
} else {
let handle = config
.block_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
PreservedCatalog::wipe(&store, config.server_id(), db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
drop(handle);
info!(%db_name, "wiped preserved catalog of non-registered database");
Ok(())
}
}
async fn initialize_database(
server_id: ServerId,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
root: Path,
db_name: DatabaseName<'static>,
wipe_on_error: bool,
) -> Result<()> {
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut handle = config
.create_db(store, exec, server_id, db_name)
.map_err(Box::new)
.context(InitDbError)?;
match Self::try_advance_database_init_process_until_complete(
&mut handle,
&root,
wipe_on_error,
)
.await
{
Ok(true) => {
// finished init and keep DB
handle.commit();
Ok(())
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
async fn try_advance_database_init_process_until_complete(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<bool> {
loop {
match try_advance_database_init_process(handle, root, wipe_on_error).await? {
InitProgress::Unfinished => {}
InitProgress::Done => {
return Ok(true);
}
Ok(false) => {
// finished but do not keep DB
handle.abort();
Ok(())
}
Err(e) => {
// encountered some error, still commit intermediate result
handle.commit();
Err(e)
InitProgress::Forget => {
return Ok(false);
}
}
}
}
async fn load_database_rules(
store: Arc<ObjectStore>,
path: Path,
) -> Result<Option<DatabaseRules>> {
let serialized_rules = loop {
match get_database_config_bytes(&path, &store).await {
Ok(data) => break data,
Err(e) => {
if let Error::NoDatabaseConfigError { location } = &e {
warn!(?location, "{}", e);
return Ok(None);
}
error!(
"error getting database config {:?} from object store: {}",
path, e
);
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
.await;
/// Try to make some progress in the DB init.
async fn try_advance_database_init_process(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<InitProgress> {
match handle.state_code() {
DatabaseStateCode::Known => {
// known => load DB rules
let path = object_store_path_for_database_config(root, &handle.db_name());
match load_database_rules(handle.object_store(), path).await? {
Some(rules) => {
handle
.advance_rules_loaded(rules)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
None => {
// no rules file present, advise to forget this DB
Ok(InitProgress::Forget)
}
}
};
let rules = decode_database_rules(serialized_rules.freeze())
.context(ErrorDeserializingRulesProtobuf)?;
Ok(Some(rules))
}
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
server_id: ServerId,
db_name: DatabaseName<'static>,
) -> Result<()> {
if config.has_uninitialized_database(&db_name) {
let mut handle = config
.recover_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
if !((handle.state_code() == DatabaseStateCode::Known)
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
{
// cannot wipe because init state is already too far
return Err(Error::DbPartiallyInitialized {
db_name: db_name.to_string(),
});
}
// wipe while holding handle so no other init/wipe process can interact with the catalog
PreservedCatalog::wipe(&store, handle.server_id(), &db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
let root = self.root_path(&store)?;
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
match Self::try_advance_database_init_process_until_complete(
&mut handle,
&root,
}
DatabaseStateCode::RulesLoaded => {
// rules already loaded => continue with loading preserved catalog
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&handle.db_name(),
handle.object_store(),
handle.server_id(),
handle.metrics_registry(),
wipe_on_error,
)
.await
{
Ok(_) => {
// yeah, recovered DB
handle.commit();
.map_err(|e| Box::new(e) as _)
.context(CatalogLoadError)?;
let mut guard = self.errors_databases.lock();
guard.remove(&db_name.to_string());
info!(%db_name, "wiped preserved catalog of registered database and recovered");
Ok(())
}
Err(e) => {
// could not recover, but still keep new result
handle.commit();
let mut guard = self.errors_databases.lock();
let e = Arc::new(e);
guard.insert(db_name.to_string(), Arc::clone(&e));
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
Err(Error::RecoverDbError { source: e })
}
}
} else {
let handle = config
.block_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
PreservedCatalog::wipe(&store, server_id, &db_name)
let rules = handle
.rules()
.expect("in this state rules should be loaded");
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
.await
.context(CreateWriteBuffer {
config: rules.write_buffer_connection.clone(),
})?;
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
handle
.advance_replay(preserved_catalog, catalog, write_buffer)
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
.context(InitDbError)?;
drop(handle);
info!(%db_name, "wiped preserved catalog of non-registered database");
Ok(())
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
}
DatabaseStateCode::Replay => {
let db = handle
.db_any_state()
.expect("DB should be available in this state");
db.perform_replay().await;
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
async fn try_advance_database_init_process_until_complete(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<bool> {
loop {
match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? {
InitProgress::Unfinished => {}
InitProgress::Done => {
return Ok(true);
}
InitProgress::Forget => {
return Ok(false);
}
}
handle
.advance_init()
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
}
/// Try to make some progress in the DB init.
async fn try_advance_database_init_process(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<InitProgress> {
match handle.state_code() {
DatabaseStateCode::Known => {
// known => load DB rules
let path = object_store_path_for_database_config(root, &handle.db_name());
match Self::load_database_rules(handle.object_store(), path).await? {
Some(rules) => {
handle
.advance_rules_loaded(rules)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
None => {
// no rules file present, advise to forget this DB
Ok(InitProgress::Forget)
}
}
}
DatabaseStateCode::RulesLoaded => {
// rules already loaded => continue with loading preserved catalog
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&handle.db_name(),
handle.object_store(),
handle.server_id(),
handle.metrics_registry(),
wipe_on_error,
)
.await
.map_err(|e| Box::new(e) as _)
.context(CatalogLoadError)?;
let rules = handle
.rules()
.expect("in this state rules should be loaded");
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
.await
.context(CreateWriteBuffer {
config: rules.write_buffer_connection.clone(),
})?;
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
handle
.advance_replay(preserved_catalog, catalog, write_buffer)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
DatabaseStateCode::Replay => {
let db = handle
.db_any_state()
.expect("DB should be available in this state");
db.perform_replay().await;
handle
.advance_init()
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
DatabaseStateCode::Initialized => {
// database fully initialized => nothing to do
Ok(InitProgress::Done)
}
DatabaseStateCode::Initialized => {
// database fully initialized => nothing to do
Ok(InitProgress::Done)
}
}
}


@ -74,9 +74,8 @@ use std::sync::Arc;
use async_trait::async_trait;
use bytes::BytesMut;
use db::load::create_preserved_catalog;
use init::InitStatus;
use observability_deps::tracing::{debug, info, warn};
use parking_lot::Mutex;
use observability_deps::tracing::{debug, error, info, warn};
use parking_lot::{Mutex, RwLockUpgradableReadGuard};
use snafu::{OptionExt, ResultExt, Snafu};
use data_types::{
@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb;
use influxdb_line_protocol::ParsedLine;
use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry};
use object_store::{ObjectStore, ObjectStoreApi};
use parking_lot::RwLock;
use query::{exec::Executor, DatabaseStore};
use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt};
use write_buffer::config::WriteBufferConfig;
@ -220,11 +220,11 @@ pub enum Error {
#[snafu(display("cannot create preserved catalog: {}", source))]
CannotCreatePreservedCatalog { source: DatabaseError },
#[snafu(display("cannot set id: {}", source))]
SetIdError { source: crate::init::Error },
#[snafu(display("id already set"))]
IdAlreadySet,
#[snafu(display("cannot get id: {}", source))]
GetIdError { source: crate::init::Error },
#[snafu(display("id not set"))]
IdNotSet,
#[snafu(display(
"cannot create write buffer with config: {:?}, error: {}",
@ -297,6 +297,8 @@ pub struct ServerConfig {
metric_registry: Arc<MetricRegistry>,
remote_template: Option<RemoteTemplate>,
wipe_catalog_on_error: bool,
}
impl ServerConfig {
@ -311,6 +313,7 @@ impl ServerConfig {
object_store,
metric_registry,
remote_template,
wipe_catalog_on_error: true,
}
}
@ -414,7 +417,6 @@ impl ServerMetrics {
/// of these structs, which keeps track of all replication and query rules.
#[derive(Debug)]
pub struct Server<M: ConnectionManager> {
config: Arc<Config>,
connection_manager: Arc<M>,
pub store: Arc<ObjectStore>,
exec: Arc<Executor>,
@ -426,7 +428,50 @@ pub struct Server<M: ConnectionManager> {
/// and populates the endpoint with this data.
pub registry: Arc<metrics::MetricRegistry>,
init_status: Arc<InitStatus>,
/// The state machine for server startup
stage: Arc<RwLock<ServerStage>>,
}
/// The stage of the server in the startup process
///
/// The progression is linear: Startup -> InitReady -> Initializing -> Initialized
/// with the sole exception that on failure Initializing -> InitReady
///
/// Errors encountered on server init will be retried; however, errors encountered
/// during database init will require operator intervention
///
/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively
///
/// They do not impact the state machine's progression, but instead are exposed to the
/// gRPC management API to allow an operator to assess the state of the system
#[derive(Debug)]
enum ServerStage {
/// Server has started but doesn't have a server id yet
Startup {
remote_template: Option<RemoteTemplate>,
wipe_catalog_on_error: bool,
},
/// Server can be initialized
InitReady {
wipe_catalog_on_error: bool,
config: Arc<Config>,
last_error: Option<Arc<init::Error>>,
},
/// Server has a server id, has started loading
Initializing {
wipe_catalog_on_error: bool,
config: Arc<Config>,
last_error: Option<Arc<init::Error>>,
},
/// Server has finished initializing, possibly with errors
Initialized {
config: Arc<Config>,
/// Errors that occurred during some DB init.
database_errors: HashMap<String, Arc<init::Error>>,
},
}
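An illustrative sketch (not part of the diff) of the stage progression described in the doc comment above, written against a simplified copy of the states; in the real code these transitions are driven by `set_id` and `maybe_initialize_server`.

// Sketch only: the legal ServerStage transitions, ignoring the per-stage payloads.
#[derive(Debug, PartialEq)]
enum Stage {
    Startup,
    InitReady,
    Initializing,
    Initialized,
}

fn next(stage: Stage, init_succeeded: bool) -> Stage {
    match stage {
        Stage::Startup => Stage::InitReady,       // set_id()
        Stage::InitReady => Stage::Initializing,  // maybe_initialize_server() begins
        Stage::Initializing if init_succeeded => Stage::Initialized,
        Stage::Initializing => Stage::InitReady,  // init failed and can be retried
        Stage::Initialized => Stage::Initialized, // terminal
    }
}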
#[derive(Debug)]
@ -454,22 +499,23 @@ where
// to test the metrics provide a different registry to the `ServerConfig`.
metric_registry,
remote_template,
wipe_catalog_on_error,
} = config;
let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get);
let exec = Arc::new(Executor::new(num_worker_threads));
Self {
config: Arc::new(Config::new(
Arc::clone(&jobs),
Arc::clone(&metric_registry),
remote_template,
)),
store: object_store,
connection_manager: Arc::new(connection_manager),
exec: Arc::new(Executor::new(num_worker_threads)),
exec,
jobs,
metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))),
registry: Arc::clone(&metric_registry),
init_status: Arc::new(InitStatus::new()),
stage: Arc::new(RwLock::new(ServerStage::Startup {
remote_template,
wipe_catalog_on_error,
})),
}
}
@ -478,68 +524,112 @@ where
///
/// A valid server ID Must be non-zero.
pub fn set_id(&self, id: ServerId) -> Result<()> {
self.init_status.server_id.set(id).context(SetIdError)
}
let mut stage = self.stage.write();
match &mut *stage {
ServerStage::Startup {
remote_template,
wipe_catalog_on_error,
} => {
let remote_template = remote_template.take();
/// Returns the current server ID, or an error if not yet set.
pub fn require_id(&self) -> Result<ServerId> {
self.init_status.server_id.get().context(GetIdError)
*stage = ServerStage::InitReady {
wipe_catalog_on_error: *wipe_catalog_on_error,
config: Arc::new(Config::new(
Arc::clone(&self.jobs),
Arc::clone(&self.store),
Arc::clone(&self.exec),
id,
Arc::clone(&self.registry),
remote_template,
)),
last_error: None,
};
Ok(())
}
_ => Err(Error::IdAlreadySet),
}
}
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
pub fn initialized(&self) -> bool {
self.init_status.initialized()
matches!(&*self.stage.read(), ServerStage::Initialized { .. })
}
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
fn require_initialized(&self) -> Result<Arc<Config>> {
match &*self.stage.read() {
ServerStage::Startup { .. } => Err(Error::IdNotSet),
ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => {
Err(Error::ServerNotInitialized {
server_id: config.server_id(),
})
}
ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
}
}
/// Returns the config for this server if server id has been set
fn config(&self) -> Result<Arc<Config>> {
let stage = self.stage.read();
match &*stage {
ServerStage::Startup { .. } => Err(Error::IdNotSet),
ServerStage::InitReady { config, .. }
| ServerStage::Initializing { config, .. }
| ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
}
}
/// Returns the server id for this server if set
pub fn server_id(&self) -> Option<ServerId> {
self.config().map(|x| x.server_id()).ok()
}
/// Error occurred during generic server init (e.g. listing store content).
pub fn error_generic(&self) -> Option<Arc<crate::init::Error>> {
self.init_status.error_generic()
let stage = self.stage.read();
match &*stage {
ServerStage::InitReady { last_error, .. } => last_error.clone(),
ServerStage::Initializing { last_error, .. } => last_error.clone(),
_ => None,
}
}
/// List all databases with errors in sorted order.
pub fn databases_with_errors(&self) -> Vec<String> {
self.init_status.databases_with_errors()
let stage = self.stage.read();
match &*stage {
ServerStage::Initialized {
database_errors, ..
} => database_errors.keys().cloned().collect(),
_ => Default::default(),
}
}
/// Error that occurred during initialization of a specific database.
pub fn error_database(&self, db_name: &str) -> Option<Arc<crate::init::Error>> {
self.init_status.error_database(db_name)
let stage = self.stage.read();
match &*stage {
ServerStage::Initialized {
database_errors, ..
} => database_errors.get(db_name).cloned(),
_ => None,
}
}
/// Current database init state.
pub fn database_state(&self, name: &str) -> Option<DatabaseStateCode> {
if let Ok(name) = DatabaseName::new(name) {
self.config.db_state(&name)
} else {
None
}
}
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
fn require_initialized(&self) -> Result<ServerId> {
// since a server ID is the pre-requirement for init, check this first
let server_id = self.require_id()?;
// ordering here isn't that important since this method is not used to check-and-modify the flag
if self.initialized() {
Ok(server_id)
} else {
Err(Error::ServerNotInitialized { server_id })
}
let db_name = DatabaseName::new(name).ok()?;
let config = self.config().ok()?;
config.db_state(&db_name)
}
/// Tells the server the set of rules for a database.
pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> {
// Return an error if this server is not yet ready
let server_id = self.require_initialized()?;
let config = self.require_initialized()?;
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut db_reservation = self.config.create_db(
Arc::clone(&self.store),
Arc::clone(&self.exec),
server_id,
rules.name.clone(),
)?;
let mut db_reservation = config.create_db(rules.name.clone())?;
// register rules
db_reservation.advance_rules_loaded(rules.clone())?;
@ -548,14 +638,14 @@ where
let (preserved_catalog, catalog) = create_preserved_catalog(
rules.db_name(),
Arc::clone(&self.store),
server_id,
self.config.metrics_registry(),
config.server_id(),
config.metrics_registry(),
)
.await
.map_err(|e| Box::new(e) as _)
.context(CannotCreatePreservedCatalog)?;
let write_buffer = WriteBufferConfig::new(server_id, &rules)
let write_buffer = WriteBufferConfig::new(config.server_id(), &rules)
.await
.map_err(|e| Error::CreatingWriteBuffer {
config: rules.write_buffer_connection.clone(),
@ -575,13 +665,8 @@ where
}
pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> {
let location = object_store_path_for_database_config(
&self
.init_status
.root_path(&self.store)
.context(GetIdError)?,
&rules.name,
);
let config = self.config()?;
let location = object_store_path_for_database_config(&config.root_path(), &rules.name);
let mut data = BytesMut::new();
encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?;
@ -604,15 +689,62 @@ where
/// object store. Any databases already in the config won't be
/// replaced.
///
/// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready.
/// This requires the serverID to be set.
///
/// It will be a no-op if the configs are already loaded and the server is ready.
pub async fn maybe_initialize_server(&self) {
self.init_status
.maybe_initialize_server(
Arc::clone(&self.store),
Arc::clone(&self.config),
Arc::clone(&self.exec),
)
.await;
// Explicit scope to help async generator
let (wipe_catalog_on_error, config) = {
let state = self.stage.upgradable_read();
match &*state {
ServerStage::InitReady {
wipe_catalog_on_error,
config,
last_error,
} => {
let config = Arc::clone(config);
let last_error = last_error.clone();
let wipe_catalog_on_error = *wipe_catalog_on_error;
// Mark the server as initializing and drop lock
let mut state = RwLockUpgradableReadGuard::upgrade(state);
*state = ServerStage::Initializing {
config: Arc::clone(&config),
wipe_catalog_on_error,
last_error,
};
(wipe_catalog_on_error, config)
}
_ => return,
}
};
let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await;
let new_stage = match init_result {
// Success -> move to next stage
Ok(results) => {
info!(server_id=%config.server_id(), "server initialized");
ServerStage::Initialized {
config,
database_errors: results
.into_iter()
.filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?))))
.collect(),
}
}
// Error -> return to InitReady
Err(err) => {
error!(%err, "error during server init");
ServerStage::InitReady {
wipe_catalog_on_error,
config,
last_error: Some(Arc::new(err)),
}
}
};
*self.stage.write() = new_stage;
}
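The method above relies on parking_lot's upgradable read lock: readers can keep inspecting the stage while it decides whether to transition, and the write lock is held only for the brief moment of the state change. A minimal standalone sketch of that pattern, assuming nothing beyond the parking_lot crate:

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

fn bump_if_even(counter: &RwLock<u64>) {
    // Plain readers are not blocked while this upgradable read guard is held.
    let guard = counter.upgradable_read();
    if *guard % 2 == 0 {
        // Upgrade atomically to a write lock; no other writer can slip in between
        // the check above and the mutation below.
        let mut write = RwLockUpgradableReadGuard::upgrade(guard);
        *write += 1;
    }
}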
pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> {
@ -640,11 +772,10 @@ where
default_time: i64,
) -> Result<()> {
// Return an error if this server is not yet ready
self.require_initialized()?;
let config = self.require_initialized()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
.config
let db = config
.db_initialized(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
@ -744,9 +875,12 @@ where
node_group: &[ServerId],
entry: Entry,
) -> Result<()> {
// Return an error if this server is not yet ready
let config = self.config()?;
let addrs: Vec<_> = node_group
.iter()
.filter_map(|&node| self.config.resolve_remote(node))
.filter_map(|&node| config.resolve_remote(node))
.collect();
if addrs.is_empty() {
return NoRemoteConfigured { node_group }.fail();
@ -775,11 +909,10 @@ where
pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec<u8>) -> Result<()> {
// Return an error if this server is not yet ready
self.require_initialized()?;
let config = self.require_initialized()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
.config
let db = config
.db_initialized(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
@ -825,11 +958,11 @@ where
}
pub fn db(&self, name: &DatabaseName<'_>) -> Option<Arc<Db>> {
self.config.db_initialized(name)
self.config().ok()?.db_initialized(name)
}
pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option<Arc<DatabaseRules>> {
self.config.db_initialized(name).map(|d| d.rules())
self.db(name).map(|d| d.rules())
}
// Update database rules and save on success.
@ -841,8 +974,8 @@ where
where
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E> + Send,
{
let rules = self
.config
let config = self.config()?;
let rules = config
.update_db_rules(db_name, update)
.map_err(|e| match e {
crate::config::UpdateError::Closure(e) => UpdateError::Closure(e),
@ -854,16 +987,23 @@ where
Ok(rules)
}
pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> {
self.config.remotes_sorted()
pub fn remotes_sorted(&self) -> Result<Vec<(ServerId, String)>> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
Ok(config.remotes_sorted())
}
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) {
self.config.update_remote(id, addr)
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
config.update_remote(id, addr);
Ok(())
}
pub fn delete_remote(&self, id: ServerId) -> Option<GRpcConnectionString> {
self.config.delete_remote(id)
pub fn delete_remote(&self, id: ServerId) -> Result<Option<GRpcConnectionString>> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
Ok(config.delete_remote(id))
}
pub fn spawn_dummy_job(&self, nanos: Vec<u64>) -> TaskTracker<Job> {
@ -893,14 +1033,15 @@ where
partition_key: impl Into<String>,
chunk_id: u32,
) -> Result<TaskTracker<Job>> {
let config = self.require_initialized()?;
let db_name = db_name.to_string();
let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?;
let partition_key = partition_key.into();
let table_name = table_name.into();
let db = self
.config
let db = config
.db_initialized(&name)
.context(DatabaseNotFound { db_name: &db_name })?;
@ -921,25 +1062,62 @@ where
/// DB jobs and this command.
pub fn wipe_preserved_catalog(
&self,
db_name: DatabaseName<'static>,
db_name: &DatabaseName<'static>,
) -> Result<TaskTracker<Job>> {
if self.config.db_initialized(&db_name).is_some() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
// Can only wipe the catalog of a database that failed to initialize
let config = match &*self.stage.read() {
ServerStage::Initialized {
config,
database_errors,
} => {
if config.db_initialized(db_name).is_some() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
if !database_errors.contains_key(db_name.as_str()) {
// TODO: Should this be an error? Some end-to-end tests assume it is non-fatal
warn!(%db_name, "wiping database not present at startup");
}
Arc::clone(config)
}
ServerStage::Startup { .. } => return Err(Error::IdNotSet),
ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => {
return Err(Error::ServerNotInitialized {
server_id: config.server_id(),
})
}
};
let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog {
db_name: db_name.to_string(),
});
let object_store = Arc::clone(&self.store);
let config = Arc::clone(&self.config);
let server_id = self.require_id()?;
let init_status = Arc::clone(&self.init_status);
let state = Arc::clone(&self.stage);
let db_name = db_name.clone();
let task = async move {
init_status
.wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name)
.await
let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await;
match &mut *state.write() {
ServerStage::Initialized {
database_errors, ..
} => match result {
Ok(_) => {
info!(%db_name, "wiped preserved catalog of registered database and recovered");
database_errors.remove(db_name.as_str());
Ok(())
}
Err(e) => {
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
let e = Arc::new(e);
database_errors.insert(db_name.to_string(), Arc::clone(&e));
Err(e)
}
},
_ => unreachable!("server cannot become uninitialized"),
}
};
tokio::spawn(task.track(registration));
@ -973,7 +1151,9 @@ where
}
info!("shutting down background workers");
self.config.drain().await;
if let Ok(config) = self.config() {
config.drain().await;
}
info!("draining tracker registry");
@ -999,11 +1179,15 @@ where
type Error = Error;
fn db_names_sorted(&self) -> Vec<String> {
self.config
.db_names_sorted()
.iter()
.map(|i| i.clone().into())
.collect()
self.config()
.map(|config| {
config
.db_names_sorted()
.iter()
.map(ToString::to_string)
.collect()
})
.unwrap_or_default()
}
fn db(&self, name: &str) -> Option<Arc<Self::Database>> {
@ -1214,25 +1398,15 @@ mod tests {
let manager = TestConnectionManager::new();
let server = Server::new(manager, config());
let resp = server.require_id().unwrap_err();
assert!(matches!(
resp,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
let resp = server.config().unwrap_err();
assert!(matches!(resp, Error::IdNotSet));
let lines = parsed_lines("cpu foo=1 10");
let resp = server
.write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME)
.await
.unwrap_err();
assert!(matches!(
resp,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
assert!(matches!(resp, Error::IdNotSet));
}
#[tokio::test]
@ -1559,7 +1733,7 @@ mod tests {
let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2];
let db = server.db(&db_name).unwrap();
db.update_db_rules(|mut rules| {
db.update_rules(|mut rules| {
let shard_config = ShardConfig {
hash_ring: Some(HashRing {
shards: vec![TEST_SHARD_ID].into(),
@ -1589,7 +1763,9 @@ mod tests {
);
// one remote is configured but it's down and we'll get connection error
server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into());
server
.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into())
.unwrap();
let err = server
.write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME)
.await
@ -1606,8 +1782,12 @@ mod tests {
// We configure the address for the other remote, this time connection will succeed
// despite the bad remote failing to connect.
server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into());
server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into());
server
.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into())
.unwrap();
server
.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into())
.unwrap();
// Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable
// probability both the remotes will get hit.
@ -1796,7 +1976,7 @@ mod tests {
let db_name = DatabaseName::new("foo").unwrap();
let db = server.db(&db_name).unwrap();
let rules = db
.update_db_rules(|mut rules| {
.update_rules(|mut rules| {
rules.lifecycle_rules.buffer_size_hard =
Some(std::num::NonZeroUsize::new(10).unwrap());
Ok::<_, Infallible>(rules)
@ -1844,12 +2024,7 @@ mod tests {
let err = create_simple_database(&server, "bananas")
.await
.unwrap_err();
assert!(matches!(
err,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
assert!(matches!(err, Error::IdNotSet));
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
// do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready
@ -1873,7 +2048,7 @@ mod tests {
let t_0 = Instant::now();
loop {
if server.require_initialized().is_ok() {
if server.config().is_ok() {
break;
}
assert!(t_0.elapsed() < Duration::from_secs(10));
@ -1916,9 +2091,12 @@ mod tests {
create_simple_database(&server, "foo")
.await
.expect("failed to create database");
let root = server.init_status.root_path(&store).unwrap();
server.config.drain().await;
let config = server.require_initialized().unwrap();
let root = config.root_path();
config.drain().await;
drop(server);
drop(config);
// tamper store
let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap());
@ -2003,18 +2181,24 @@ mod tests {
let server = Server::new(manager, config);
server.set_id(server_id).unwrap();
server.maybe_initialize_server().await;
create_simple_database(&server, db_name_existing.clone())
.await
.expect("failed to create database");
create_simple_database(&server, db_name_rules_broken.clone())
.await
.expect("failed to create database");
create_simple_database(&server, db_name_catalog_broken.clone())
.await
.expect("failed to create database");
let root = server.init_status.root_path(&store).unwrap();
server.config.drain().await;
let config = server.require_initialized().unwrap();
let root = config.root_path();
config.drain().await;
drop(server);
drop(config);
// tamper store to break one database
let path = object_store_path_for_database_config(&root, &db_name_rules_broken);
@ -2045,22 +2229,18 @@ mod tests {
let store = Arc::try_unwrap(store).unwrap();
store.get(&path).await.unwrap();
let manager = TestConnectionManager::new();
let config = config_with_store(store);
let server = Server::new(manager, config);
// need to disable auto-wipe for this test
server
.init_status
.wipe_on_error
.store(false, std::sync::atomic::Ordering::Relaxed);
let mut config = config_with_store(store);
config.wipe_catalog_on_error = false;
let server = Server::new(manager, config);
// cannot wipe if server ID is not set
assert_eq!(
server
.wipe_preserved_catalog(db_name_non_existing.clone())
.wipe_preserved_catalog(&db_name_non_existing)
.unwrap_err()
.to_string(),
"cannot get id: unable to use server until id is set"
"id not set"
);
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
@ -2069,31 +2249,29 @@ mod tests {
// 1. cannot wipe if DB exists
assert_eq!(
server
.wipe_preserved_catalog(db_name_existing.clone())
.wipe_preserved_catalog(&db_name_existing)
.unwrap_err()
.to_string(),
"database already exists: db_existing"
);
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
&db_name_existing.to_string()
)
.await
.unwrap());
assert!(
PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str())
.await
.unwrap()
);
// 2. wiping a non-existing DB just works, but won't bring DB into existence
assert!(server.error_database(&db_name_non_existing).is_none());
PreservedCatalog::new_empty::<TestCatalogState>(
Arc::clone(&server.store),
server.require_id().unwrap(),
server_id,
db_name_non_existing.to_string(),
(),
)
.await
.unwrap();
let tracker = server
.wipe_preserved_catalog(db_name_non_existing.clone())
.wipe_preserved_catalog(&db_name_non_existing)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2103,7 +2281,7 @@ mod tests {
tracker.join().await;
assert!(!PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_non_existing.to_string()
)
.await
@ -2114,7 +2292,7 @@ mod tests {
// 3. wipe DB with broken rules file, this won't bring DB back to life
assert!(server.error_database(&db_name_rules_broken).is_some());
let tracker = server
.wipe_preserved_catalog(db_name_rules_broken.clone())
.wipe_preserved_catalog(&db_name_rules_broken)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2124,7 +2302,7 @@ mod tests {
tracker.join().await;
assert!(!PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_rules_broken.to_string()
)
.await
@ -2135,7 +2313,7 @@ mod tests {
// 4. wipe DB with broken catalog, this will bring the DB back to life
assert!(server.error_database(&db_name_catalog_broken).is_some());
let tracker = server
.wipe_preserved_catalog(db_name_catalog_broken.clone())
.wipe_preserved_catalog(&db_name_catalog_broken)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2145,7 +2323,7 @@ mod tests {
tracker.join().await;
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_catalog_broken.to_string()
)
.await
@ -2166,18 +2344,16 @@ mod tests {
.unwrap();
assert_eq!(
server
.wipe_preserved_catalog(db_name_created.clone())
.wipe_preserved_catalog(&db_name_created)
.unwrap_err()
.to_string(),
"database already exists: db_created"
);
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
&db_name_created.to_string()
)
.await
.unwrap());
assert!(
PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string())
.await
.unwrap()
);
}
#[tokio::test]


@ -119,6 +119,11 @@ struct Create {
/// Maximum number of rows to buffer in a MUB chunk before compacting it
#[structopt(long, default_value = "100000")]
mub_row_threshold: u64,
/// Use up to this amount of space in bytes for caching Parquet files. A
/// value of zero disables Parquet file caching.
#[structopt(long, default_value = "0")]
parquet_cache_limit: u64,
}
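The CLI keeps the limit as a plain u64 where 0 means "disabled", while the in-memory lifecycle rules model it as Option<NonZeroU64> with None disabling the cache. A hedged sketch of that mapping (the helper name is hypothetical; the actual conversion lives in the rules conversion code, not in this file):

use std::num::NonZeroU64;

// Hypothetical helper: a zero sentinel from the CLI/protobuf side becomes None.
fn parquet_cache_limit_from_flag(raw: u64) -> Option<NonZeroU64> {
    NonZeroU64::new(raw)
}

// parquet_cache_limit_from_flag(0) == None                         (caching disabled)
// parquet_cache_limit_from_flag(10 * 1024 * 1024) == NonZeroU64::new(10 * 1024 * 1024)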
/// Get list of databases
@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> {
persist_row_threshold: command.persist_row_threshold,
persist_age_threshold_seconds: command.persist_age_threshold_seconds,
mub_row_threshold: command.mub_row_threshold,
parquet_cache_limit: command.parquet_cache_limit,
}),
// Default to hourly partitions


@ -231,6 +231,14 @@ Possible values (case insensitive):
default_value = "serving"
)]
pub initial_serving_state: ServingReadinessState,
/// Maximum size of HTTP requests.
#[structopt(
long = "--max-http-request-size",
env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE",
default_value = "10485760" // 10 MiB
)]
pub max_http_request_size: usize,
}
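For reference, the default of 10485760 bytes is 10 * 1024 * 1024 (10 MiB), the same value as the MAX_SIZE constant that was hard-coded in the HTTP module shown further down in this diff.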
pub async fn command(config: Config) -> Result<()> {


@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> {
let bind_addr = config.http_bind_address;
let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?;
let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse();
let max_http_request_size = config.max_http_request_size;
let http_server = http::serve(
addr,
Arc::clone(&app_server),
frontend_shutdown.clone(),
max_http_request_size,
)
.fuse();
info!(bind_address=?bind_addr, "HTTP server listening");
info!(git_hash, "InfluxDB IOx server ready");


@ -342,12 +342,26 @@ impl ApplicationError {
}
}
const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB
fn router<M>(server: Arc<AppServer<M>>) -> Router<Body, ApplicationError>
struct Server<M>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
app_server: Arc<AppServer<M>>,
max_request_size: usize,
}
fn router<M>(
app_server: Arc<AppServer<M>>,
max_request_size: usize,
) -> Router<Body, ApplicationError>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let server = Server {
app_server,
max_request_size,
};
// Create a router and specify the the handlers.
Router::builder()
.data(server)
@ -408,7 +422,7 @@ struct WriteInfo {
/// Parse the request's body into raw bytes, applying size limits and
/// content encoding as needed.
async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError> {
async fn parse_body(req: hyper::Request<Body>, max_size: usize) -> Result<Bytes, ApplicationError> {
// clippy says the const needs to be assigned to a local variable:
// error: a `const` item with interior mutability should not be borrowed
let header_name = CONTENT_ENCODING;
@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
while let Some(chunk) = payload.next().await {
let chunk = chunk.context(ClientHangup)?;
// limit max size of in-memory payload
if (body.len() + chunk.len()) > MAX_SIZE {
if (body.len() + chunk.len()) > max_size {
return Err(ApplicationError::RequestSizeExceeded {
max_body_size: MAX_SIZE,
max_body_size: max_size,
});
}
body.extend_from_slice(&chunk);
@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
use std::io::Read;
let decoder = flate2::read::GzDecoder::new(&body[..]);
// Read at most MAX_SIZE bytes to prevent a decompression bomb based
// Read at most max_size bytes to prevent a decompression bomb based
// DoS.
let mut decoder = decoder.take(MAX_SIZE as u64);
let mut decoder = decoder.take(max_size as u64);
let mut decoded_data = Vec::new();
decoder
.read_to_end(&mut decoded_data)
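A standalone sketch of the capping pattern used above, assuming only the flate2 crate (not taken from this diff): Read::take bounds how many decoded bytes can ever be produced, so a tiny compressed body cannot balloon into an unbounded allocation. A production version would additionally detect whether the cap was actually hit.

use std::io::Read;

fn decode_gzip_capped(compressed: &[u8], max_size: usize) -> std::io::Result<Vec<u8>> {
    let decoder = flate2::read::GzDecoder::new(compressed);
    // Never yield more than max_size decoded bytes, whatever the compressed size claims.
    let mut decoder = decoder.take(max_size as u64);
    let mut decoded = Vec::new();
    decoder.read_to_end(&mut decoded)?;
    Ok(decoded)
}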
@ -464,7 +478,12 @@ where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let Server {
app_server: server,
max_request_size,
} = req.data::<Server<M>>().expect("server state");
let max_request_size = *max_request_size;
let server = Arc::clone(&server);
// TODO(edd): figure out best way of catching all errors in this observation.
let obs = server.metrics.http_requests.observation(); // instrument request
@ -481,7 +500,7 @@ where
let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket)
.context(BucketMappingError)?;
let body = parse_body(req).await?;
let body = parse_body(req, max_request_size).await?;
let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?;
@ -595,7 +614,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
// TODO(edd): figure out best way of catching all errors in this observation.
let obs = server.metrics.http_requests.observation(); // instrument request
@ -661,7 +680,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
let path = req.uri().path().to_string();
server
.metrics
@ -677,7 +696,7 @@ async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
async fn handle_metrics<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
let path = req.uri().path().to_string();
server
.metrics
@ -700,7 +719,7 @@ async fn list_partitions<M: ConnectionManager + Send + Sync + Debug + 'static>(
) -> Result<Response<Body>, ApplicationError> {
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
// TODO - catch error conditions
let obs = server.metrics.http_requests.observation();
@ -841,11 +860,12 @@ pub async fn serve<M>(
addr: AddrIncoming,
server: Arc<AppServer<M>>,
shutdown: CancellationToken,
max_request_size: usize,
) -> Result<(), hyper::Error>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let router = router(server);
let router = router(server, max_request_size);
let service = RouterService::new(router).unwrap();
hyper::Server::builder(addr)
@ -1234,6 +1254,8 @@ mod tests {
.await;
}
const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024;
#[tokio::test]
async fn client_hangup_during_parse() {
#[derive(Debug, Snafu)]
@ -1253,7 +1275,9 @@ mod tests {
.body(body)
.unwrap();
let parse_result = parse_body(request).await.unwrap_err();
let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE)
.await
.unwrap_err();
assert_eq!(
parse_result.to_string(),
"Client hung up while sending body: error reading a body from connection: Blarg Error"
@ -1334,7 +1358,12 @@ mod tests {
let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server");
let server_url = format!("http://{}", addr.local_addr());
tokio::task::spawn(serve(addr, server, CancellationToken::new()));
tokio::task::spawn(serve(
addr,
server,
CancellationToken::new(),
TEST_MAX_REQUEST_SIZE,
));
println!("Started server at {}", server_url);
server_url
}


@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status {
use server::Error;
match error {
Error::GetIdError { .. } => PreconditionViolation {
Error::IdNotSet => PreconditionViolation {
category: "Writer ID".to_string(),
subject: "influxdata.com/iox".to_string(),
description: "Writer ID must be set".to_string(),


@ -56,7 +56,7 @@ where
&self,
_: Request<GetServerIdRequest>,
) -> Result<Response<GetServerIdResponse>, Status> {
match self.server.require_id().ok() {
match self.server.server_id() {
Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })),
None => return Err(NotFound::default().into()),
}
@ -71,7 +71,7 @@ where
match self.server.set_id(id) {
Ok(_) => Ok(Response::new(UpdateServerIdResponse {})),
Err(e @ Error::SetIdError { .. }) => {
Err(e @ Error::IdAlreadySet) => {
return Err(FieldViolation {
field: "id".to_string(),
description: e.to_string(),
@ -199,15 +199,18 @@ where
&self,
_: Request<ListRemotesRequest>,
) -> Result<Response<ListRemotesResponse>, Status> {
let remotes = self
.server
.remotes_sorted()
.into_iter()
.map(|(id, connection_string)| Remote {
id: id.get_u32(),
connection_string,
})
.collect();
let result = self.server.remotes_sorted();
let remotes = match result {
Ok(remotes) => remotes
.into_iter()
.map(|(id, connection_string)| Remote {
id: id.get_u32(),
connection_string,
})
.collect(),
Err(e) => return Err(default_server_error_handler(e)),
};
Ok(Response::new(ListRemotesResponse { remotes }))
}
@ -221,8 +224,16 @@ where
.ok_or_else(|| FieldViolation::required("remote"))?;
let remote_id = ServerId::try_from(remote.id)
.map_err(|_| FieldViolation::required("id").scope("remote"))?;
self.server
let result = self
.server
.update_remote(remote_id, remote.connection_string);
match result {
Ok(_) => {}
Err(e) => return Err(default_server_error_handler(e)),
}
Ok(Response::new(UpdateRemoteResponse {}))
}
@ -233,9 +244,12 @@ where
let request = request.into_inner();
let remote_id =
ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?;
self.server
.delete_remote(remote_id)
.ok_or_else(NotFound::default)?;
match self.server.delete_remote(remote_id) {
Ok(Some(_)) => {}
Ok(None) => return Err(NotFound::default().into()),
Err(e) => return Err(default_server_error_handler(e)),
}
Ok(Response::new(DeleteRemoteResponse {}))
}
@ -455,7 +469,7 @@ where
let tracker = self
.server
.wipe_preserved_catalog(db_name)
.wipe_preserved_catalog(&db_name)
.map_err(|e| match e {
Error::DatabaseAlreadyExists { db_name } => AlreadyExists {
resource_type: "database".to_string(),


@ -65,6 +65,8 @@ async fn test_list_update_remotes() {
const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321";
const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321";
client.update_server_id(123).await.unwrap();
let res = client.list_remotes().await.expect("list remotes failed");
assert_eq!(res.len(), 0);


@ -244,6 +244,18 @@ async fn test_list_chunks_error() {
async fn test_remotes() {
let server_fixture = ServerFixture::create_single_use().await;
let addr = server_fixture.grpc_base();
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("server")
.arg("set")
.arg("32")
.arg("--host")
.arg(addr)
.assert()
.success()
.stdout(predicate::str::contains("Ok"));
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("server")


@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() {
assert_eq!(chunks[0].row_count, 1_000);
}
async fn write_data(
write_client: &mut influxdb_iox_client::write::Client,
db_name: &str,
num_payloads: u64,
num_duplicates: u64,
payload_size: u64,
) {
let payloads: Vec<_> = (0..num_payloads)
.map(|x| {
(0..payload_size)
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
.join("\n")
})
.collect();
for payload in &payloads {
// Writing the same data multiple times should be compacted away
for _ in 0..=num_duplicates {
let num_lines_written = write_client
.write(db_name, payload)
.await
.expect("successful write");
assert_eq!(num_lines_written, payload_size as usize);
}
}
}
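Worked through with the values used by test_full_lifecycle below (num_payloads = 10, num_duplicates = 1, payload_size = 1_000): each payload is written 1 + num_duplicates = 2 times, so 10 * 2 * 1_000 = 20_000 lines go in, but compaction should leave only the 10 * 1_000 = 10_000 distinct rows in the persisted chunk.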
#[tokio::test]
async fn test_full_lifecycle() {
let fixture = ServerFixture::create_shared().await;
let mut write_client = fixture.write_client();
let num_payloads = 10;
let num_duplicates = 2;
let num_duplicates = 1;
let payload_size = 1_000;
let total_rows = num_payloads * num_duplicates * payload_size;
let total_rows = num_payloads * (1 + num_duplicates) * payload_size;
let db_name = rand_name();
DatabaseBuilder::new(db_name.clone())
@ -73,24 +100,14 @@ async fn test_full_lifecycle() {
.build(fixture.grpc_channel())
.await;
let payloads: Vec<_> = (0..num_payloads)
.map(|x| {
(0..payload_size)
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
.join("\n")
})
.collect();
for payload in &payloads {
// Writing the same data multiple times should be compacted away
for _ in 0..num_duplicates {
let num_lines_written = write_client
.write(&db_name, payload)
.await
.expect("successful write");
assert_eq!(num_lines_written, payload_size as usize);
}
}
write_data(
&mut write_client,
&db_name,
num_payloads,
num_duplicates,
payload_size,
)
.await;
wait_for_exact_chunk_states(
&fixture,
@ -123,6 +140,58 @@ async fn test_full_lifecycle() {
assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize)
}
#[tokio::test]
async fn test_update_late_arrival() {
let fixture = ServerFixture::create_shared().await;
let mut write_client = fixture.write_client();
let payload_size = 100;
let db_name = rand_name();
DatabaseBuilder::new(db_name.clone())
.persist(true)
// Don't close MUB automatically
.mub_row_threshold(payload_size * 2)
.persist_row_threshold(payload_size)
.persist_age_threshold_seconds(1000)
// Initially set to a large value
.late_arrive_window_seconds(1000)
.build(fixture.grpc_channel())
.await;
write_data(&mut write_client, &db_name, 1, 0, payload_size).await;
let mut management = fixture.management_client();
let chunks = management.list_chunks(&db_name).await.unwrap();
assert_eq!(chunks.len(), 1);
assert_eq!(
chunks[0].storage,
influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32
);
let mut rules = management.get_database(&db_name).await.unwrap();
rules
.lifecycle_rules
.as_mut()
.unwrap()
.late_arrive_window_seconds = 1;
fixture
.management_client()
.update_database(rules)
.await
.unwrap();
wait_for_exact_chunk_states(
&fixture,
&db_name,
vec![ChunkStorage::ReadBufferAndObjectStore],
std::time::Duration::from_secs(5),
)
.await;
}
#[tokio::test]
async fn test_query_chunk_after_restart() {
// fixtures

View File

@ -13,7 +13,7 @@ use crate::{
#[derive(Debug)]
pub enum WriteBufferConfig {
Writing(Arc<dyn WriteBufferWriting>),
Reading(Arc<dyn WriteBufferReading>),
Reading(Arc<tokio::sync::Mutex<Box<dyn WriteBufferReading>>>),
}
impl WriteBufferConfig {
@ -36,7 +36,9 @@ impl WriteBufferConfig {
Some(WriteBufferConnection::Reading(conn)) => {
let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?;
Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _)))
Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new(
Box::new(kafka_buffer) as _,
)))))
}
None => Ok(None),
}
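// Illustrative sketch (editor's assumption, not code from this change): the reading side
// is now behind a `tokio::sync::Mutex` because `streams()` and `seek()` take `&mut self`,
// so a consumer task has to lock it first. Assumes the `WriteBufferReading` trait is in
// scope for the method call.
async fn example_consume(config: &WriteBufferConfig) {
    if let WriteBufferConfig::Reading(reading) = config {
        // The guard must be held for as long as the returned streams are used.
        let mut write_buffer = reading.lock().await;
        let _streams = write_buffer.streams();
        // ... hand the streams to the ingest loop while the guard is held ...
    }
}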

View File

@ -1,6 +1,8 @@
use std::fmt::Debug;
use async_trait::async_trait;
use entry::{Entry, Sequence, SequencedEntry};
use futures::stream::BoxStream;
use futures::{future::BoxFuture, stream::BoxStream};
/// Generic boxed error type that is used in this crate.
///
@ -10,7 +12,7 @@ pub type WriteBufferError = Box<dyn std::error::Error + Sync + Send>;
/// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading
/// entries from the Write Buffer at a later time.
#[async_trait]
pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
pub trait WriteBufferWriting: Sync + Send + Debug + 'static {
/// Send an `Entry` to the write buffer using the specified sequencer ID.
///
/// Returns information that can be used to restore entries at a later time.
@ -21,16 +23,47 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
) -> Result<Sequence, WriteBufferError>;
}
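// Illustrative sketch (editor's assumption, not code from this change): how a producer
// side might use `WriteBufferWriting` and the returned `Sequence`; assumes some `writer`
// implementing the trait and a pre-built `Entry`.
async fn example_store(
    writer: &impl WriteBufferWriting,
    entry: &Entry,
) -> Result<Sequence, WriteBufferError> {
    // Sequencer ID 0 is just an example; `Sequence.id` is the sequencer the entry landed
    // on and `Sequence.number` the offset a reader can later `seek` to.
    writer.store_entry(entry, 0).await
}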
pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result<u64, WriteBufferError>>;
pub type FetchHighWatermark<'a> = Box<dyn (Fn() -> FetchHighWatermarkFut<'a>) + Send + Sync>;
/// Output stream of [`WriteBufferReading`].
pub type EntryStream<'a> = BoxStream<'a, Result<SequencedEntry, WriteBufferError>>;
pub struct EntryStream<'a> {
/// Stream that produces entries.
pub stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
/// Get high watermark (= what we believe is the next sequence number to be added).
///
/// Can be used to calculate lag. Note that since the watermark is the "next sequence number to be added", it
/// starts at 0; after the entry with sequence number 0 is added to the buffer, it is 1.
pub fetch_high_watermark: FetchHighWatermark<'a>,
}
impl<'a> Debug for EntryStream<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EntryStream").finish_non_exhaustive()
}
}
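// Illustrative sketch (editor's assumption, not code from this change): using
// `fetch_high_watermark` to compute consumer lag for one sequencer, given the sequence
// number of the last entry that was processed (if any).
async fn example_lag(
    entry_stream: &EntryStream<'_>,
    last_processed: Option<u64>,
) -> Result<u64, WriteBufferError> {
    // The watermark is the next sequence number to be added, so a fully caught-up
    // consumer has processed everything up to `watermark - 1`.
    let watermark = (entry_stream.fetch_high_watermark)().await?;
    let next_to_process = last_processed.map(|n| n + 1).unwrap_or(0);
    Ok(watermark.saturating_sub(next_to_process))
}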
/// Produce streams (one per sequencer) of [`SequencedEntry`]s.
pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static {
#[async_trait]
pub trait WriteBufferReading: Sync + Send + Debug + 'static {
/// Returns a stream per sequencer.
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait;
///
/// Note that due to the mutable borrow, it is not possible to have multiple streams from the same
/// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last
/// offsets of the old streams will be the start offsets for the new streams. To prevent that, either create a new
/// [`WriteBufferReading`] or use [`seek`](Self::seek).
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>;
/// Seek the given sequencer to the given sequence number. The next output of the related stream will be an entry
/// with at least the given sequence number (the exact sequence number might be skipped due to "holes" in the stream).
///
/// Note that due to the mutable borrow, it is not possible to seek while streams exist.
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError>;
}
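// Illustrative sketch (editor's assumption, not code from this change): replaying a
// single sequencer from a known offset. Assumes a concrete reader such as
// `KafkaBufferConsumer` or `MockBufferForReading` and that `futures::StreamExt` is in
// scope for `.next()`.
async fn example_replay(
    reader: &mut impl WriteBufferReading,
    sequencer_id: u32,
    start: u64,
) -> Result<(), WriteBufferError> {
    // Seeking is only possible while no streams are borrowed from `reader`.
    reader.seek(sequencer_id, start).await?;
    for (id, mut entry_stream) in reader.streams() {
        if id != sequencer_id {
            continue;
        }
        // Consume a single entry; a real consumer would loop until caught up.
        if let Some(sequenced_entry) = entry_stream.stream.next().await {
            let _entry = sequenced_entry?;
        }
    }
    Ok(())
}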
pub mod test_utils {
@ -65,6 +98,8 @@ pub mod test_utils {
test_multi_stream_io(&adapter).await;
test_multi_sequencer_io(&adapter).await;
test_multi_writer_multi_reader(&adapter).await;
test_seek(&adapter).await;
test_watermark(&adapter).await;
}
async fn test_single_stream_io<T>(adapter: &T)
@ -78,7 +113,7 @@ pub mod test_utils {
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
@ -88,67 +123,32 @@ pub mod test_utils {
let mut cx = futures::task::Context::from_waker(&waker);
// empty stream is pending
assert!(stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
// adding content allows us to get results
writer.store_entry(&entry_1, sequencer_id).await.unwrap();
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
// stream is pending again
assert!(stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
// adding more data unblocks the stream
writer.store_entry(&entry_2, sequencer_id).await.unwrap();
writer.store_entry(&entry_3, sequencer_id).await.unwrap();
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2);
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// stream is pending again
assert!(stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_sequencer_io<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_1 = lp_to_entry("upc user=1 100");
let entry_2 = lp_to_entry("upc user=2 200");
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
assert_ne!(sequencer_id_1, sequencer_id_2);
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
// entries arrive at the right target stream
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
// streams are pending again
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_stream_io<T>(adapter: &T)
@ -162,34 +162,104 @@ pub mod test_utils {
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut streams_1 = reader.streams();
let mut streams_2 = reader.streams();
assert_eq!(streams_1.len(), 1);
assert_eq!(streams_2.len(), 1);
let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap();
assert_eq!(sequencer_id_1, sequencer_id_2);
let mut reader = context.reading().await;
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_1, 0).await.unwrap();
writer.store_entry(&entry_2, 0).await.unwrap();
writer.store_entry(&entry_3, 0).await.unwrap();
// streams poll from same source
// create a stream, drop it, re-create it => still starts at the first entry
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, stream) = streams.pop().unwrap();
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
// re-creating stream after reading remembers offset
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// re-creating stream after reading everything makes it pending
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_sequencer_io<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_1 = lp_to_entry("upc user=1 100");
let entry_2 = lp_to_entry("upc user=2 200");
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
assert_ne!(sequencer_id_1, sequencer_id_2);
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
// entries arrive at the right target stream
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
writer.store_entry(&entry_2, sequencer_id_1).await.unwrap();
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
assert_eq!(
stream_1.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
// both streams are pending again
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert_eq!(
stream_2.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert_eq!(
stream_1.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// streams are pending again
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_writer_multi_reader<T>(adapter: &T)
@ -204,8 +274,8 @@ pub mod test_utils {
let writer_1 = context.writing();
let writer_2 = context.writing();
let reader_1 = context.reading().await;
let reader_2 = context.reading().await;
let mut reader_1 = context.reading().await;
let mut reader_2 = context.reading().await;
// TODO: do not hard-code sequencer IDs here but provide a proper interface
writer_1.store_entry(&entry_east_1, 0).await.unwrap();
@ -213,18 +283,119 @@ pub mod test_utils {
writer_2.store_entry(&entry_east_2, 0).await.unwrap();
assert_reader_content(
reader_1,
&mut reader_1,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
assert_reader_content(
reader_2,
&mut reader_2,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
}
async fn assert_reader_content<R>(reader: R, expected: &[(u32, &[&Entry])])
async fn test_seek<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
let entry_east_3 = lp_to_entry("upc,region=east user=3 300");
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");
let writer = context.writing();
let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number;
let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number;
let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number;
let mut reader_1 = context.reading().await;
let mut reader_2 = context.reading().await;
// forward seek
reader_1.seek(0, sequence_number_east_2).await.unwrap();
assert_reader_content(
&mut reader_1,
&[(0, &[&entry_east_2]), (1, &[&entry_west_1])],
)
.await;
assert_reader_content(
&mut reader_2,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
// backward seek
reader_1.seek(0, 0).await.unwrap();
assert_reader_content(
&mut reader_1,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[])],
)
.await;
// seek past the end and then add data
reader_1.seek(0, 1_000_000).await.unwrap();
let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number;
let mut streams = reader_1.streams();
assert_eq!(streams.len(), 2);
let (_sequencer_id, mut stream_1) = streams.pop().unwrap();
let (_sequencer_id, mut stream_2) = streams.pop().unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
drop(stream_1);
drop(stream_2);
drop(streams);
// seeking unknown sequencer is NOT an error
reader_1.seek(0, 42).await.unwrap();
}
async fn test_watermark<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");
let writer = context.writing();
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, stream_1) = streams.pop().unwrap();
let (sequencer_id_2, stream_2) = streams.pop().unwrap();
// start at watermark 0
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0);
// the high watermark moves as entries are written
writer
.store_entry(&entry_east_1, sequencer_id_1)
.await
.unwrap();
let mark_1 = writer
.store_entry(&entry_east_2, sequencer_id_1)
.await
.unwrap()
.number;
let mark_2 = writer
.store_entry(&entry_west_1, sequencer_id_2)
.await
.unwrap()
.number;
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1);
}
async fn assert_reader_content<R>(reader: &mut R, expected: &[(u32, &[&Entry])])
where
R: WriteBufferReading,
{
@ -239,6 +410,7 @@ pub mod test_utils {
// we need to limit the stream to `expected_entries.len()` elements, otherwise it might be pending forever
let mut results: Vec<_> = actual_stream
.stream
.take(expected_entries.len())
.try_collect()
.await

View File

@ -1,22 +1,28 @@
use std::{
collections::BTreeMap,
convert::{TryFrom, TryInto},
sync::Arc,
time::Duration,
};
use async_trait::async_trait;
use data_types::server_id::ServerId;
use entry::{Entry, Sequence, SequencedEntry};
use futures::StreamExt;
use futures::{FutureExt, StreamExt};
use observability_deps::tracing::{debug, info};
use rdkafka::{
consumer::{BaseConsumer, Consumer, StreamConsumer},
error::KafkaError,
producer::{FutureProducer, FutureRecord},
types::RDKafkaErrorCode,
util::Timeout,
ClientConfig, Message, TopicPartitionList,
ClientConfig, Message, Offset, TopicPartitionList,
};
use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};
pub struct KafkaBufferProducer {
conn: String,
@ -77,8 +83,8 @@ impl KafkaBufferProducer {
let mut cfg = ClientConfig::new();
cfg.set("bootstrap.servers", &conn);
cfg.set("message.timeout.ms", "5000");
cfg.set("message.max.bytes", "10000000");
cfg.set("queue.buffering.max.kbytes", "10485760");
cfg.set("message.max.bytes", "31457280");
cfg.set("queue.buffering.max.kbytes", "31457280");
cfg.set("request.required.acks", "all"); // equivalent to acks=-1
let producer: FutureProducer = cfg.create()?;
@ -94,7 +100,7 @@ impl KafkaBufferProducer {
pub struct KafkaBufferConsumer {
conn: String,
database_name: String,
consumers: Vec<(u32, StreamConsumer)>,
consumers: BTreeMap<u32, Arc<StreamConsumer>>,
}
// Needed because rdkafka's StreamConsumer doesn't impl Debug
@ -107,31 +113,94 @@ impl std::fmt::Debug for KafkaBufferConsumer {
}
}
#[async_trait]
impl WriteBufferReading for KafkaBufferConsumer {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
self.consumers
.iter()
.map(|(sequencer_id, consumer)| {
let stream = consumer
.stream()
.map(|message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let mut streams = vec![];
Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
for (sequencer_id, consumer) in &self.consumers {
let sequencer_id = *sequencer_id;
let consumer_cloned = Arc::clone(consumer);
let database_name = self.database_name.clone();
let stream = consumer
.stream()
.map(move |message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};
Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
})
.boxed();
let fetch_high_watermark = move || {
let consumer_cloned = Arc::clone(&consumer_cloned);
let database_name = database_name.clone();
let fut = async move {
match tokio::task::spawn_blocking(move || {
consumer_cloned.fetch_watermarks(
&database_name,
sequencer_id as i32,
Duration::from_secs(60),
)
})
.boxed();
(*sequencer_id, stream)
.await
.expect("subtask failed")
{
Ok((_low, high)) => Ok(high as u64),
Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0),
Err(e) => Err(Box::new(e) as Box<dyn std::error::Error + Send + Sync>),
}
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}
streams
}
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
if let Some(consumer) = self.consumers.get(&sequencer_id) {
let consumer = Arc::clone(consumer);
let database_name = self.database_name.clone();
let offset = if sequence_number > 0 {
Offset::Offset(sequence_number as i64)
} else {
Offset::Beginning
};
tokio::task::spawn_blocking(move || {
consumer.seek(
&database_name,
sequencer_id as i32,
offset,
Duration::from_secs(60),
)
})
.collect()
.await
.expect("subtask failed")?;
}
Ok(())
}
}
@ -169,11 +238,21 @@ impl KafkaBufferConsumer {
let mut assignment = TopicPartitionList::new();
assignment.add_partition(&database_name, partition as i32);
consumer.assign(&assignment)?;
Ok((partition, consumer))
// We must set the offset to `Beginning` here to avoid the following error during seek:
// KafkaError (Seek error: Local: Erroneous state)
//
// Also see:
// - https://github.com/Blizzard/node-rdkafka/issues/237
// - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376
assignment
.set_partition_offset(&database_name, partition as i32, Offset::Beginning)
.expect("partition was set just before");
consumer.assign(&assignment)?;
Ok((partition, Arc::new(consumer)))
})
.collect::<Result<Vec<(u32, StreamConsumer)>, KafkaError>>()?;
.collect::<Result<BTreeMap<u32, Arc<StreamConsumer>>, KafkaError>>()?;
Ok(Self {
conn,

View File

@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll};
use async_trait::async_trait;
use entry::{Entry, Sequence, SequencedEntry};
use futures::{stream, StreamExt};
use futures::{stream, FutureExt, StreamExt};
use parking_lot::Mutex;
use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};
type EntryResVec = Vec<Result<SequencedEntry, WriteBufferError>>;
@ -153,21 +156,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors {
}
}
/// Sequencer-specific playback state
struct PlaybackState {
/// Index within the entry vector.
vector_index: usize,
/// Offset within the sequencer IDs.
offset: u64,
}
pub struct MockBufferForReading {
state: MockBufferSharedState,
positions: Arc<Mutex<BTreeMap<u32, usize>>>,
shared_state: MockBufferSharedState,
playback_states: Arc<Mutex<BTreeMap<u32, PlaybackState>>>,
}
impl MockBufferForReading {
pub fn new(state: MockBufferSharedState) -> Self {
let n_sequencers = state.entries.lock().len() as u32;
let positions: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| (sequencer_id, 0))
let playback_states: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| {
(
sequencer_id,
PlaybackState {
vector_index: 0,
offset: 0,
},
)
})
.collect();
Self {
state,
positions: Arc::new(Mutex::new(positions)),
shared_state: state,
playback_states: Arc::new(Mutex::new(playback_states)),
}
}
}
@ -178,46 +198,110 @@ impl std::fmt::Debug for MockBufferForReading {
}
}
#[async_trait]
impl WriteBufferReading for MockBufferForReading {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let sequencer_ids: Vec<_> = {
let positions = self.positions.lock();
positions.keys().copied().collect()
let playback_states = self.playback_states.lock();
playback_states.keys().copied().collect()
};
let mut streams = vec![];
for sequencer_id in sequencer_ids {
let state = self.state.clone();
let positions = Arc::clone(&self.positions);
let shared_state = self.shared_state.clone();
let playback_states = Arc::clone(&self.playback_states);
let stream = stream::poll_fn(move |_ctx| {
let entries = state.entries.lock();
let mut positions = positions.lock();
let entries = shared_state.entries.lock();
let mut playback_states = playback_states.lock();
let entry_vec = entries.get(&sequencer_id).unwrap();
let position = positions.get_mut(&sequencer_id).unwrap();
let playback_state = playback_states.get_mut(&sequencer_id).unwrap();
if entry_vec.len() > *position {
let entry = match &entry_vec[*position] {
Ok(entry) => Ok(entry.clone()),
Err(e) => Err(e.to_string().into()),
};
*position += 1;
return Poll::Ready(Some(entry));
while entry_vec.len() > playback_state.vector_index {
let entry_result = &entry_vec[playback_state.vector_index];
// consume entry
playback_state.vector_index += 1;
match entry_result {
Ok(entry) => {
// found an entry => need to check if it is within the offset
let sequence = entry.sequence().unwrap();
if sequence.number >= playback_state.offset {
// within offset => return entry to caller
return Poll::Ready(Some(Ok(entry.clone())));
} else {
// offset is larger than the current entry => ignore entry and try next
continue;
}
}
Err(e) => {
// found an error => return it to the caller
return Poll::Ready(Some(Err(e.to_string().into())));
}
}
}
// we are at the end of the recorded entries => report pending
Poll::Pending
})
.boxed();
streams.push((sequencer_id, stream));
let shared_state = self.shared_state.clone();
let fetch_high_watermark = move || {
let shared_state = shared_state.clone();
let fut = async move {
let entries = shared_state.entries.lock();
let entry_vec = entries.get(&sequencer_id).unwrap();
let watermark = entry_vec
.iter()
.filter_map(|entry_res| {
entry_res
.as_ref()
.ok()
.map(|entry| entry.sequence().unwrap().number)
})
.max()
.map(|n| n + 1)
.unwrap_or(0);
Ok(watermark)
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}
streams
}
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let mut playback_states = self.playback_states.lock();
if let Some(playback_state) = playback_states.get_mut(&sequencer_id) {
playback_state.offset = sequence_number;
// reset position to start since seeking might go backwards
playback_state.vector_index = 0;
}
Ok(())
}
}
#[cfg(test)]