From 449ba46b22250c3cd23ac16a5cc493c4781654b2 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 16 Jul 2021 15:56:38 -0400 Subject: [PATCH 01/27] refactor: Make more use of SNAFU's context methods and ensure! macro --- internal_types/src/schema.rs | 8 +-- server/src/config.rs | 108 +++++++++++++++++------------------ server/src/db/catalog.rs | 21 +++---- server/src/db/chunk.rs | 4 +- 4 files changed, 66 insertions(+), 75 deletions(-) diff --git a/internal_types/src/schema.rs b/internal_types/src/schema.rs index 2afb0cede3..4427540576 100644 --- a/internal_types/src/schema.rs +++ b/internal_types/src/schema.rs @@ -11,7 +11,7 @@ use arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use crate::{ schema::sort::{ColumnSort, SortKey}, @@ -395,11 +395,9 @@ impl Schema { pub fn compute_select_indicies(&self, columns: &[&str]) -> Result> { columns .iter() - .map(|column_name| { + .map(|&column_name| { self.find_index_of(column_name) - .ok_or_else(|| Error::ColumnNotFound { - column_name: column_name.to_string(), - }) + .context(ColumnNotFound { column_name }) }) .collect() } diff --git a/server/src/config.rs b/server/src/config.rs index 3fb35941ae..4bfe453ab7 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -16,9 +16,12 @@ use write_buffer::config::WriteBufferConfig; /// This module contains code for managing the configuration of the server. use crate::{ db::{catalog::Catalog, DatabaseToCommit, Db}, - Error, JobRegistry, Result, + DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error, + InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch, + ServerShuttingDown, }; use observability_deps::tracing::{self, error, info, warn, Instrument}; +use snafu::{ensure, OptionExt}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; @@ -85,16 +88,14 @@ impl Config { db_name: DatabaseName<'static>, ) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(DatabaseHandle { @@ -119,28 +120,23 @@ impl Config { /// without initializing it, see [`block_db`](Self::block_db). 
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); - let db_state = - state - .databases - .get(&db_name) - .cloned() - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + let db_state = state + .databases + .get(&db_name) + .cloned() + .context(DatabaseNotFound { db_name: &db_name })?; - if db_state.is_initialized() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !db_state.is_initialized(), + DatabaseAlreadyExists { db_name } + ); - state.reservations.insert(db_name.clone()); + state.reservations.insert(db_name); Ok(DatabaseHandle { state: Some(db_state), config: &self, @@ -159,16 +155,14 @@ impl Config { db_name: DatabaseName<'static>, ) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(BlockDatabaseGuard { @@ -228,9 +222,7 @@ impl Config { // TODO: implement for non-initialized databases let db = self .db_initialized(db_name) - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + .context(DatabaseNotFound { db_name })?; let mut rules = db.rules.write(); *rules = update(rules.clone()).map_err(UpdateError::Closure)?; @@ -600,12 +592,13 @@ impl<'a> DatabaseHandle<'a> { server_id, db_name, } => { - if db_name != &rules.name { - return Err(Error::RulesDatabaseNameMismatch { - actual: rules.name.to_string(), - expected: db_name.to_string(), - }); - } + ensure!( + db_name == &rules.name, + RulesDatabaseNameMismatch { + actual: rules.name, + expected: db_name, + } + ); self.state = Some(Arc::new(DatabaseState::RulesLoaded { object_store: Arc::clone(&object_store), @@ -616,10 +609,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Known, - }), + } + .fail(), } } @@ -652,10 +646,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::RulesLoaded, - }), + } + .fail(), } } @@ -665,7 +660,7 @@ impl<'a> DatabaseHandle<'a> { DatabaseState::Replay { db } => { if self.config.shutdown.is_cancelled() { error!("server is shutting down"); - return Err(Error::ServerShuttingDown); + return ServerShuttingDown.fail(); } let shutdown = self.config.shutdown.child_token(); @@ -688,10 +683,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Replay, - }), + } + .fail(), } } } diff --git a/server/src/db/catalog.rs b/server/src/db/catalog.rs index 6a06b06731..c2953a3de6 100644 --- 
a/server/src/db/catalog.rs +++ b/server/src/db/catalog.rs @@ -8,7 +8,7 @@ use data_types::chunk_metadata::ChunkSummary; use data_types::chunk_metadata::DetailedChunkSummary; use data_types::partition_metadata::{PartitionSummary, TableSummary}; use internal_types::schema::Schema; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; use self::chunk::CatalogChunk; @@ -135,11 +135,8 @@ impl Catalog { /// Get a specific table by name, returning `None` if there is no such table pub fn table(&self, table_name: impl AsRef) -> Result> { let table_name = table_name.as_ref(); - RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err( - |_| Error::TableNotFound { - table: table_name.to_string(), - }, - ) + RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)) + .map_err(|_| TableNotFound { table: table_name }.build()) } /// Get a specific partition by name, returning an error if it can't be found @@ -154,9 +151,9 @@ impl Catalog { self.table(table_name)? .partition(partition_key) .cloned() - .ok_or_else(|| Error::PartitionNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(PartitionNotFound { + partition: partition_key, + table: table_name, }) } @@ -174,9 +171,9 @@ impl Catalog { .read() .chunk(chunk_id) .cloned() - .ok_or_else(|| Error::ChunkNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(ChunkNotFound { + partition: partition_key, + table: table_name, chunk_id, }) } diff --git a/server/src/db/chunk.rs b/server/src/db/chunk.rs index 997030cb08..e4ba6870cd 100644 --- a/server/src/db/chunk.rs +++ b/server/src/db/chunk.rs @@ -5,7 +5,7 @@ use std::{ use data_types::partition_metadata; use partition_metadata::TableSummary; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; @@ -417,7 +417,7 @@ impl QueryChunk for DbChunk { // column out to get the set of values. 
let values = values .remove(column_name) - .ok_or_else(|| Error::ReadBufferError { + .with_context(|| ReadBufferError { chunk_id: self.id(), msg: format!( "failed to find column_name {:?} in results of tag_values", From 1c16988a51a3ce720d5d2aaea58b7360e797e4b3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jul 2021 14:09:06 -0400 Subject: [PATCH 02/27] chore: Update datafusion references (#2056) --- Cargo.lock | 2 +- datafusion/Cargo.toml | 2 +- query/src/exec.rs | 19 +- query/src/exec/context.rs | 66 ++-- query/src/frontend/sql.rs | 2 +- query_tests/cases/in/duplicates.expected | 171 +++++----- query_tests/cases/in/duplicates.sql | 6 +- query_tests/cases/in/pushdown.expected | 383 ++++++++++------------- query_tests/cases/in/pushdown.sql | 26 +- query_tests/src/runner.rs | 14 +- 10 files changed, 349 insertions(+), 342 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8474d61947..ec611883bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -843,7 +843,7 @@ dependencies = [ [[package]] name = "datafusion" version = "4.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=3fb600df48ab1e53903b1a9bb12ebde33ad0856b#3fb600df48ab1e53903b1a9bb12ebde33ad0856b" dependencies = [ "ahash 0.7.4", "arrow", diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 64a9e97e69..f3a735d307 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version" # Rename to workaround doctest bug # Turn off optional datafusion features (function packages) -upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" } +upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="3fb600df48ab1e53903b1a9bb12ebde33ad0856b", default-features = false, package = "datafusion" } diff --git a/query/src/exec.rs b/query/src/exec.rs index ce5e085581..e5b7034c75 100644 --- a/query/src/exec.rs +++ b/query/src/exec.rs @@ -39,6 +39,7 @@ use crate::plan::{ }; use self::{ + context::IOxExecutionConfig, split::StreamSplitNode, task::{DedicatedExecutor, Error as ExecutorError}, }; @@ -111,6 +112,9 @@ pub struct Executor { /// Executor for running system/reorganization tasks such as /// compact reorg_exec: DedicatedExecutor, + + /// The default configuration options with which to create contexts + config: IOxExecutionConfig, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -128,12 +132,25 @@ impl Executor { let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads); let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads); + let config = IOxExecutionConfig::new(); + Self { query_exec, reorg_exec, + config, } } + /// returns the config of this executor + pub fn config(&self) -> &IOxExecutionConfig { + &self.config + } + + /// returns a mutable reference to this executor's config + pub fn config_mut(&mut self) -> &mut IOxExecutionConfig { + &mut self.config + } + /// Executes this plan on the query pool, and returns the /// resulting set of strings pub async fn to_string_set(&self, plan: StringSetPlan) -> Result { @@ -289,7 +306,7 @@ impl Executor { pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext { let executor = self.executor(executor_type).clone(); - 
IOxExecutionContext::new(executor) + IOxExecutionContext::new(executor, self.config.clone()) } /// Return the execution pool of the specified type diff --git a/query/src/exec/context.rs b/query/src/exec/context.rs index 702e032bac..ffad4541c5 100644 --- a/query/src/exec/context.rs +++ b/query/src/exec/context.rs @@ -5,6 +5,7 @@ use std::{fmt, sync::Arc}; use arrow::record_batch::RecordBatch; use datafusion::{ + catalog::catalog::CatalogProvider, execution::context::{ExecutionContextState, QueryPlanner}, logical_plan::{LogicalPlan, UserDefinedLogicalNode}, physical_plan::{ @@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner { } } +// Configuration for an IOx execution context +#[derive(Clone)] +pub struct IOxExecutionConfig { + /// Configuration options to pass to DataFusion + inner: ExecutionConfig, +} + +impl Default for IOxExecutionConfig { + fn default() -> Self { + const BATCH_SIZE: usize = 1000; + + // Setup default configuration + let inner = ExecutionConfig::new() + .with_batch_size(BATCH_SIZE) + .create_default_catalog_and_schema(true) + .with_information_schema(true) + .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + .with_query_planner(Arc::new(IOxQueryPlanner {})); + + Self { inner } + } +} + +impl fmt::Debug for IOxExecutionConfig { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "IOxExecutionConfig ...") + } +} + +impl IOxExecutionConfig { + pub fn new() -> Self { + Default::default() + } + + /// Set execution concurrency + pub fn set_concurrency(&mut self, concurrency: usize) { + self.inner.concurrency = concurrency; + } +} + /// This is an execution context for planning in IOx. It wraps a /// DataFusion execution context with the information needed for planning. /// @@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext { impl IOxExecutionContext { /// Create an ExecutionContext suitable for executing DataFusion plans - /// - /// The config is created with a default catalog and schema, but this - /// can be overridden at a later date - pub fn new(exec: DedicatedExecutor) -> Self { - const BATCH_SIZE: usize = 1000; - - // TBD: Should we be reusing an execution context across all executions? - let config = ExecutionConfig::new() - .with_batch_size(BATCH_SIZE) - .create_default_catalog_and_schema(true) - .with_information_schema(true) - .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) - .with_query_planner(Arc::new(IOxQueryPlanner {})); - - let inner = ExecutionContext::with_config(config); + pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self { + let inner = ExecutionContext::with_config(config.inner); Self { inner, exec } } @@ -160,11 +188,13 @@ impl IOxExecutionContext { &self.inner } - /// returns a mutable reference to the inner datafusion execution context - pub fn inner_mut(&mut self) -> &mut ExecutionContext { - &mut self.inner + /// registers a catalog with the inner context + pub fn register_catalog(&mut self, name: impl Into, catalog: Arc) { + self.inner.register_catalog(name, catalog); } + /// + /// Prepare a SQL statement for execution. 
This assumes that any /// tables referenced in the SQL have been registered with this context pub fn prepare_sql(&mut self, sql: &str) -> Result> { diff --git a/query/src/frontend/sql.rs b/query/src/frontend/sql.rs index be737fd4d6..c08ad7c671 100644 --- a/query/src/frontend/sql.rs +++ b/query/src/frontend/sql.rs @@ -87,7 +87,7 @@ impl SqlQueryPlanner { executor: &Executor, ) -> Result> { let mut ctx = executor.new_context(ExecutorType::Query); - ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database); + ctx.register_catalog(DEFAULT_CATALOG, database); ctx.prepare_sql(query).context(Preparing) } } diff --git a/query_tests/cases/in/duplicates.expected b/query_tests/cases/in/duplicates.expected index 97ba6e6001..d83e04b0dc 100644 --- a/query_tests/cases/in/duplicates.expected +++ b/query_tests/cases/in/duplicates.expected @@ -1,86 +1,87 @@ -- Test Setup: OneMeasurementThreeChunksWithDuplicates --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | -| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | 
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; -+-----------------------------------------+-------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------+ -| logical_plan | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=None | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| logical_plan after simplify_expressions | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| physical_plan | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@1 as state] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | ProjectionExec: expr=[city@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[city@0 as city] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | 
IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------+ +-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | +| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | +| | CoalescePartitionsExec | +| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | 
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; ++---------------+-----------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------+ +| logical_plan | Union | +| | Projection: #h2o.state AS name | +| | TableScan: h2o projection=Some([4]) | +| | Projection: #h2o.city AS name | +| | TableScan: h2o projection=Some([1]) | +| physical_plan | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@1 as state] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | ProjectionExec: expr=[city@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[city@0 as city] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/duplicates.sql b/query_tests/cases/in/duplicates.sql index c9e0159199..5ef261c75e 100644 --- a/query_tests/cases/in/duplicates.sql +++ b/query_tests/cases/in/duplicates.sql @@ -2,11 +2,11 @@ -- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates -- Plan with order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; +explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -- plan without order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o; +EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; -- Union plan -EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; +EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; diff --git a/query_tests/cases/in/pushdown.expected b/query_tests/cases/in/pushdown.expected index 
7173930313..dd58342994 100644 --- a/query_tests/cases/in/pushdown.expected +++ b/query_tests/cases/in/pushdown.expected @@ -1,218 +1,167 @@ -- Test Setup: TwoMeasurementsPredicatePushDown --- SQL: EXPLAIN VERBOSE SELECT * from restaurant; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: 
#restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Float64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | -| | IOxReadFilterNode: 
table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And 
#restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, 
#restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, 
#restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); 
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Float64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Float64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq 
Utf8("tewsbury") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; 
++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/pushdown.sql b/query_tests/cases/in/pushdown.sql index 6ccf718c32..e97c34ef85 100644 --- a/query_tests/cases/in/pushdown.sql +++ b/query_tests/cases/in/pushdown.sql @@ -2,44 +2,44 @@ -- IOX_SETUP: TwoMeasurementsPredicatePushDown -- Test 1: Select everything -EXPLAIN VERBOSE SELECT * from restaurant; +EXPLAIN SELECT * from restaurant; -- Test 2: One push-down expression: count > 200 -- TODO: Make push-down predicates shown in explain verbose. 
Ticket #1538 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200; +EXPLAIN SELECT * from restaurant where count > 200; -- Test 2.2: One push-down expression: count > 200.0 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; +EXPLAIN SELECT * from restaurant where count > 200.0; -- Test 2.3: One push-down expression: system > 4.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; +EXPLAIN SELECT * from restaurant where system > 4.0; -- Test 3: Two push-down expression: count > 200 and town != 'tewsbury' -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; -- Test 4: Still two push-down expression: count > 200 and town != 'tewsbury' -- even though the results are different -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -- Test 5: three push-down expression: count > 200 and town != 'tewsbury' and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -- Test 6: two push-down expression: count > 200 and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; -- Test 7: two push-down expression on float: system > 4.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; -- Test 8: two push-down expression on float: system > 5.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; -- Test 9: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; +EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -- Test 10: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); +EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -- Test 11: four push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 and -- time > to_timestamp('1970-01-01T00:00:00.000000120+00:00') rewritten to time GT INT(130) -EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); +EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); diff --git a/query_tests/src/runner.rs b/query_tests/src/runner.rs index 6edf05ce56..7abbf40403 100644 --- a/query_tests/src/runner.rs +++ b/query_tests/src/runner.rs @@ -4,12 +4,16 @@ mod parse; mod setup; use 
arrow::record_batch::RecordBatch; -use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner}; +use query::{ + exec::{Executor, ExecutorType}, + frontend::sql::SqlQueryPlanner, +}; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ io::LineWriter, io::Write, path::{Path, PathBuf}, + sync::Arc, }; use self::{parse::TestQueries, setup::TestSetup}; @@ -261,7 +265,13 @@ impl Runner { writeln!(self.log, "Running scenario '{}'", scenario_name)?; writeln!(self.log, "SQL: '{:#?}'", sql)?; let planner = SqlQueryPlanner::default(); - let executor = db.executor(); + let num_threads = 1; + let mut executor = Executor::new(num_threads); + + // hardcode concurrency in tests as by default is is the + // number of cores, which varies across machines + executor.config_mut().set_concurrency(4); + let executor = Arc::new(executor); let physical_plan = planner .query(db, &sql, executor.as_ref()) From 45ff5e214c8df63c3aeabab994cdbe554c0dbe56 Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Mon, 19 Jul 2021 18:41:11 +0200 Subject: [PATCH 03/27] feat(iox): Make max http request size configurable --- src/commands/run.rs | 8 +++++ src/influxdb_ioxd.rs | 10 ++++++- src/influxdb_ioxd/http.rs | 63 ++++++++++++++++++++++++++++----------- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/commands/run.rs b/src/commands/run.rs index cab9cadb4d..878d98af97 100644 --- a/src/commands/run.rs +++ b/src/commands/run.rs @@ -231,6 +231,14 @@ Possible values (case insensitive): default_value = "serving" )] pub initial_serving_state: ServingReadinessState, + + /// Maximum size of HTTP requests. + #[structopt( + long = "--max-http-request-size", + env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE", + default_value = "10485760" // 10 MiB + )] + pub max_http_request_size: usize, } pub async fn command(config: Config) -> Result<()> { diff --git a/src/influxdb_ioxd.rs b/src/influxdb_ioxd.rs index 696ad257cb..c871bc30c5 100644 --- a/src/influxdb_ioxd.rs +++ b/src/influxdb_ioxd.rs @@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> { let bind_addr = config.http_bind_address; let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?; - let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse(); + let max_http_request_size = config.max_http_request_size; + + let http_server = http::serve( + addr, + Arc::clone(&app_server), + frontend_shutdown.clone(), + max_http_request_size, + ) + .fuse(); info!(bind_address=?bind_addr, "HTTP server listening"); info!(git_hash, "InfluxDB IOx server ready"); diff --git a/src/influxdb_ioxd/http.rs b/src/influxdb_ioxd/http.rs index f8dab0bd8c..818ba09e74 100644 --- a/src/influxdb_ioxd/http.rs +++ b/src/influxdb_ioxd/http.rs @@ -342,12 +342,26 @@ impl ApplicationError { } } -const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB - -fn router(server: Arc>) -> Router +struct Server where M: ConnectionManager + Send + Sync + Debug + 'static, { + app_server: Arc>, + max_request_size: usize, +} + +fn router( + app_server: Arc>, + max_request_size: usize, +) -> Router +where + M: ConnectionManager + Send + Sync + Debug + 'static, +{ + let server = Server { + app_server, + max_request_size, + }; + // Create a router and specify the the handlers. Router::builder() .data(server) @@ -408,7 +422,7 @@ struct WriteInfo { /// Parse the request's body into raw bytes, applying size limits and /// content encoding as needed. 
-async fn parse_body(req: hyper::Request) -> Result { +async fn parse_body(req: hyper::Request, max_size: usize) -> Result { // clippy says the const needs to be assigned to a local variable: // error: a `const` item with interior mutability should not be borrowed let header_name = CONTENT_ENCODING; @@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request) -> Result MAX_SIZE { + if (body.len() + chunk.len()) > max_size { return Err(ApplicationError::RequestSizeExceeded { - max_body_size: MAX_SIZE, + max_body_size: max_size, }); } body.extend_from_slice(&chunk); @@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request) -> Result>>().expect("server state")); + let Server { + app_server: server, + max_request_size, + } = req.data::>().expect("server state"); + let max_request_size = *max_request_size; + let server = Arc::clone(&server); // TODO(edd): figure out best way of catching all errors in this observation. let obs = server.metrics.http_requests.observation(); // instrument request @@ -481,7 +500,7 @@ where let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket) .context(BucketMappingError)?; - let body = parse_body(req).await?; + let body = parse_body(req, max_request_size).await?; let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?; @@ -595,7 +614,7 @@ async fn query( req: Request, ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO(edd): figure out best way of catching all errors in this observation. let obs = server.metrics.http_requests.observation(); // instrument request @@ -661,7 +680,7 @@ async fn query( async fn health( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -677,7 +696,7 @@ async fn health( async fn handle_metrics( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -700,7 +719,7 @@ async fn list_partitions( ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO - catch error conditions let obs = server.metrics.http_requests.observation(); @@ -841,11 +860,12 @@ pub async fn serve( addr: AddrIncoming, server: Arc>, shutdown: CancellationToken, + max_request_size: usize, ) -> Result<(), hyper::Error> where M: ConnectionManager + Send + Sync + Debug + 'static, { - let router = router(server); + let router = router(server, max_request_size); let service = RouterService::new(router).unwrap(); hyper::Server::builder(addr) @@ -1234,6 +1254,8 @@ mod tests { .await; } + const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024; + #[tokio::test] async fn client_hangup_during_parse() { #[derive(Debug, Snafu)] @@ -1253,7 +1275,9 @@ mod tests { .body(body) .unwrap(); - let parse_result = parse_body(request).await.unwrap_err(); + let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE) + .await + .unwrap_err(); assert_eq!( parse_result.to_string(), "Client hung up while sending body: error reading a body from 
connection: Blarg Error" @@ -1334,7 +1358,12 @@ mod tests { let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server"); let server_url = format!("http://{}", addr.local_addr()); - tokio::task::spawn(serve(addr, server, CancellationToken::new())); + tokio::task::spawn(serve( + addr, + server, + CancellationToken::new(), + TEST_MAX_REQUEST_SIZE, + )); println!("Started server at {}", server_url); server_url } From 38f4eec20e73050d8013a5c50258caf5fa81b106 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 19 Jul 2021 19:04:58 +0200 Subject: [PATCH 04/27] feat: implement `seek` for write buffer This is required to control replay ranges. --- write_buffer/src/core.rs | 70 ++++++++++++++++++++++++++--- write_buffer/src/kafka.rs | 54 +++++++++++++++++----- write_buffer/src/mock.rs | 94 ++++++++++++++++++++++++++++----------- 3 files changed, 176 insertions(+), 42 deletions(-) diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index f747fde37b..27c7512884 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -25,12 +25,18 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { pub type EntryStream<'a> = BoxStream<'a, Result>; /// Produce streams (one per sequencer) of [`SequencedEntry`]s. +#[async_trait] pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static { /// Returns a stream per sequencer. - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait; + /// + /// Calling this method multiple times returns multiple streams that share the same state, i.e. entries for a + /// specific sequencer will only be deliver on on of the streams (likely the first that is polled). If you need + /// independent streams, create multiple [`WriteBufferReading`] objects. + fn streams(&self) -> Vec<(u32, EntryStream<'_>)>; + + /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least + /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). 
+ async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; } pub mod test_utils { @@ -65,6 +71,7 @@ pub mod test_utils { test_multi_stream_io(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; + test_seek(&adapter).await; } async fn test_single_stream_io(adapter: &T) @@ -213,18 +220,67 @@ pub mod test_utils { writer_2.store_entry(&entry_east_2, 0).await.unwrap(); assert_reader_content( - reader_1, + &reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; assert_reader_content( - reader_2, + &reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; } - async fn assert_reader_content(reader: R, expected: &[(u32, &[&Entry])]) + async fn test_seek(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_east_3 = lp_to_entry("upc,region=east user=3 300"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number; + let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number; + let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number; + + let reader_1 = context.reading().await; + let reader_2 = context.reading().await; + + // forward seek + reader_1.seek(0, sequence_number_east_2).await.unwrap(); + assert_reader_content(&reader_1, &[(0, &[&entry_east_2]), (1, &[&entry_west_1])]).await; + assert_reader_content( + &reader_2, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + + // backward seek + reader_1.seek(0, 0).await.unwrap(); + assert_reader_content(&reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])]).await; + + // seek to far end and then at data + reader_1.seek(0, 1_000_000).await.unwrap(); + let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; + let mut streams = reader_1.streams(); + assert_eq!(streams.len(), 2); + let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); + let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); + assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + + // seeking unknown sequencer is NOT an error + reader_1.seek(0, 42).await.unwrap(); + } + + async fn assert_reader_content(reader: &R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, { diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 15a27a401c..0624a06d18 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -1,5 +1,7 @@ use std::{ + collections::BTreeMap, convert::{TryFrom, TryInto}, + sync::Arc, time::Duration, }; @@ -13,7 +15,7 @@ use rdkafka::{ error::KafkaError, producer::{FutureProducer, FutureRecord}, util::Timeout, - ClientConfig, Message, TopicPartitionList, + ClientConfig, Message, Offset, TopicPartitionList, }; use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; @@ -94,7 +96,7 @@ impl KafkaBufferProducer { pub struct KafkaBufferConsumer { conn: String, database_name: String, - consumers: Vec<(u32, StreamConsumer)>, + consumers: BTreeMap>, } // Needed 
because rdkafka's StreamConsumer doesn't impl Debug @@ -107,12 +109,9 @@ impl std::fmt::Debug for KafkaBufferConsumer { } } +#[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { + fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { self.consumers .iter() .map(|(sequencer_id, consumer)| { @@ -133,6 +132,31 @@ impl WriteBufferReading for KafkaBufferConsumer { }) .collect() } + + async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + if let Some(consumer) = self.consumers.get(&sequencer_id) { + let consumer = Arc::clone(consumer); + let database_name = self.database_name.clone(); + let offset = if sequence_number > 0 { + Offset::Offset(sequence_number as i64) + } else { + Offset::Beginning + }; + + tokio::task::spawn_blocking(move || { + consumer.seek( + &database_name, + sequencer_id as i32, + offset, + Duration::from_secs(60), + ) + }) + .await + .expect("subtask failed")?; + } + + Ok(()) + } } impl KafkaBufferConsumer { @@ -169,11 +193,21 @@ impl KafkaBufferConsumer { let mut assignment = TopicPartitionList::new(); assignment.add_partition(&database_name, partition as i32); - consumer.assign(&assignment)?; - Ok((partition, consumer)) + // We must set the offset to `Beginning` here to avoid the following error during seek: + // KafkaError (Seek error: Local: Erroneous state) + // + // Also see: + // - https://github.com/Blizzard/node-rdkafka/issues/237 + // - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376 + assignment + .set_partition_offset(&database_name, partition as i32, Offset::Beginning) + .expect("partition was set just before"); + + consumer.assign(&assignment)?; + Ok((partition, Arc::new(consumer))) }) - .collect::, KafkaError>>()?; + .collect::>, KafkaError>>()?; Ok(Self { conn, diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index fc15ca2534..cb4322199a 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -153,21 +153,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors { } } +/// Sequencer-specific playback state +struct PlaybackState { + /// Index within the entry vector. + vector_index: usize, + + /// Offset within the sequencer IDs. 
+ offset: u64, +} + pub struct MockBufferForReading { - state: MockBufferSharedState, - positions: Arc>>, + shared_state: MockBufferSharedState, + playback_states: Arc>>, } impl MockBufferForReading { pub fn new(state: MockBufferSharedState) -> Self { let n_sequencers = state.entries.lock().len() as u32; - let positions: BTreeMap<_, _> = (0..n_sequencers) - .map(|sequencer_id| (sequencer_id, 0)) + let playback_states: BTreeMap<_, _> = (0..n_sequencers) + .map(|sequencer_id| { + ( + sequencer_id, + PlaybackState { + vector_index: 0, + offset: 0, + }, + ) + }) .collect(); Self { - state, - positions: Arc::new(Mutex::new(positions)), + shared_state: state, + playback_states: Arc::new(Mutex::new(playback_states)), } } } @@ -178,38 +195,52 @@ impl std::fmt::Debug for MockBufferForReading { } } +#[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { + fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { let sequencer_ids: Vec<_> = { - let positions = self.positions.lock(); - positions.keys().copied().collect() + let playback_states = self.playback_states.lock(); + playback_states.keys().copied().collect() }; let mut streams = vec![]; for sequencer_id in sequencer_ids { - let state = self.state.clone(); - let positions = Arc::clone(&self.positions); + let shared_state = self.shared_state.clone(); + let playback_states = Arc::clone(&self.playback_states); let stream = stream::poll_fn(move |_ctx| { - let entries = state.entries.lock(); - let mut positions = positions.lock(); + let entries = shared_state.entries.lock(); + let mut playback_states = playback_states.lock(); let entry_vec = entries.get(&sequencer_id).unwrap(); - let position = positions.get_mut(&sequencer_id).unwrap(); + let playback_state = playback_states.get_mut(&sequencer_id).unwrap(); - if entry_vec.len() > *position { - let entry = match &entry_vec[*position] { - Ok(entry) => Ok(entry.clone()), - Err(e) => Err(e.to_string().into()), - }; - *position += 1; - return Poll::Ready(Some(entry)); + while entry_vec.len() > playback_state.vector_index { + let entry_result = &entry_vec[playback_state.vector_index]; + + // consume entry + playback_state.vector_index += 1; + + match entry_result { + Ok(entry) => { + // found an entry => need to check if it is within the offset + let sequence = entry.sequence().unwrap(); + if sequence.number >= playback_state.offset { + // within offset => return entry to caller + return Poll::Ready(Some(Ok(entry.clone()))); + } else { + // offset is larger then the current entry => ignore entry and try next + continue; + } + } + Err(e) => { + // found an error => return entry to caller + return Poll::Ready(Some(Err(e.to_string().into()))); + } + } } + // we are at the end of the recorded entries => report pending Poll::Pending }) .boxed(); @@ -218,6 +249,19 @@ impl WriteBufferReading for MockBufferForReading { streams } + + async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let mut playback_states = self.playback_states.lock(); + + if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { + playback_state.offset = sequence_number; + + // reset position to start since seeking might go backwards + playback_state.vector_index = 0; + } + + Ok(()) + } } #[cfg(test)] From 8e5d5928cf700201fe9b0f17b506ebc6dc1b47ac Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 09:46:52 +0100 Subject: [PATCH 05/27] feat: compute WriteSummary from PersistenceWindows (#2030) (#2054) * feat: compute WriteSummary from PersistenceWindows (#2030) * chore: review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- data_types/src/lib.rs | 1 + data_types/src/write_summary.rs | 20 +++ .../src/persistence_windows.rs | 148 +++++++++++++++++- 3 files changed, 164 insertions(+), 5 deletions(-) create mode 100644 data_types/src/write_summary.rs diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index bea9629bc3..76d7ca0306 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -22,3 +22,4 @@ pub mod names; pub mod partition_metadata; pub mod server_id; pub mod timestamp; +pub mod write_summary; diff --git a/data_types/src/write_summary.rs b/data_types/src/write_summary.rs new file mode 100644 index 0000000000..9574910262 --- /dev/null +++ b/data_types/src/write_summary.rs @@ -0,0 +1,20 @@ +use chrono::{DateTime, Utc}; + +/// A description of a set of writes +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct WriteSummary { + /// The wall clock timestamp of the last write in this summary + pub time_of_first_write: DateTime, + + /// The wall clock timestamp of the last write in this summary + pub time_of_last_write: DateTime, + + /// The minimum row timestamp for data in this summary + pub min_timestamp: DateTime, + + /// The maximum row timestamp value for data in this summary + pub max_timestamp: DateTime, + + /// The number of rows in this summary + pub row_count: usize, +} diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 8dd287f5ba..1b5ae73d29 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -7,7 +7,7 @@ use std::{ use chrono::{DateTime, TimeZone, Utc}; -use data_types::partition_metadata::PartitionAddr; +use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary}; use entry::Sequence; use internal_types::guard::{ReadGuard, ReadLock}; @@ -45,6 +45,16 @@ pub struct PersistenceWindows { late_arrival_period: Duration, closed_window_period: Duration, + /// The datetime this PersistenceWindows was created + /// + /// `PersistenceWindows` internally uses monotonic `Instant`, however, + /// these cannot be rendered. 
To provide a stable rendering of Wall timestamp, + /// a single timestamp is recorded at creation time + created_at_time: DateTime, + + /// The instant this PersistenceWindows was created + created_at_instant: Instant, + /// The last instant passed to PersistenceWindows::add_range last_instant: Instant, @@ -106,6 +116,9 @@ impl PersistenceWindows { let closed_window_count = late_arrival_seconds / closed_window_seconds; + let created_at_time = Utc::now(); + let created_at_instant = Instant::now(); + Self { persistable: ReadLock::new(None), closed: VecDeque::with_capacity(closed_window_count as usize), @@ -113,7 +126,9 @@ impl PersistenceWindows { addr, late_arrival_period, closed_window_period, - last_instant: Instant::now(), + created_at_time, + created_at_instant, + last_instant: created_at_instant, max_sequence_numbers: Default::default(), } } @@ -165,7 +180,7 @@ impl PersistenceWindows { self.rotate(received_at); match self.open.as_mut() { - Some(w) => w.add_range(sequence, row_count, min_time, max_time), + Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at), None => { self.open = Some(Window::new( received_at, @@ -335,6 +350,34 @@ impl PersistenceWindows { self.windows().next() } + /// Returns approximate summaries of the unpersisted writes contained + /// recorded by this PersistenceWindow instance + /// + /// These are approximate because persistence may partially flush a window, which will + /// update the min row timestamp but not the row count + pub fn summaries(&self) -> impl Iterator + '_ { + self.windows().map(move |window| { + let window_age = chrono::Duration::from_std( + window.created_at.duration_since(self.created_at_instant), + ) + .expect("duration overflow"); + + let time_of_first_write = self.created_at_time + window_age; + + let window_duration = + chrono::Duration::from_std(window.last_instant.duration_since(window.created_at)) + .expect("duration overflow"); + + WriteSummary { + time_of_first_write, + time_of_last_write: time_of_first_write + window_duration, + min_timestamp: window.min_time, + max_timestamp: window.max_time, + row_count: window.row_count, + } + }) + } + /// Returns true if this PersistenceWindows instance is empty pub fn is_empty(&self) -> bool { self.minimum_window().is_none() @@ -374,9 +417,14 @@ struct Window { /// The server time when this window was created. Used to determine how long data in this /// window has been sitting in memory. 
created_at: Instant, + /// The server time of the last write to this window + last_instant: Instant, + /// The number of rows in the window row_count: usize, - min_time: DateTime, // min time value for data in the window - max_time: DateTime, // max time value for data in the window + /// min time value for data in the window + min_time: DateTime, + /// max time value for data in the window + max_time: DateTime, /// maps sequencer_id to the minimum and maximum sequence numbers seen sequencer_numbers: BTreeMap, } @@ -399,6 +447,7 @@ impl Window { Self { created_at, + last_instant: created_at, row_count, min_time, max_time, @@ -414,7 +463,11 @@ impl Window { row_count: usize, min_time: DateTime, max_time: DateTime, + instant: Instant, ) { + assert!(self.created_at <= instant); + self.last_instant = instant; + self.row_count += row_count; if self.min_time > min_time { self.min_time = min_time; @@ -1265,4 +1318,89 @@ mod tests { assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2)); assert_eq!(w.closed[1].row_count, 11); } + + #[test] + fn test_summaries() { + let mut w = make_windows(Duration::from_secs(100)); + let instant = w.created_at_instant; + + // Window 1 + w.add_range( + Some(&Sequence { id: 1, number: 1 }), + 11, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(11), + instant + Duration::from_millis(1), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 2 }), + 4, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(340), + instant + Duration::from_millis(30), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 3 }), + 6, + Utc.timestamp_nanos(1), + Utc.timestamp_nanos(5), + instant + Duration::from_millis(50), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2 + w.add_range( + Some(&Sequence { id: 1, number: 4 }), + 3, + Utc.timestamp_nanos(89), + Utc.timestamp_nanos(90), + instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3 + w.add_range( + Some(&Sequence { id: 1, number: 5 }), + 8, + Utc.timestamp_nanos(3), + Utc.timestamp_nanos(4), + instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3, + ); + + let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap(); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 3); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + chrono::Duration::milliseconds(50), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 21 + }, + WriteSummary { + time_of_first_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(89), + max_timestamp: Utc.timestamp_nanos(90), + row_count: 3 + }, + WriteSummary { + time_of_first_write: w.created_at_time + closed_duration * 3, + time_of_last_write: w.created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ) + } } From 2c20528c6926f16b5c78f0fcac652a2416326631 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Jul 2021 04:53:46 -0400 Subject: [PATCH 06/27] chore: use upstream versions of some workarounds (#2057) * chore: use upstream versions of some workarounds * docs: update docstring 
Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- internal_types/src/schema.rs | 2 +- internal_types/src/schema/sort.rs | 19 +------------------ query/src/frontend/reorg.rs | 3 ++- query/src/provider/deduplicate/algo.rs | 11 +---------- 4 files changed, 5 insertions(+), 30 deletions(-) diff --git a/internal_types/src/schema.rs b/internal_types/src/schema.rs index 4427540576..9ac26a8731 100644 --- a/internal_types/src/schema.rs +++ b/internal_types/src/schema.rs @@ -786,12 +786,12 @@ macro_rules! assert_column_eq { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use InfluxColumnType::*; use InfluxFieldType::*; use super::{builder::SchemaBuilder, *}; use crate::schema::merge::SchemaMerger; - use crate::schema::sort::SortOptions; fn make_field( name: &str, diff --git a/internal_types/src/schema/sort.rs b/internal_types/src/schema/sort.rs index a56fd0a495..0612b11dbc 100644 --- a/internal_types/src/schema/sort.rs +++ b/internal_types/src/schema/sort.rs @@ -1,5 +1,6 @@ use std::{fmt::Display, str::FromStr}; +use arrow::compute::SortOptions; use indexmap::{map::Iter, IndexMap}; use itertools::Itertools; use snafu::Snafu; @@ -23,24 +24,6 @@ pub enum Error { pub type Result = std::result::Result; -/// Temporary - -#[derive(Debug, Clone, Copy, Eq, PartialEq)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - nulls_first: true, - } - } -} - #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct ColumnSort { /// Position of this column in the sort key diff --git a/query/src/frontend/reorg.rs b/query/src/frontend/reorg.rs index a1c2df8599..43e1c824e7 100644 --- a/query/src/frontend/reorg.rs +++ b/query/src/frontend/reorg.rs @@ -268,8 +268,9 @@ struct ScanPlan { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use arrow_util::assert_batches_eq; - use internal_types::schema::{merge::SchemaMerger, sort::SortOptions}; + use internal_types::schema::merge::SchemaMerger; use crate::{ exec::{Executor, ExecutorType}, diff --git a/query/src/provider/deduplicate/algo.rs b/query/src/provider/deduplicate/algo.rs index 6cc7ecb77e..df14aebfa9 100644 --- a/query/src/provider/deduplicate/algo.rs +++ b/query/src/provider/deduplicate/algo.rs @@ -339,21 +339,12 @@ impl RecordBatchDeduplicator { } /// Create a new record batch from offset --> len - /// - /// for adding this upstream fn slice_record_batch( batch: &RecordBatch, offset: usize, len: usize, ) -> ArrowResult { - let schema = batch.schema(); - let new_columns: Vec<_> = batch - .columns() - .iter() - .map(|old_column| old_column.slice(offset, len)) - .collect(); - - let batch = RecordBatch::try_new(schema, new_columns)?; + let batch = batch.slice(offset, len); // At time of writing, `concat_batches` concatenates the // contents of dictionaries as well; Do a post pass to remove the From 767c2a6fe15eff33c134f947a1cd8af20ec8d4b6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 11:11:18 +0100 Subject: [PATCH 07/27] refactor: explicit server startup state machine (#2040) * refactor: explicit server startup state machine * chore: update `ServerStage` docs * chore: further docs * chore: more logging * chore: format --- server/src/config.rs | 321 +++-------- server/src/db.rs | 4 +- server/src/init.rs | 683 ++++++++--------------- server/src/lib.rs | 506 
+++++++++++------ src/influxdb_ioxd/rpc/error.rs | 2 +- src/influxdb_ioxd/rpc/management.rs | 46 +- tests/end_to_end_cases/management_api.rs | 2 + tests/end_to_end_cases/management_cli.rs | 12 + 8 files changed, 701 insertions(+), 875 deletions(-) diff --git a/server/src/config.rs b/server/src/config.rs index 3088661906..4554e8e912 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -8,7 +8,7 @@ use data_types::{ DatabaseName, }; use metrics::MetricRegistry; -use object_store::{path::ObjectStorePath, ObjectStore}; +use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi}; use parquet_file::catalog::PreservedCatalog; use query::exec::Executor; use write_buffer::config::WriteBufferConfig; @@ -20,6 +20,7 @@ use crate::{ InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch, ServerShuttingDown, }; +use object_store::path::Path; use observability_deps::tracing::{self, error, info, warn, Instrument}; use snafu::{ensure, OptionExt}; use tokio::task::JoinHandle; @@ -37,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb"; /// run to completion if the tokio runtime is dropped #[derive(Debug)] pub(crate) struct Config { - shutdown: CancellationToken, jobs: Arc, - state: RwLock, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, + + shutdown: CancellationToken, + state: RwLock, } pub(crate) enum UpdateError { @@ -58,14 +63,20 @@ impl Config { /// Create new empty config. pub(crate) fn new( jobs: Arc, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, remote_template: Option, ) -> Self { Self { + jobs, + object_store, + exec, + server_id, + metric_registry, shutdown: Default::default(), state: RwLock::new(ConfigState::new(remote_template)), - jobs, - metric_registry, } } @@ -80,13 +91,7 @@ impl Config { /// This only works if the database is not yet known. To recover a database out of an uninitialized state, see /// [`recover_db`](Self::recover_db). To do maintainance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). - pub(crate) fn create_db( - &self, - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result> { + pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); ensure!( !state.reservations.contains(&db_name), @@ -99,12 +104,7 @@ impl Config { state.reservations.insert(db_name.clone()); Ok(DatabaseHandle { - state: Some(Arc::new(DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - })), + state: Some(Arc::new(DatabaseState::Known { db_name })), config: &self, }) } @@ -116,7 +116,7 @@ impl Config { /// While the handle is held, no other operations for the given database can be executed. /// /// This only works if the database is known but is uninitialized. To create a new database that is not yet known, - /// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog) + /// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). 
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); @@ -303,6 +303,24 @@ impl Config { pub fn metrics_registry(&self) -> Arc { Arc::clone(&self.metric_registry) } + + /// Returns the object store of this server + pub fn object_store(&self) -> Arc { + Arc::clone(&self.object_store) + } + + /// Returns the server id of this server + pub fn server_id(&self) -> ServerId { + self.server_id + } + + /// Base location in object store for this server. + pub fn root_path(&self) -> Path { + let id = self.server_id.get(); + let mut path = self.object_store.new_path(); + path.push_dir(format!("{}", id)); + path + } } /// Get object store path for the database config under the given root (= path under with the server with the current ID @@ -365,41 +383,14 @@ impl RemoteTemplate { } /// Internal representation of the different database states. -/// -/// # Shared Data During Transitions -/// The following elements can safely be shared between states because they won't be poisoned by any half-done -/// transition (e.g. starting a transition and then failing due to an IO error): -/// - `object_store` -/// - `exec` -/// -/// The following elements can trivially be copied from one state to the next: -/// - `server_id` -/// - `db_name` -/// -/// The following elements MUST be copied from one state to the next because partial modifications are not allowed: -/// - `rules` -/// -/// Exceptions to the above rules are the following states: -/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm -/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else #[derive(Debug)] #[allow(clippy::large_enum_variant)] enum DatabaseState { /// Database is known but nothing is loaded. - Known { - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - }, + Known { db_name: DatabaseName<'static> }, /// Rules are loaded - RulesLoaded { - object_store: Arc, - exec: Arc, - server_id: ServerId, - rules: Arc, - }, + RulesLoaded { rules: Arc }, /// Catalog is loaded but data from sequencers / write buffers is not yet replayed. Replay { db: Arc }, @@ -457,24 +448,6 @@ impl DatabaseState { } } - fn object_store(&self) -> Arc { - match self { - DatabaseState::Known { object_store, .. } => Arc::clone(object_store), - DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store), - DatabaseState::Replay { db, .. } => Arc::clone(&db.store), - DatabaseState::Initialized { db, .. } => Arc::clone(&db.store), - } - } - - fn server_id(&self) -> ServerId { - match self { - DatabaseState::Known { server_id, .. } => *server_id, - DatabaseState::RulesLoaded { server_id, .. } => *server_id, - DatabaseState::Replay { db, .. } => db.server_id, - DatabaseState::Initialized { db, .. } => db.server_id, - } - } - fn rules(&self) -> Option> { match self { DatabaseState::Known { .. } => None, @@ -540,12 +513,12 @@ impl<'a> DatabaseHandle<'a> { /// Get object store. pub fn object_store(&self) -> Arc { - self.state().object_store() + Arc::clone(&self.config.object_store) } /// Get server ID. pub fn server_id(&self) -> ServerId { - self.state().server_id() + self.config.server_id } /// Get metrics registry. @@ -584,12 +557,7 @@ impl<'a> DatabaseHandle<'a> { /// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded). 
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> { match self.state().as_ref() { - DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - } => { + DatabaseState::Known { db_name } => { ensure!( db_name == &rules.name, RulesDatabaseNameMismatch { @@ -599,9 +567,6 @@ impl<'a> DatabaseHandle<'a> { ); self.state = Some(Arc::new(DatabaseState::RulesLoaded { - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), - server_id: *server_id, rules: Arc::new(rules), })); @@ -623,16 +588,11 @@ impl<'a> DatabaseHandle<'a> { write_buffer: Option, ) -> Result<()> { match self.state().as_ref() { - DatabaseState::RulesLoaded { - object_store, - exec, - server_id, - rules, - } => { + DatabaseState::RulesLoaded { rules } => { let database_to_commit = DatabaseToCommit { - server_id: *server_id, - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), + server_id: self.config.server_id, + object_store: Arc::clone(&self.config.object_store), + exec: Arc::clone(&self.config.exec), preserved_catalog, catalog, rules: Arc::clone(&rules), @@ -726,40 +686,32 @@ mod test { use super::*; use std::num::NonZeroU32; + fn make_config(remote_template: Option) -> Config { + let store = Arc::new(ObjectStore::new_in_memory()); + let server_id = ServerId::try_from(1).unwrap(); + let metric_registry = Arc::new(metrics::MetricRegistry::new()); + Config::new( + Arc::new(JobRegistry::new()), + Arc::clone(&store), + Arc::new(Executor::new(1)), + server_id, + Arc::clone(&metric_registry), + remote_template, + ) + } + #[tokio::test] async fn create_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // getting handle while DB is reserved => fails { - let _db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let _db_reservation = config.create_db(name.clone()).unwrap(); - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. 
})); let err = config.block_db(name.clone()).unwrap_err(); @@ -771,14 +723,7 @@ mod test { // name in rules must match reserved name { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); let err = db_reservation .advance_rules_loaded(rules.clone()) @@ -791,14 +736,7 @@ mod test { // handle.abort just works (aka does not mess up the transaction afterwards) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); db_reservation.abort(); } @@ -808,21 +746,14 @@ mod test { // create DB successfull { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -862,14 +793,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create DB as second time => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block fully initiliazed DB => fail @@ -884,40 +808,18 @@ mod test { async fn recover_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // create DB but don't continue with rules loaded (e.g. because the rules file is broken) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.commit(); } assert!(config.has_uninitialized_database(&name)); // create DB while it is uninitialized => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. 
})); // recover an unknown DB => fail @@ -931,19 +833,19 @@ mod test { let mut db_reservation = config.recover_db(name.clone()).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_none()); db_reservation.advance_rules_loaded(rules).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -954,13 +856,13 @@ mod test { .unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.advance_init().unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.commit(); @@ -974,14 +876,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create recovered DB => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block recovered DB => fail @@ -996,28 +891,13 @@ mod test { async fn block_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); // block DB let handle = config.block_db(name.clone()).unwrap(); // create while blocked => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. 
})); // recover while blocked => fail @@ -1030,14 +910,7 @@ mod test { // unblock => DB can be created drop(handle); - config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + config.create_db(name.clone()).unwrap(); // cleanup config.drain().await @@ -1047,20 +920,12 @@ mod test { async fn test_db_drop() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -1068,14 +933,7 @@ mod test { .unwrap(); // create DB - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); db_reservation .advance_replay(preserved_catalog, catalog, None) @@ -1122,12 +980,7 @@ mod test { #[test] fn resolve_remote() { - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - Some(RemoteTemplate::new("http://iox-query-{id}:8082")), - ); + let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082"))); let server_id = ServerId::new(NonZeroU32::new(42).unwrap()); let remote = config.resolve_remote(server_id); diff --git a/server/src/db.rs b/server/src/db.rs index c39655f107..4ec0d92f6b 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -203,10 +203,10 @@ pub type Result = std::result::Result; pub struct Db { rules: RwLock>, - pub server_id: ServerId, // this is also the Query Server ID + server_id: ServerId, // this is also the Query Server ID /// Interface to use for persistence - pub store: Arc, + store: Arc, /// Executor for running queries exec: Arc, diff --git a/server/src/init.rs b/server/src/init.rs index c06cbc61b6..2351821f9c 100644 --- a/server/src/init.rs +++ b/server/src/init.rs @@ -2,29 +2,19 @@ use data_types::{ database_rules::{DatabaseRules, WriteBufferConnection}, database_state::DatabaseStateCode, - server_id::ServerId, + error::ErrorLogger, DatabaseName, }; use futures::TryStreamExt; use generated_types::database_rules::decode_database_rules; -use internal_types::once::OnceNonZeroU32; use object_store::{ path::{parsed::DirsAndFileName, ObjectStorePath, Path}, ObjectStore, ObjectStoreApi, }; -use observability_deps::tracing::{debug, error, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{error, info, warn}; use parquet_file::catalog::PreservedCatalog; -use query::exec::Executor; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::{ - collections::HashMap, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; -use tokio::sync::Semaphore; +use snafu::{ResultExt, Snafu}; +use std::sync::Arc; use write_buffer::config::WriteBufferConfig; use crate::{ @@ -45,9 +35,6 @@ pub enum Error { source: generated_types::database_rules::DecodeError, }, - #[snafu(display("id already set"))] - IdAlreadySet { id: ServerId }, - 
#[snafu(display("unable to use server until id is set"))] IdNotSet, @@ -97,472 +84,254 @@ pub enum Error { pub type Result = std::result::Result; -#[derive(Debug, Default)] -pub struct CurrentServerId(OnceNonZeroU32); +/// Loads the database configurations based on the databases in the +/// object store. Any databases in the config already won't be +/// replaced. +/// +/// Returns a Vec containing the results of loading the contained databases +pub(crate) async fn initialize_server( + config: Arc, + wipe_on_error: bool, +) -> Result, Result<()>)>> { + let root = config.root_path(); -impl CurrentServerId { - pub fn set(&self, id: ServerId) -> Result<()> { - let id = id.get(); + // get the database names from the object store prefixes + // TODO: update object store to pull back all common prefixes by + // following the next tokens. + let list_result = config + .object_store() + .list_with_delimiter(&root) + .await + .context(StoreError)?; - match self.0.set(id) { - Ok(()) => { - info!(server_id = id, "server ID set"); - Ok(()) - } - Err(id) => Err(Error::IdAlreadySet { - id: ServerId::new(id), - }), - } - } + let handles: Vec<_> = list_result + .common_prefixes + .into_iter() + .filter_map(|mut path| { + let config = Arc::clone(&config); + let root = root.clone(); + path.set_file_name(DB_RULES_FILE_NAME); + let db_name = db_name_from_rules_path(&path) + .log_if_error("invalid database path") + .ok()?; - pub fn get(&self) -> Result { - self.0.get().map(ServerId::new).context(IdNotSet) - } -} - -#[derive(Debug)] -pub struct InitStatus { - pub server_id: CurrentServerId, - - /// Flags that databases are loaded and server is ready to read/write data. - initialized: AtomicBool, - - /// Semaphore that limits the number of jobs that load DBs when the serverID is set. - /// - /// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex, - /// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We - /// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical - /// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore - /// cannot be configured. - initialize_semaphore: Semaphore, - - /// Error occurred during generic server init (e.g. listing store content). - error_generic: Mutex>>, - - /// Errors that occurred during some DB init. - errors_databases: Arc>>>, - - /// Automatic wipe-on-error recovery - /// - /// See - pub(crate) wipe_on_error: AtomicBool, -} - -impl InitStatus { - /// Create new "not initialized" status. - pub fn new() -> Self { - Self { - server_id: Default::default(), - initialized: AtomicBool::new(false), - // Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`. - initialize_semaphore: Semaphore::new(1), - error_generic: Default::default(), - errors_databases: Default::default(), - wipe_on_error: AtomicBool::new(true), - } - } - - /// Base location in object store for this writer. - pub fn root_path(&self, store: &ObjectStore) -> Result { - let id = self.server_id.get()?; - - let mut path = store.new_path(); - path.push_dir(format!("{}", id)); - Ok(path) - } - - /// Check if server is loaded. Databases are loaded and server is ready to read/write. 
- pub fn initialized(&self) -> bool { - // Need `Acquire` ordering because IF we a `true` here, this thread will likely also read data that - // `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` flag here ensures that - // every data acccess AFTER the following line will also stay AFTER this line. - self.initialized.load(Ordering::Acquire) - } - - /// Error occurred during generic server init (e.g. listing store content). - pub fn error_generic(&self) -> Option> { - let guard = self.error_generic.lock(); - guard.clone() - } - - /// List all databases with errors in sorted order. - pub fn databases_with_errors(&self) -> Vec { - let guard = self.errors_databases.lock(); - let mut names: Vec<_> = guard.keys().cloned().collect(); - names.sort(); - names - } - - /// Error that occurred during initialization of a specific database. - pub fn error_database(&self, db_name: &str) -> Option> { - let guard = self.errors_databases.lock(); - guard.get(db_name).cloned() - } - - /// Loads the database configurations based on the databases in the - /// object store. Any databases in the config already won't be - /// replaced. - /// - /// This requires the serverID to be set (will be a no-op otherwise). - /// - /// It will be a no-op if the configs are already loaded and the server is ready. - pub(crate) async fn maybe_initialize_server( - &self, - store: Arc, - config: Arc, - exec: Arc, - ) { - let server_id = match self.server_id.get() { - Ok(id) => id, - Err(e) => { - debug!(%e, "cannot initialize server because cannot get serverID"); - return; - } - }; - - let _guard = self - .initialize_semaphore - .acquire() - .await - .expect("semaphore should not be closed"); - - // Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread - // that enters this semaphore after we've left actually sees the correct `is_ready` flag. - if self.initialized.load(Ordering::Acquire) { - // already loaded, so do nothing - return; - } - - // Check if there was a previous failed attempt - if self.error_generic().is_some() { - return; - } - - match self - .maybe_initialize_server_inner(store, config, exec, server_id) - .await - { - Ok(_) => { - // mark as ready (use correct ordering for Acquire-Release) - self.initialized.store(true, Ordering::Release); - info!("loaded databases, server is initalized"); - } - Err(e) => { - error!(%e, "error during server init"); - let mut guard = self.error_generic.lock(); - *guard = Some(Arc::new(e)); - } - } - } - - async fn maybe_initialize_server_inner( - &self, - store: Arc, - config: Arc, - exec: Arc, - server_id: ServerId, - ) -> Result<()> { - let root = self.root_path(&store)?; - - // get the database names from the object store prefixes - // TODO: update object store to pull back all common prefixes by - // following the next tokens. 
- let list_result = store.list_with_delimiter(&root).await.context(StoreError)?; - - let handles: Vec<_> = list_result - .common_prefixes - .into_iter() - .filter_map(|mut path| { - let store = Arc::clone(&store); - let config = Arc::clone(&config); - let exec = Arc::clone(&exec); - let errors_databases = Arc::clone(&self.errors_databases); - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - let root = root.clone(); - - path.set_file_name(DB_RULES_FILE_NAME); - - match db_name_from_rules_path(&path) { - Ok(db_name) => { - let handle = tokio::task::spawn(async move { - match Self::initialize_database( - server_id, - store, - config, - exec, - root, - db_name.clone(), - wipe_on_error, - ) - .await - { - Ok(()) => { - info!(%db_name, "database initialized"); - } - Err(e) => { - error!(%e, %db_name, "cannot load database"); - let mut guard = errors_databases.lock(); - guard.insert(db_name.to_string(), Arc::new(e)); - } - } - }); - Some(handle) - } - Err(e) => { - error!(%e, "invalid database path"); - None - } - } + Some(async move { + let result = + initialize_database(config, root, db_name.clone(), wipe_on_error).await; + (db_name, result) }) - .collect(); + }) + .collect(); - futures::future::join_all(handles).await; + Ok(futures::future::join_all(handles).await) +} +async fn initialize_database( + config: Arc, + root: Path, + db_name: DatabaseName<'static>, + wipe_on_error: bool, +) -> Result<()> { + // Reserve name before expensive IO (e.g. loading the preserved catalog) + let mut handle = config + .create_db(db_name) + .map_err(Box::new) + .context(InitDbError)?; + + match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await + { + Ok(true) => { + // finished init and keep DB + handle.commit(); + Ok(()) + } + Ok(false) => { + // finished but do not keep DB + handle.abort(); + Ok(()) + } + Err(e) => { + // encountered some error, still commit intermediate result + handle.commit(); + Err(e) + } + } +} + +async fn load_database_rules(store: Arc, path: Path) -> Result> { + let serialized_rules = loop { + match get_database_config_bytes(&path, &store).await { + Ok(data) => break data, + Err(e) => { + if let Error::NoDatabaseConfigError { location } = &e { + warn!(?location, "{}", e); + return Ok(None); + } + error!( + "error getting database config {:?} from object store: {}", + path, e + ); + tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) + .await; + } + } + }; + let rules = decode_database_rules(serialized_rules.freeze()) + .context(ErrorDeserializingRulesProtobuf)?; + + Ok(Some(rules)) +} + +pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( + config: Arc, + db_name: &DatabaseName<'static>, +) -> Result<()> { + let store = config.object_store(); + + if config.has_uninitialized_database(db_name) { + let mut handle = config + .recover_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + if !((handle.state_code() == DatabaseStateCode::Known) + || (handle.state_code() == DatabaseStateCode::RulesLoaded)) + { + // cannot wipe because init state is already too far + return Err(Error::DbPartiallyInitialized { + db_name: db_name.to_string(), + }); + } + + // wipe while holding handle so no other init/wipe process can interact with the catalog + PreservedCatalog::wipe(&store, handle.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + let root = config.root_path(); + + let result = + 
try_advance_database_init_process_until_complete(&mut handle, &root, true).await; + + // Commit changes even if failed + handle.commit(); + result.map(|_| ()) + } else { + let handle = config + .block_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + PreservedCatalog::wipe(&store, config.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + drop(handle); + + info!(%db_name, "wiped preserved catalog of non-registered database"); Ok(()) } +} - async fn initialize_database( - server_id: ServerId, - store: Arc, - config: Arc, - exec: Arc, - root: Path, - db_name: DatabaseName<'static>, - wipe_on_error: bool, - ) -> Result<()> { - // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut handle = config - .create_db(store, exec, server_id, db_name) - .map_err(Box::new) - .context(InitDbError)?; - - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, - wipe_on_error, - ) - .await - { - Ok(true) => { - // finished init and keep DB - handle.commit(); - Ok(()) +/// Try to make as much progress as possible with DB init. +/// +/// Returns an error if there was an error along the way (in which case the handle should still be commit to safe +/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten +/// (e.g. because not rules file is present.) +async fn try_advance_database_init_process_until_complete( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + loop { + match try_advance_database_init_process(handle, root, wipe_on_error).await? { + InitProgress::Unfinished => {} + InitProgress::Done => { + return Ok(true); } - Ok(false) => { - // finished but do not keep DB - handle.abort(); - Ok(()) - } - Err(e) => { - // encountered some error, still commit intermediate result - handle.commit(); - Err(e) + InitProgress::Forget => { + return Ok(false); } } } +} - async fn load_database_rules( - store: Arc, - path: Path, - ) -> Result> { - let serialized_rules = loop { - match get_database_config_bytes(&path, &store).await { - Ok(data) => break data, - Err(e) => { - if let Error::NoDatabaseConfigError { location } = &e { - warn!(?location, "{}", e); - return Ok(None); - } - error!( - "error getting database config {:?} from object store: {}", - path, e - ); - tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) - .await; +/// Try to make some progress in the DB init. +async fn try_advance_database_init_process( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + match handle.state_code() { + DatabaseStateCode::Known => { + // known => load DB rules + let path = object_store_path_for_database_config(root, &handle.db_name()); + match load_database_rules(handle.object_store(), path).await? 
{ + Some(rules) => { + handle + .advance_rules_loaded(rules) + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) + } + None => { + // no rules file present, advice to forget his DB + Ok(InitProgress::Forget) } } - }; - let rules = decode_database_rules(serialized_rules.freeze()) - .context(ErrorDeserializingRulesProtobuf)?; - - Ok(Some(rules)) - } - - pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( - &self, - store: Arc, - config: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result<()> { - if config.has_uninitialized_database(&db_name) { - let mut handle = config - .recover_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - if !((handle.state_code() == DatabaseStateCode::Known) - || (handle.state_code() == DatabaseStateCode::RulesLoaded)) - { - // cannot wipe because init state is already too far - return Err(Error::DbPartiallyInitialized { - db_name: db_name.to_string(), - }); - } - - // wipe while holding handle so no other init/wipe process can interact with the catalog - PreservedCatalog::wipe(&store, handle.server_id(), &db_name) - .await - .map_err(Box::new) - .context(PreservedCatalogWipeError)?; - - let root = self.root_path(&store)?; - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, + } + DatabaseStateCode::RulesLoaded => { + // rules already loaded => continue with loading preserved catalog + let (preserved_catalog, catalog) = load_or_create_preserved_catalog( + &handle.db_name(), + handle.object_store(), + handle.server_id(), + handle.metrics_registry(), wipe_on_error, ) .await - { - Ok(_) => { - // yeah, recovered DB - handle.commit(); + .map_err(|e| Box::new(e) as _) + .context(CatalogLoadError)?; - let mut guard = self.errors_databases.lock(); - guard.remove(&db_name.to_string()); - - info!(%db_name, "wiped preserved catalog of registered database and recovered"); - Ok(()) - } - Err(e) => { - // could not recover, but still keep new result - handle.commit(); - - let mut guard = self.errors_databases.lock(); - let e = Arc::new(e); - guard.insert(db_name.to_string(), Arc::clone(&e)); - - warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); - Err(Error::RecoverDbError { source: e }) - } - } - } else { - let handle = config - .block_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - PreservedCatalog::wipe(&store, server_id, &db_name) + let rules = handle + .rules() + .expect("in this state rules should be loaded"); + let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) .await + .context(CreateWriteBuffer { + config: rules.write_buffer_connection.clone(), + })?; + info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); + + handle + .advance_replay(preserved_catalog, catalog, write_buffer) .map_err(Box::new) - .context(PreservedCatalogWipeError)?; + .context(InitDbError)?; - drop(handle); - - info!(%db_name, "wiped preserved catalog of non-registered database"); - Ok(()) + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } + DatabaseStateCode::Replay => { + let db = handle + .db_any_state() + .expect("DB should be available in this state"); + db.perform_replay().await; - /// Try to make as much progress as possible with DB init. 
- /// - /// Returns an error if there was an error along the way (in which case the handle should still be commit to safe - /// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten - /// (e.g. because not rules file is present.) - async fn try_advance_database_init_process_until_complete( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - loop { - match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? { - InitProgress::Unfinished => {} - InitProgress::Done => { - return Ok(true); - } - InitProgress::Forget => { - return Ok(false); - } - } + handle + .advance_init() + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } - - /// Try to make some progress in the DB init. - async fn try_advance_database_init_process( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - match handle.state_code() { - DatabaseStateCode::Known => { - // known => load DB rules - let path = object_store_path_for_database_config(root, &handle.db_name()); - match Self::load_database_rules(handle.object_store(), path).await? { - Some(rules) => { - handle - .advance_rules_loaded(rules) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - None => { - // no rules file present, advice to forget his DB - Ok(InitProgress::Forget) - } - } - } - DatabaseStateCode::RulesLoaded => { - // rules already loaded => continue with loading preserved catalog - let (preserved_catalog, catalog) = load_or_create_preserved_catalog( - &handle.db_name(), - handle.object_store(), - handle.server_id(), - handle.metrics_registry(), - wipe_on_error, - ) - .await - .map_err(|e| Box::new(e) as _) - .context(CatalogLoadError)?; - - let rules = handle - .rules() - .expect("in this state rules should be loaded"); - let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) - .await - .context(CreateWriteBuffer { - config: rules.write_buffer_connection.clone(), - })?; - info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); - - handle - .advance_replay(preserved_catalog, catalog, write_buffer) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Replay => { - let db = handle - .db_any_state() - .expect("DB should be available in this state"); - db.perform_replay().await; - - handle - .advance_init() - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Initialized => { - // database fully initialized => nothing to do - Ok(InitProgress::Done) - } + DatabaseStateCode::Initialized => { + // database fully initialized => nothing to do + Ok(InitProgress::Done) } } } diff --git a/server/src/lib.rs b/server/src/lib.rs index 2c4d666888..48246918d1 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -74,9 +74,8 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::BytesMut; use db::load::create_preserved_catalog; -use init::InitStatus; -use observability_deps::tracing::{debug, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{debug, error, info, warn}; +use parking_lot::{Mutex, RwLockUpgradableReadGuard}; use snafu::{OptionExt, ResultExt, Snafu}; use 
data_types::{ @@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb; use influxdb_line_protocol::ParsedLine; use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry}; use object_store::{ObjectStore, ObjectStoreApi}; +use parking_lot::RwLock; use query::{exec::Executor, DatabaseStore}; use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt}; use write_buffer::config::WriteBufferConfig; @@ -220,11 +220,11 @@ pub enum Error { #[snafu(display("cannot create preserved catalog: {}", source))] CannotCreatePreservedCatalog { source: DatabaseError }, - #[snafu(display("cannot set id: {}", source))] - SetIdError { source: crate::init::Error }, + #[snafu(display("id already set"))] + IdAlreadySet, - #[snafu(display("cannot get id: {}", source))] - GetIdError { source: crate::init::Error }, + #[snafu(display("id not set"))] + IdNotSet, #[snafu(display( "cannot create write buffer with config: {:?}, error: {}", @@ -297,6 +297,8 @@ pub struct ServerConfig { metric_registry: Arc, remote_template: Option, + + wipe_catalog_on_error: bool, } impl ServerConfig { @@ -311,6 +313,7 @@ impl ServerConfig { object_store, metric_registry, remote_template, + wipe_catalog_on_error: true, } } @@ -414,7 +417,6 @@ impl ServerMetrics { /// of these structs, which keeps track of all replication and query rules. #[derive(Debug)] pub struct Server { - config: Arc, connection_manager: Arc, pub store: Arc, exec: Arc, @@ -426,7 +428,50 @@ pub struct Server { /// and populates the endpoint with this data. pub registry: Arc, - init_status: Arc, + /// The state machine for server startup + stage: Arc>, +} + +/// The stage of the server in the startup process +/// +/// The progression is linear Startup -> InitReady -> Initializing -> Initialized +/// with the sole exception that on failure Initializing -> InitReady +/// +/// Errors encountered on server init will be retried, however, errors encountered +/// during database init will require operator intervention +/// +/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively +/// +/// They do not impact the state machine's progression, but instead are exposed to the +/// gRPC management API to allow an operator to assess the state of the system +#[derive(Debug)] +enum ServerStage { + /// Server has started but doesn't have a server id yet + Startup { + remote_template: Option, + wipe_catalog_on_error: bool, + }, + + /// Server can be initialized + InitReady { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has a server id, has started loading + Initializing { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has finish initializing, possibly with errors + Initialized { + config: Arc, + /// Errors that occurred during some DB init. + database_errors: HashMap>, + }, } #[derive(Debug)] @@ -454,22 +499,23 @@ where // to test the metrics provide a different registry to the `ServerConfig`. 
metric_registry, remote_template, + wipe_catalog_on_error, } = config; + let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get); + let exec = Arc::new(Executor::new(num_worker_threads)); Self { - config: Arc::new(Config::new( - Arc::clone(&jobs), - Arc::clone(&metric_registry), - remote_template, - )), store: object_store, connection_manager: Arc::new(connection_manager), - exec: Arc::new(Executor::new(num_worker_threads)), + exec, jobs, metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))), registry: Arc::clone(&metric_registry), - init_status: Arc::new(InitStatus::new()), + stage: Arc::new(RwLock::new(ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + })), } } @@ -478,68 +524,112 @@ where /// /// A valid server ID Must be non-zero. pub fn set_id(&self, id: ServerId) -> Result<()> { - self.init_status.server_id.set(id).context(SetIdError) - } + let mut stage = self.stage.write(); + match &mut *stage { + ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + } => { + let remote_template = remote_template.take(); - /// Returns the current server ID, or an error if not yet set. - pub fn require_id(&self) -> Result { - self.init_status.server_id.get().context(GetIdError) + *stage = ServerStage::InitReady { + wipe_catalog_on_error: *wipe_catalog_on_error, + config: Arc::new(Config::new( + Arc::clone(&self.jobs), + Arc::clone(&self.store), + Arc::clone(&self.exec), + id, + Arc::clone(&self.registry), + remote_template, + )), + last_error: None, + }; + Ok(()) + } + _ => Err(Error::IdAlreadySet), + } } /// Check if server is loaded. Databases are loaded and server is ready to read/write. pub fn initialized(&self) -> bool { - self.init_status.initialized() + matches!(&*self.stage.read(), ServerStage::Initialized { .. }) + } + + /// Require that server is loaded. Databases are loaded and server is ready to read/write. + fn require_initialized(&self) -> Result> { + match &*self.stage.read() { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => { + Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the config for this server if server id has been set + fn config(&self) -> Result> { + let stage = self.stage.read(); + match &*stage { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } + | ServerStage::Initializing { config, .. } + | ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the server id for this server if set + pub fn server_id(&self) -> Option { + self.config().map(|x| x.server_id()).ok() } /// Error occurred during generic server init (e.g. listing store content). pub fn error_generic(&self) -> Option> { - self.init_status.error_generic() + let stage = self.stage.read(); + match &*stage { + ServerStage::InitReady { last_error, .. } => last_error.clone(), + ServerStage::Initializing { last_error, .. } => last_error.clone(), + _ => None, + } } /// List all databases with errors in sorted order. pub fn databases_with_errors(&self) -> Vec { - self.init_status.databases_with_errors() + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.keys().cloned().collect(), + _ => Default::default(), + } } /// Error that occurred during initialization of a specific database. 
pub fn error_database(&self, db_name: &str) -> Option> { - self.init_status.error_database(db_name) + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.get(db_name).cloned(), + _ => None, + } } /// Current database init state. pub fn database_state(&self, name: &str) -> Option { - if let Ok(name) = DatabaseName::new(name) { - self.config.db_state(&name) - } else { - None - } - } - - /// Require that server is loaded. Databases are loaded and server is ready to read/write. - fn require_initialized(&self) -> Result { - // since a server ID is the pre-requirement for init, check this first - let server_id = self.require_id()?; - - // ordering here isn't that important since this method is not used to check-and-modify the flag - if self.initialized() { - Ok(server_id) - } else { - Err(Error::ServerNotInitialized { server_id }) - } + let db_name = DatabaseName::new(name).ok()?; + let config = self.config().ok()?; + config.db_state(&db_name) } /// Tells the server the set of rules for a database. pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> { // Return an error if this server is not yet ready - let server_id = self.require_initialized()?; + let config = self.require_initialized()?; // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut db_reservation = self.config.create_db( - Arc::clone(&self.store), - Arc::clone(&self.exec), - server_id, - rules.name.clone(), - )?; + let mut db_reservation = config.create_db(rules.name.clone())?; // register rules db_reservation.advance_rules_loaded(rules.clone())?; @@ -548,14 +638,14 @@ where let (preserved_catalog, catalog) = create_preserved_catalog( rules.db_name(), Arc::clone(&self.store), - server_id, - self.config.metrics_registry(), + config.server_id(), + config.metrics_registry(), ) .await .map_err(|e| Box::new(e) as _) .context(CannotCreatePreservedCatalog)?; - let write_buffer = WriteBufferConfig::new(server_id, &rules) + let write_buffer = WriteBufferConfig::new(config.server_id(), &rules) .await .map_err(|e| Error::CreatingWriteBuffer { config: rules.write_buffer_connection.clone(), @@ -575,13 +665,8 @@ where } pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> { - let location = object_store_path_for_database_config( - &self - .init_status - .root_path(&self.store) - .context(GetIdError)?, - &rules.name, - ); + let config = self.config()?; + let location = object_store_path_for_database_config(&config.root_path(), &rules.name); let mut data = BytesMut::new(); encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?; @@ -604,15 +689,62 @@ where /// object store. Any databases in the config already won't be /// replaced. /// - /// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready. + /// This requires the serverID to be set. + /// + /// It will be a no-op if the configs are already loaded and the server is ready. 
pub async fn maybe_initialize_server(&self) { - self.init_status - .maybe_initialize_server( - Arc::clone(&self.store), - Arc::clone(&self.config), - Arc::clone(&self.exec), - ) - .await; + // Explicit scope to help async generator + let (wipe_catalog_on_error, config) = { + let state = self.stage.upgradable_read(); + match &*state { + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error, + } => { + let config = Arc::clone(config); + let last_error = last_error.clone(); + let wipe_catalog_on_error = *wipe_catalog_on_error; + + // Mark the server as initializing and drop lock + + let mut state = RwLockUpgradableReadGuard::upgrade(state); + *state = ServerStage::Initializing { + config: Arc::clone(&config), + wipe_catalog_on_error, + last_error, + }; + (wipe_catalog_on_error, config) + } + _ => return, + } + }; + + let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await; + let new_stage = match init_result { + // Success -> move to next stage + Ok(results) => { + info!(server_id=%config.server_id(), "server initialized"); + ServerStage::Initialized { + config, + database_errors: results + .into_iter() + .filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?)))) + .collect(), + } + } + // Error -> return to InitReady + Err(err) => { + error!(%err, "error during server init"); + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error: Some(Arc::new(err)), + } + } + }; + + *self.stage.write() = new_stage; } pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> { @@ -640,11 +772,10 @@ where default_time: i64, ) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -744,9 +875,12 @@ where node_group: &[ServerId], entry: Entry, ) -> Result<()> { + // Return an error if this server is not yet ready + let config = self.config()?; + let addrs: Vec<_> = node_group .iter() - .filter_map(|&node| self.config.resolve_remote(node)) + .filter_map(|&node| config.resolve_remote(node)) .collect(); if addrs.is_empty() { return NoRemoteConfigured { node_group }.fail(); @@ -775,11 +909,10 @@ where pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -825,11 +958,11 @@ where } pub fn db(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name) + self.config().ok()?.db_initialized(name) } pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name).map(|d| d.rules()) + self.db(name).map(|d| d.rules()) } // Update database rules and save on success. 
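The stage transition above relies on parking_lot's upgradable read lock so that checking the current stage and moving to `Initializing` happen atomically, without holding a write lock across the later `.await`. A minimal, self-contained sketch of that pattern, assuming only the parking_lot API already imported in this patch and an illustrative two-variant stage (the real `ServerStage` carries more states and payloads):

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

// Illustrative stand-in for the richer ServerStage enum.
enum Stage {
    Ready,
    Busy,
}

// Take the Ready -> Busy transition atomically: readers are only blocked for
// the brief upgrade, and no write lock is held afterwards.
fn try_begin(stage: &RwLock<Stage>) -> bool {
    let guard = stage.upgradable_read();
    if matches!(&*guard, Stage::Ready) {
        let mut write = RwLockUpgradableReadGuard::upgrade(guard);
        *write = Stage::Busy;
        true
    } else {
        false
    }
}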
@@ -841,8 +974,8 @@ where where F: FnOnce(DatabaseRules) -> Result + Send, { - let rules = self - .config + let config = self.config()?; + let rules = config .update_db_rules(db_name, update) .map_err(|e| match e { crate::config::UpdateError::Closure(e) => UpdateError::Closure(e), @@ -854,16 +987,23 @@ where Ok(rules) } - pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> { - self.config.remotes_sorted() + pub fn remotes_sorted(&self) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.remotes_sorted()) } - pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) { - self.config.update_remote(id, addr) + pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + config.update_remote(id, addr); + Ok(()) } - pub fn delete_remote(&self, id: ServerId) -> Option { - self.config.delete_remote(id) + pub fn delete_remote(&self, id: ServerId) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.delete_remote(id)) } pub fn spawn_dummy_job(&self, nanos: Vec) -> TaskTracker { @@ -893,14 +1033,15 @@ where partition_key: impl Into, chunk_id: u32, ) -> Result> { + let config = self.require_initialized()?; + let db_name = db_name.to_string(); let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?; let partition_key = partition_key.into(); let table_name = table_name.into(); - let db = self - .config + let db = config .db_initialized(&name) .context(DatabaseNotFound { db_name: &db_name })?; @@ -921,25 +1062,62 @@ where /// DB jobs and this command. pub fn wipe_preserved_catalog( &self, - db_name: DatabaseName<'static>, + db_name: &DatabaseName<'static>, ) -> Result> { - if self.config.db_initialized(&db_name).is_some() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + // Can only wipe catalog of database that failed to initialize + let config = match &*self.stage.read() { + ServerStage::Initialized { + config, + database_errors, + } => { + if config.db_initialized(db_name).is_some() { + return Err(Error::DatabaseAlreadyExists { + db_name: db_name.to_string(), + }); + } + + if !database_errors.contains_key(db_name.as_str()) { + // TODO: Should this be an error? Some end-to-end tests assume it is non-fatal + warn!(%db_name, "wiping database not present at startup"); + } + Arc::clone(config) + } + ServerStage::Startup { .. } => return Err(Error::IdNotSet), + ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => { + return Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + }; let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog { db_name: db_name.to_string(), }); - let object_store = Arc::clone(&self.store); - let config = Arc::clone(&self.config); - let server_id = self.require_id()?; - let init_status = Arc::clone(&self.init_status); + + let state = Arc::clone(&self.stage); + let db_name = db_name.clone(); + let task = async move { - init_status - .wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name) - .await + let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await; + + match &mut *state.write() { + ServerStage::Initialized { + database_errors, .. 
+ } => match result { + Ok(_) => { + info!(%db_name, "wiped preserved catalog of registered database and recovered"); + database_errors.remove(db_name.as_str()); + Ok(()) + } + Err(e) => { + warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); + let e = Arc::new(e); + database_errors.insert(db_name.to_string(), Arc::clone(&e)); + Err(e) + } + }, + _ => unreachable!("server cannot become uninitialized"), + } }; tokio::spawn(task.track(registration)); @@ -973,7 +1151,9 @@ where } info!("shutting down background workers"); - self.config.drain().await; + if let Ok(config) = self.config() { + config.drain().await; + } info!("draining tracker registry"); @@ -999,11 +1179,15 @@ where type Error = Error; fn db_names_sorted(&self) -> Vec { - self.config - .db_names_sorted() - .iter() - .map(|i| i.clone().into()) - .collect() + self.config() + .map(|config| { + config + .db_names_sorted() + .iter() + .map(ToString::to_string) + .collect() + }) + .unwrap_or_default() } fn db(&self, name: &str) -> Option> { @@ -1214,25 +1398,15 @@ mod tests { let manager = TestConnectionManager::new(); let server = Server::new(manager, config()); - let resp = server.require_id().unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + let resp = server.config().unwrap_err(); + assert!(matches!(resp, Error::IdNotSet)); let lines = parsed_lines("cpu foo=1 10"); let resp = server .write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME) .await .unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(resp, Error::IdNotSet)); } #[tokio::test] @@ -1589,7 +1763,9 @@ mod tests { ); // one remote is configured but it's down and we'll get connection error - server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()); + server + .update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()) + .unwrap(); let err = server .write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME) .await @@ -1606,8 +1782,12 @@ mod tests { // We configure the address for the other remote, this time connection will succeed // despite the bad remote failing to connect. - server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()); - server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()); + server + .update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()) + .unwrap(); + server + .update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()) + .unwrap(); // Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable // probability both the remotes will get hit. 
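The bookkeeping done by the background wipe task above boils down to "run a job, then record its outcome in a shared per-database error map". A stripped-down sketch of that shape, assuming tokio and parking_lot; names here are illustrative, and the real code additionally goes through the job registry and the `ServerStage` lock rather than a bare map:

use std::{collections::HashMap, sync::Arc};
use parking_lot::Mutex;

// Shared map of database name -> last init error, mirroring `database_errors`.
type ErrorMap = Arc<Mutex<HashMap<String, Arc<String>>>>;

// Spawn a job and record success (clearing any previous error) or failure.
fn spawn_recorded<F>(errors: ErrorMap, db_name: String, job: F) -> tokio::task::JoinHandle<()>
where
    F: std::future::Future<Output = Result<(), String>> + Send + 'static,
{
    tokio::spawn(async move {
        match job.await {
            Ok(()) => {
                errors.lock().remove(&db_name);
            }
            Err(e) => {
                errors.lock().insert(db_name, Arc::new(e));
            }
        }
    })
}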
@@ -1844,12 +2024,7 @@ mod tests { let err = create_simple_database(&server, "bananas") .await .unwrap_err(); - assert!(matches!( - err, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(err, Error::IdNotSet)); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); // do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready @@ -1873,7 +2048,7 @@ mod tests { let t_0 = Instant::now(); loop { - if server.require_initialized().is_ok() { + if server.config().is_ok() { break; } assert!(t_0.elapsed() < Duration::from_secs(10)); @@ -1916,9 +2091,12 @@ mod tests { create_simple_database(&server, "foo") .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap()); @@ -2003,18 +2181,24 @@ mod tests { let server = Server::new(manager, config); server.set_id(server_id).unwrap(); server.maybe_initialize_server().await; + create_simple_database(&server, db_name_existing.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_rules_broken.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_catalog_broken.clone()) .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store to break one database let path = object_store_path_for_database_config(&root, &db_name_rules_broken); @@ -2045,22 +2229,18 @@ mod tests { let store = Arc::try_unwrap(store).unwrap(); store.get(&path).await.unwrap(); let manager = TestConnectionManager::new(); - let config = config_with_store(store); - let server = Server::new(manager, config); - // need to disable auto-wipe for this test - server - .init_status - .wipe_on_error - .store(false, std::sync::atomic::Ordering::Relaxed); + let mut config = config_with_store(store); + config.wipe_catalog_on_error = false; + let server = Server::new(manager, config); // cannot wipe if server ID is not set assert_eq!( server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap_err() .to_string(), - "cannot get id: unable to use server until id is set" + "id not set" ); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); @@ -2069,31 +2249,29 @@ mod tests { // 1. cannot wipe if DB exists assert_eq!( server - .wipe_preserved_catalog(db_name_existing.clone()) + .wipe_preserved_catalog(&db_name_existing) .unwrap_err() .to_string(), "database already exists: db_existing" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_existing.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str()) + .await + .unwrap() + ); // 2. 
wiping a non-existing DB just works, but won't bring DB into existence assert!(server.error_database(&db_name_non_existing).is_none()); PreservedCatalog::new_empty::( Arc::clone(&server.store), - server.require_id().unwrap(), + server_id, db_name_non_existing.to_string(), (), ) .await .unwrap(); let tracker = server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2103,7 +2281,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_non_existing.to_string() ) .await @@ -2114,7 +2292,7 @@ mod tests { // 3. wipe DB with broken rules file, this won't bring DB back to life assert!(server.error_database(&db_name_rules_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_rules_broken.clone()) + .wipe_preserved_catalog(&db_name_rules_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2124,7 +2302,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_rules_broken.to_string() ) .await @@ -2135,7 +2313,7 @@ mod tests { // 4. wipe DB with broken catalog, this will bring the DB back to life assert!(server.error_database(&db_name_catalog_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_catalog_broken.clone()) + .wipe_preserved_catalog(&db_name_catalog_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2145,7 +2323,7 @@ mod tests { tracker.join().await; assert!(PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_catalog_broken.to_string() ) .await @@ -2166,18 +2344,16 @@ mod tests { .unwrap(); assert_eq!( server - .wipe_preserved_catalog(db_name_created.clone()) + .wipe_preserved_catalog(&db_name_created) .unwrap_err() .to_string(), "database already exists: db_created" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_created.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string()) + .await + .unwrap() + ); } #[tokio::test] diff --git a/src/influxdb_ioxd/rpc/error.rs b/src/influxdb_ioxd/rpc/error.rs index 4b3b95f314..a6ab258497 100644 --- a/src/influxdb_ioxd/rpc/error.rs +++ b/src/influxdb_ioxd/rpc/error.rs @@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status { use server::Error; match error { - Error::GetIdError { .. 
} => PreconditionViolation { + Error::IdNotSet => PreconditionViolation { category: "Writer ID".to_string(), subject: "influxdata.com/iox".to_string(), description: "Writer ID must be set".to_string(), diff --git a/src/influxdb_ioxd/rpc/management.rs b/src/influxdb_ioxd/rpc/management.rs index 5f81db5e46..2b26040d04 100644 --- a/src/influxdb_ioxd/rpc/management.rs +++ b/src/influxdb_ioxd/rpc/management.rs @@ -56,7 +56,7 @@ where &self, _: Request, ) -> Result, Status> { - match self.server.require_id().ok() { + match self.server.server_id() { Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })), None => return Err(NotFound::default().into()), } @@ -71,7 +71,7 @@ where match self.server.set_id(id) { Ok(_) => Ok(Response::new(UpdateServerIdResponse {})), - Err(e @ Error::SetIdError { .. }) => { + Err(e @ Error::IdAlreadySet) => { return Err(FieldViolation { field: "id".to_string(), description: e.to_string(), @@ -199,15 +199,18 @@ where &self, _: Request, ) -> Result, Status> { - let remotes = self - .server - .remotes_sorted() - .into_iter() - .map(|(id, connection_string)| Remote { - id: id.get_u32(), - connection_string, - }) - .collect(); + let result = self.server.remotes_sorted(); + let remotes = match result { + Ok(remotes) => remotes + .into_iter() + .map(|(id, connection_string)| Remote { + id: id.get_u32(), + connection_string, + }) + .collect(), + Err(e) => return Err(default_server_error_handler(e)), + }; + Ok(Response::new(ListRemotesResponse { remotes })) } @@ -221,8 +224,16 @@ where .ok_or_else(|| FieldViolation::required("remote"))?; let remote_id = ServerId::try_from(remote.id) .map_err(|_| FieldViolation::required("id").scope("remote"))?; - self.server + + let result = self + .server .update_remote(remote_id, remote.connection_string); + + match result { + Ok(_) => {} + Err(e) => return Err(default_server_error_handler(e)), + } + Ok(Response::new(UpdateRemoteResponse {})) } @@ -233,9 +244,12 @@ where let request = request.into_inner(); let remote_id = ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?; - self.server - .delete_remote(remote_id) - .ok_or_else(NotFound::default)?; + + match self.server.delete_remote(remote_id) { + Ok(Some(_)) => {} + Ok(None) => return Err(NotFound::default().into()), + Err(e) => return Err(default_server_error_handler(e)), + } Ok(Response::new(DeleteRemoteResponse {})) } @@ -455,7 +469,7 @@ where let tracker = self .server - .wipe_preserved_catalog(db_name) + .wipe_preserved_catalog(&db_name) .map_err(|e| match e { Error::DatabaseAlreadyExists { db_name } => AlreadyExists { resource_type: "database".to_string(), diff --git a/tests/end_to_end_cases/management_api.rs b/tests/end_to_end_cases/management_api.rs index 2f26969085..d543e7fb68 100644 --- a/tests/end_to_end_cases/management_api.rs +++ b/tests/end_to_end_cases/management_api.rs @@ -65,6 +65,8 @@ async fn test_list_update_remotes() { const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321"; const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321"; + client.update_server_id(123).await.unwrap(); + let res = client.list_remotes().await.expect("list remotes failed"); assert_eq!(res.len(), 0); diff --git a/tests/end_to_end_cases/management_cli.rs b/tests/end_to_end_cases/management_cli.rs index a09285c695..3e0be27290 100644 --- a/tests/end_to_end_cases/management_cli.rs +++ b/tests/end_to_end_cases/management_cli.rs @@ -244,6 +244,18 @@ async fn test_list_chunks_error() { async fn test_remotes() { let server_fixture = 
ServerFixture::create_single_use().await;
     let addr = server_fixture.grpc_base();
+
+    Command::cargo_bin("influxdb_iox")
+        .unwrap()
+        .arg("server")
+        .arg("set")
+        .arg("32")
+        .arg("--host")
+        .arg(addr)
+        .assert()
+        .success()
+        .stdout(predicate::str::contains("Ok"));
+
     Command::cargo_bin("influxdb_iox")
         .unwrap()
         .arg("server")

From b0663a0337610dfbebc7ec28224a1814e847bf8b Mon Sep 17 00:00:00 2001
From: Marco Neumann
Date: Tue, 20 Jul 2021 12:35:20 +0200
Subject: [PATCH 08/27] feat: disallow multiple write buffer streams and
 seeking while streaming

Multiple streams will mess up ordering. Seeking while streaming is likely a
bug and should not work.
---
 server/src/db.rs          |  5 ++-
 write_buffer/src/core.rs  | 68 ++++++++++++------------------
 write_buffer/src/guard.rs | 87 +++++++++++++++++++++++++++++++++++++++
 write_buffer/src/kafka.rs | 32 +++++++++++---
 write_buffer/src/lib.rs   |  1 +
 write_buffer/src/mock.rs  | 24 +++++++++--
 6 files changed, 167 insertions(+), 50 deletions(-)
 create mode 100644 write_buffer/src/guard.rs

diff --git a/server/src/db.rs b/server/src/db.rs
index c39655f107..59348fc956 100644
--- a/server/src/db.rs
+++ b/server/src/db.rs
@@ -657,7 +657,10 @@ impl Db {
         async {
             if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer {
                 let mut futures = vec![];
-                for (_sequencer_id, stream) in write_buffer.streams() {
+                for (_sequencer_id, stream) in write_buffer
+                    .streams()
+                    .expect("no streams should exist at this point")
+                {
                     let fut = self.stream_in_sequenced_entries(stream);
                     futures.push(fut);
                 }
diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs
index 27c7512884..1f91d955ab 100644
--- a/write_buffer/src/core.rs
+++ b/write_buffer/src/core.rs
@@ -29,13 +29,13 @@ pub type EntryStream<'a> = BoxStream<'a, Result<SequencedEntry, WriteBufferError>>;
-    fn streams(&self) -> Vec<(u32, EntryStream<'_>)>;
+    /// When calling this method a second time while the streams of the first call are still in use, an error will be returned.
+    fn streams(&self) -> Result<Vec<(u32, EntryStream<'_>)>, WriteBufferError>;
 
     /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least
     /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream).
+    ///
+    /// When calling this method while streams (from [`streams`](Self::streams)) are in use, an error will be returned.
async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; } @@ -68,7 +68,7 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; - test_multi_stream_io(&adapter).await; + test_multi_stream(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; @@ -87,7 +87,7 @@ pub mod test_utils { let writer = context.writing(); let reader = context.reading().await; - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), 1); let (sequencer_id, mut stream) = streams.pop().unwrap(); @@ -127,7 +127,7 @@ pub mod test_utils { let writer = context.writing(); let reader = context.reading().await; - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), 2); let (sequencer_id_1, mut stream_1) = streams.pop().unwrap(); let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); @@ -158,45 +158,26 @@ pub mod test_utils { assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); } - async fn test_multi_stream_io(adapter: &T) + async fn test_multi_stream(adapter: &T) where T: TestAdapter, { - let context = adapter.new_context(1).await; - - let entry_1 = lp_to_entry("upc user=1 100"); - let entry_2 = lp_to_entry("upc user=2 200"); - let entry_3 = lp_to_entry("upc user=3 300"); - - let writer = context.writing(); + let context = adapter.new_context(2).await; let reader = context.reading().await; - let mut streams_1 = reader.streams(); - let mut streams_2 = reader.streams(); - assert_eq!(streams_1.len(), 1); - assert_eq!(streams_2.len(), 1); - let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap(); - let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap(); - assert_eq!(sequencer_id_1, sequencer_id_2); + let mut streams = reader.streams().unwrap(); + assert_eq!(streams.len(), 2); + let (_sequencer_id, stream_1) = streams.pop().unwrap(); + let (_sequencer_id, stream_2) = streams.pop().unwrap(); - let waker = futures::task::noop_waker(); - let mut cx = futures::task::Context::from_waker(&waker); + // cannot get another stream while streams are in use + assert!(reader.streams().is_err()); + drop(stream_1); + assert!(reader.streams().is_err()); + drop(stream_2); - // empty streams is pending - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - // streams poll from same source - writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_2, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); - - // both streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + // when all streams are dropped, we can get new ones + reader.streams().unwrap(); } async fn test_multi_writer_multi_reader(adapter: &T) @@ -269,13 +250,18 @@ pub mod test_utils { // seek to far end and then at data reader_1.seek(0, 1_000_000).await.unwrap(); let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; - let mut streams = reader_1.streams(); + let mut streams = reader_1.streams().unwrap(); assert_eq!(streams.len(), 2); let 
(_sequencer_id, mut stream_1) = streams.pop().unwrap(); let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + // seeking while streams are in use is an error + reader_1.seek(0, 0).await.unwrap_err(); + drop(stream_1); + drop(stream_2); + // seeking unknown sequencer is NOT an error reader_1.seek(0, 42).await.unwrap(); } @@ -284,7 +270,7 @@ pub mod test_utils { where R: WriteBufferReading, { - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), expected.len()); streams.sort_by_key(|(sequencer_id, _stream)| *sequencer_id); diff --git a/write_buffer/src/guard.rs b/write_buffer/src/guard.rs new file mode 100644 index 0000000000..636193a15c --- /dev/null +++ b/write_buffer/src/guard.rs @@ -0,0 +1,87 @@ +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +/// A semaphore that produces [`Send`]able guards. +pub struct Semaphore { + user_count: Arc, +} + +impl Semaphore { + /// Creates new semaphore with a single permit. + pub fn new() -> Self { + Self { + user_count: Arc::new(AtomicUsize::new(0)), + } + } + + /// Creates guard if no permit exists. + /// + /// To produce multiple guards, you can clone an existing one. + pub fn guard(&self) -> Option { + let count = self.user_count.fetch_add(1, Ordering::SeqCst); + if count > 0 { + self.user_count.fetch_sub(1, Ordering::SeqCst); + None + } else { + Some(Guard { + user_count: Arc::clone(&self.user_count), + }) + } + } +} + +/// Guard that hols a [`Semaphore`] permit. +/// +/// New guards can be produced in two ways: +/// - cloning an existing one +/// - when no guard exists: using [`Semaphore::guard`]. +pub struct Guard { + user_count: Arc, +} + +impl Guard { + /// Use a guard. + /// + /// This is a no-op but is helpful if you need to reference a guard within a closure. + pub fn use_here(&self) {} +} + +impl Clone for Guard { + /// Clone guard and increase usage count. 
+ fn clone(&self) -> Self { + self.user_count.fetch_add(1, Ordering::SeqCst); + Self { + user_count: Arc::clone(&self.user_count), + } + } +} + +impl Drop for Guard { + fn drop(&mut self) { + self.user_count.fetch_sub(1, Ordering::SeqCst); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test() { + let s = Semaphore::new(); + + let g = s.guard().unwrap(); + assert!(s.guard().is_none()); + drop(g); + + let g1 = s.guard().unwrap(); + let g2 = g1.clone(); + assert!(s.guard().is_none()); + drop(g1); + assert!(s.guard().is_none()); + drop(g2); + s.guard().unwrap(); + } +} diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 0624a06d18..abd960a521 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -18,7 +18,10 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::{ + core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, + guard::Semaphore, +}; pub struct KafkaBufferProducer { conn: String, @@ -97,6 +100,7 @@ pub struct KafkaBufferConsumer { conn: String, database_name: String, consumers: BTreeMap>, + semaphore: Semaphore, } // Needed because rdkafka's StreamConsumer doesn't impl Debug @@ -111,13 +115,23 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { - self.consumers + fn streams(&self) -> Result)>, WriteBufferError> { + let guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + + let streams: Vec<_> = self + .consumers .iter() .map(|(sequencer_id, consumer)| { + let guard = guard.clone(); + let stream = consumer .stream() - .map(|message| { + .map(move |message| { + guard.use_here(); + let message = message?; let entry = Entry::try_from(message.payload().unwrap().to_vec())?; let sequence = Sequence { @@ -130,10 +144,17 @@ impl WriteBufferReading for KafkaBufferConsumer { .boxed(); (*sequencer_id, stream) }) - .collect() + .collect(); + + Ok(streams) } async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let _guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + if let Some(consumer) = self.consumers.get(&sequencer_id) { let consumer = Arc::clone(consumer); let database_name = self.database_name.clone(); @@ -213,6 +234,7 @@ impl KafkaBufferConsumer { conn, database_name, consumers, + semaphore: Semaphore::new(), }) } diff --git a/write_buffer/src/lib.rs b/write_buffer/src/lib.rs index 9e9472940a..a165fa3eb8 100644 --- a/write_buffer/src/lib.rs +++ b/write_buffer/src/lib.rs @@ -10,5 +10,6 @@ pub mod config; pub mod core; +mod guard; pub mod kafka; pub mod mock; diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index cb4322199a..701b8af703 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -5,7 +5,10 @@ use entry::{Entry, Sequence, SequencedEntry}; use futures::{stream, StreamExt}; use parking_lot::Mutex; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::{ + core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, + guard::Semaphore, +}; type EntryResVec = Vec>; @@ -165,6 +168,7 @@ struct PlaybackState { pub struct MockBufferForReading { shared_state: MockBufferSharedState, playback_states: Arc>>, + semaphore: Semaphore, } 
impl MockBufferForReading { @@ -185,6 +189,7 @@ impl MockBufferForReading { Self { shared_state: state, playback_states: Arc::new(Mutex::new(playback_states)), + semaphore: Semaphore::new(), } } } @@ -197,7 +202,12 @@ impl std::fmt::Debug for MockBufferForReading { #[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { + fn streams(&self) -> Result)>, WriteBufferError> { + let guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + let sequencer_ids: Vec<_> = { let playback_states = self.playback_states.lock(); playback_states.keys().copied().collect() @@ -207,8 +217,11 @@ impl WriteBufferReading for MockBufferForReading { for sequencer_id in sequencer_ids { let shared_state = self.shared_state.clone(); let playback_states = Arc::clone(&self.playback_states); + let guard = guard.clone(); let stream = stream::poll_fn(move |_ctx| { + guard.use_here(); + let entries = shared_state.entries.lock(); let mut playback_states = playback_states.lock(); @@ -247,10 +260,15 @@ impl WriteBufferReading for MockBufferForReading { streams.push((sequencer_id, stream)); } - streams + Ok(streams) } async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let _guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + let mut playback_states = self.playback_states.lock(); if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { From cc0aaa58a7bd71e1a8489ecc5af4dd3731c0ccf2 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 20 Jul 2021 12:43:10 +0100 Subject: [PATCH 09/27] test: ensure high enough limit --- lifecycle/src/policy.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lifecycle/src/policy.rs b/lifecycle/src/policy.rs index dfe713fa2c..beb35464ea 100644 --- a/lifecycle/src/policy.rs +++ b/lifecycle/src/policy.rs @@ -1399,6 +1399,7 @@ mod tests { let rules = LifecycleRules { late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; @@ -1538,6 +1539,7 @@ mod tests { persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; let now = Instant::now(); From ec7ebdff2994a3b8ebd85c14d48a7827e388ae5c Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 20 Jul 2021 13:52:33 +0200 Subject: [PATCH 10/27] refactor: use lifetimes to ensure single stream / no seek while streaming --- server/src/db.rs | 26 +++++++----- write_buffer/src/config.rs | 6 ++- write_buffer/src/core.rs | 79 ++++++++++++++-------------------- write_buffer/src/guard.rs | 87 -------------------------------------- write_buffer/src/kafka.rs | 36 ++++------------ write_buffer/src/lib.rs | 1 - write_buffer/src/mock.rs | 30 ++++--------- 7 files changed, 68 insertions(+), 197 deletions(-) delete mode 100644 write_buffer/src/guard.rs diff --git a/server/src/db.rs b/server/src/db.rs index 59348fc956..020c91fd21 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -656,11 +656,11 @@ impl Db { // streaming from the write buffer loop async { if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer { + let mut write_buffer = write_buffer + .try_lock() + .expect("no streams should exist 
at this point"); let mut futures = vec![]; - for (_sequencer_id, stream) in write_buffer - .streams() - .expect("no streams should exist at this point") - { + for (_sequencer_id, stream) in write_buffer.streams() { let fut = self.stream_in_sequenced_entries(stream); futures.push(fut); } @@ -1215,10 +1215,12 @@ mod tests { let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1); write_buffer_state .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await .db; @@ -1274,10 +1276,12 @@ mod tests { String::from("Something bad happened on the way to creating a SequencedEntry").into(), 0, ); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let test_db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await; @@ -2262,10 +2266,12 @@ mod tests { ); write_buffer_state .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await .db; diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index 6aacefd530..d6c69e6341 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -13,7 +13,7 @@ use crate::{ #[derive(Debug)] pub enum WriteBufferConfig { Writing(Arc), - Reading(Arc), + Reading(Arc>>), } impl WriteBufferConfig { @@ -36,7 +36,9 @@ impl WriteBufferConfig { Some(WriteBufferConnection::Reading(conn)) => { let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?; - Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _))) + Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new( + Box::new(kafka_buffer) as _, + ))))) } None => Ok(None), } diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index 1f91d955ab..fdefc76746 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -28,15 +28,15 @@ pub type EntryStream<'a> = BoxStream<'a, Result Result)>, WriteBufferError>; + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>; /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). - /// - /// When calling this methods while streams (from [`streams`](Self::streams)) are in use, an error will be returned. 
- async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError>; } pub mod test_utils { @@ -68,7 +68,6 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; - test_multi_stream(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; @@ -85,9 +84,9 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; + let mut reader = context.reading().await; - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), 1); let (sequencer_id, mut stream) = streams.pop().unwrap(); @@ -125,9 +124,9 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; + let mut reader = context.reading().await; - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), 2); let (sequencer_id_1, mut stream_1) = streams.pop().unwrap(); let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); @@ -158,28 +157,6 @@ pub mod test_utils { assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); } - async fn test_multi_stream(adapter: &T) - where - T: TestAdapter, - { - let context = adapter.new_context(2).await; - let reader = context.reading().await; - - let mut streams = reader.streams().unwrap(); - assert_eq!(streams.len(), 2); - let (_sequencer_id, stream_1) = streams.pop().unwrap(); - let (_sequencer_id, stream_2) = streams.pop().unwrap(); - - // cannot get another stream while streams are in use - assert!(reader.streams().is_err()); - drop(stream_1); - assert!(reader.streams().is_err()); - drop(stream_2); - - // when all streams are dropped, we can get new ones - reader.streams().unwrap(); - } - async fn test_multi_writer_multi_reader(adapter: &T) where T: TestAdapter, @@ -192,8 +169,8 @@ pub mod test_utils { let writer_1 = context.writing(); let writer_2 = context.writing(); - let reader_1 = context.reading().await; - let reader_2 = context.reading().await; + let mut reader_1 = context.reading().await; + let mut reader_2 = context.reading().await; // TODO: do not hard-code sequencer IDs here but provide a proper interface writer_1.store_entry(&entry_east_1, 0).await.unwrap(); @@ -201,12 +178,12 @@ pub mod test_utils { writer_2.store_entry(&entry_east_2, 0).await.unwrap(); assert_reader_content( - &reader_1, + &mut reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; assert_reader_content( - &reader_2, + &mut reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; @@ -231,46 +208,52 @@ pub mod test_utils { let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number; let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number; - let reader_1 = context.reading().await; - let reader_2 = context.reading().await; + let mut reader_1 = context.reading().await; + let mut reader_2 = context.reading().await; // forward seek reader_1.seek(0, sequence_number_east_2).await.unwrap(); - assert_reader_content(&reader_1, &[(0, &[&entry_east_2]), (1, &[&entry_west_1])]).await; assert_reader_content( - &reader_2, + &mut reader_1, + &[(0, &[&entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + 
assert_reader_content( + &mut reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; // backward seek reader_1.seek(0, 0).await.unwrap(); - assert_reader_content(&reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])]).await; + assert_reader_content( + &mut reader_1, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])], + ) + .await; // seek to far end and then at data reader_1.seek(0, 1_000_000).await.unwrap(); let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; - let mut streams = reader_1.streams().unwrap(); + let mut streams = reader_1.streams(); assert_eq!(streams.len(), 2); let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - // seeking while streams are in use is an error - reader_1.seek(0, 0).await.unwrap_err(); drop(stream_1); drop(stream_2); + drop(streams); // seeking unknown sequencer is NOT an error reader_1.seek(0, 42).await.unwrap(); } - async fn assert_reader_content(reader: &R, expected: &[(u32, &[&Entry])]) + async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, { - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), expected.len()); streams.sort_by_key(|(sequencer_id, _stream)| *sequencer_id); diff --git a/write_buffer/src/guard.rs b/write_buffer/src/guard.rs deleted file mode 100644 index 636193a15c..0000000000 --- a/write_buffer/src/guard.rs +++ /dev/null @@ -1,87 +0,0 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, -}; - -/// A semaphore that produces [`Send`]able guards. -pub struct Semaphore { - user_count: Arc, -} - -impl Semaphore { - /// Creates new semaphore with a single permit. - pub fn new() -> Self { - Self { - user_count: Arc::new(AtomicUsize::new(0)), - } - } - - /// Creates guard if no permit exists. - /// - /// To produce multiple guards, you can clone an existing one. - pub fn guard(&self) -> Option { - let count = self.user_count.fetch_add(1, Ordering::SeqCst); - if count > 0 { - self.user_count.fetch_sub(1, Ordering::SeqCst); - None - } else { - Some(Guard { - user_count: Arc::clone(&self.user_count), - }) - } - } -} - -/// Guard that hols a [`Semaphore`] permit. -/// -/// New guards can be produced in two ways: -/// - cloning an existing one -/// - when no guard exists: using [`Semaphore::guard`]. -pub struct Guard { - user_count: Arc, -} - -impl Guard { - /// Use a guard. - /// - /// This is a no-op but is helpful if you need to reference a guard within a closure. - pub fn use_here(&self) {} -} - -impl Clone for Guard { - /// Clone guard and increase usage count. 
- fn clone(&self) -> Self { - self.user_count.fetch_add(1, Ordering::SeqCst); - Self { - user_count: Arc::clone(&self.user_count), - } - } -} - -impl Drop for Guard { - fn drop(&mut self) { - self.user_count.fetch_sub(1, Ordering::SeqCst); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test() { - let s = Semaphore::new(); - - let g = s.guard().unwrap(); - assert!(s.guard().is_none()); - drop(g); - - let g1 = s.guard().unwrap(); - let g2 = g1.clone(); - assert!(s.guard().is_none()); - drop(g1); - assert!(s.guard().is_none()); - drop(g2); - s.guard().unwrap(); - } -} diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index abd960a521..3255f43b2b 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -18,10 +18,7 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::{ - core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, - guard::Semaphore, -}; +use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; pub struct KafkaBufferProducer { conn: String, @@ -100,7 +97,6 @@ pub struct KafkaBufferConsumer { conn: String, database_name: String, consumers: BTreeMap>, - semaphore: Semaphore, } // Needed because rdkafka's StreamConsumer doesn't impl Debug @@ -115,23 +111,13 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams(&self) -> Result)>, WriteBufferError> { - let guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - - let streams: Vec<_> = self - .consumers + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { + self.consumers .iter() .map(|(sequencer_id, consumer)| { - let guard = guard.clone(); - let stream = consumer .stream() .map(move |message| { - guard.use_here(); - let message = message?; let entry = Entry::try_from(message.payload().unwrap().to_vec())?; let sequence = Sequence { @@ -144,17 +130,14 @@ impl WriteBufferReading for KafkaBufferConsumer { .boxed(); (*sequencer_id, stream) }) - .collect(); - - Ok(streams) + .collect() } - async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { - let _guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { if let Some(consumer) = self.consumers.get(&sequencer_id) { let consumer = Arc::clone(consumer); let database_name = self.database_name.clone(); @@ -234,7 +217,6 @@ impl KafkaBufferConsumer { conn, database_name, consumers, - semaphore: Semaphore::new(), }) } diff --git a/write_buffer/src/lib.rs b/write_buffer/src/lib.rs index a165fa3eb8..9e9472940a 100644 --- a/write_buffer/src/lib.rs +++ b/write_buffer/src/lib.rs @@ -10,6 +10,5 @@ pub mod config; pub mod core; -mod guard; pub mod kafka; pub mod mock; diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index 701b8af703..37659ba05b 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -5,10 +5,7 @@ use entry::{Entry, Sequence, SequencedEntry}; use futures::{stream, StreamExt}; use parking_lot::Mutex; -use crate::{ - core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, - guard::Semaphore, -}; +use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; type EntryResVec = Vec>; @@ -168,7 +165,6 @@ struct PlaybackState { pub struct 
MockBufferForReading { shared_state: MockBufferSharedState, playback_states: Arc>>, - semaphore: Semaphore, } impl MockBufferForReading { @@ -189,7 +185,6 @@ impl MockBufferForReading { Self { shared_state: state, playback_states: Arc::new(Mutex::new(playback_states)), - semaphore: Semaphore::new(), } } } @@ -202,12 +197,7 @@ impl std::fmt::Debug for MockBufferForReading { #[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams(&self) -> Result)>, WriteBufferError> { - let guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { let sequencer_ids: Vec<_> = { let playback_states = self.playback_states.lock(); playback_states.keys().copied().collect() @@ -217,11 +207,8 @@ impl WriteBufferReading for MockBufferForReading { for sequencer_id in sequencer_ids { let shared_state = self.shared_state.clone(); let playback_states = Arc::clone(&self.playback_states); - let guard = guard.clone(); let stream = stream::poll_fn(move |_ctx| { - guard.use_here(); - let entries = shared_state.entries.lock(); let mut playback_states = playback_states.lock(); @@ -260,15 +247,14 @@ impl WriteBufferReading for MockBufferForReading { streams.push((sequencer_id, stream)); } - Ok(streams) + streams } - async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { - let _guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { let mut playback_states = self.playback_states.lock(); if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { From c01cfbc34c21d98c3fda8ac2a90c83139a171be4 Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Tue, 20 Jul 2021 14:17:37 +0200 Subject: [PATCH 11/27] fix: Increase kafka message size --- write_buffer/src/kafka.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 15a27a401c..bcdea87659 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -77,8 +77,8 @@ impl KafkaBufferProducer { let mut cfg = ClientConfig::new(); cfg.set("bootstrap.servers", &conn); cfg.set("message.timeout.ms", "5000"); - cfg.set("message.max.bytes", "10000000"); - cfg.set("queue.buffering.max.kbytes", "10485760"); + cfg.set("message.max.bytes", "31457280"); + cfg.set("queue.buffering.max.kbytes", "31457280"); cfg.set("request.required.acks", "all"); // equivalent to acks=-1 let producer: FutureProducer = cfg.create()?; From cf8a60252d03ce52b7960bd23e1c891b5965da6a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 13:19:20 +0100 Subject: [PATCH 12/27] refactor: split system_tables module into smaller modules (#2061) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- server/src/db/system_tables.rs | 698 +--------------------- server/src/db/system_tables/chunks.rs | 201 +++++++ server/src/db/system_tables/columns.rs | 404 +++++++++++++ server/src/db/system_tables/operations.rs | 108 ++++ 4 files changed, 729 insertions(+), 682 deletions(-) create mode 100644 server/src/db/system_tables/chunks.rs create mode 100644 server/src/db/system_tables/columns.rs create mode 100644 server/src/db/system_tables/operations.rs diff --git a/server/src/db/system_tables.rs 
b/server/src/db/system_tables.rs index f80f06b9bc..f83c793fa5 100644 --- a/server/src/db/system_tables.rs +++ b/server/src/db/system_tables.rs @@ -7,38 +7,30 @@ //! //! For example `SELECT * FROM system.chunks` -use std::convert::AsRef; +use std::any::Any; use std::sync::Arc; -use std::{any::Any, collections::HashMap}; - -use chrono::{DateTime, Utc}; use arrow::{ - array::{ - ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray, - UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, - }, - datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, + datatypes::{Field, Schema, SchemaRef}, error::Result, record_batch::RecordBatch, }; -use data_types::{ - chunk_metadata::{ChunkSummary, DetailedChunkSummary}, - error::ErrorLogger, - job::Job, - partition_metadata::PartitionSummary, -}; +use chrono::{DateTime, Utc}; + use datafusion::{ catalog::schema::SchemaProvider, datasource::{datasource::Statistics, TableProvider}, error::{DataFusionError, Result as DataFusionResult}, physical_plan::{memory::MemoryExec, ExecutionPlan}, }; -use tracker::TaskTracker; + +use crate::JobRegistry; use super::catalog::Catalog; -use crate::JobRegistry; -use data_types::partition_metadata::TableSummary; + +mod chunks; +mod columns; +mod operations; // The IOx system schema pub const SYSTEM_SCHEMA: &str = "system"; @@ -67,16 +59,16 @@ impl SystemSchemaProvider { pub fn new(db_name: impl Into, catalog: Arc, jobs: Arc) -> Self { let db_name = db_name.into(); let chunks = Arc::new(SystemTableProvider { - inner: ChunksTable::new(Arc::clone(&catalog)), + inner: chunks::ChunksTable::new(Arc::clone(&catalog)), }); let columns = Arc::new(SystemTableProvider { - inner: ColumnsTable::new(Arc::clone(&catalog)), + inner: columns::ColumnsTable::new(Arc::clone(&catalog)), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: ChunkColumnsTable::new(catalog), + inner: columns::ChunkColumnsTable::new(catalog), }); let operations = Arc::new(SystemTableProvider { - inner: OperationsTable::new(db_name, jobs), + inner: operations::OperationsTable::new(db_name, jobs), }); Self { chunks, @@ -162,407 +154,6 @@ fn time_to_ts(time: Option>) -> Option { time.map(|ts| ts.timestamp_nanos()) } -/// Implementation of system.chunks table -#[derive(Debug)] -struct ChunksTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunksTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunksTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) - .log_if_error("system.chunks table") - } -} - -fn chunk_summaries_schema() -> SchemaRef { - let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::UInt32, false), - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("lifecycle_action", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, false), - Field::new("object_store_bytes", DataType::UInt64, false), - Field::new("row_count", DataType::UInt64, false), - Field::new("time_of_first_write", ts.clone(), true), - Field::new("time_of_last_write", ts.clone(), true), - Field::new("time_closed", ts, true), - ])) -} - -fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { - let id = chunks.iter().map(|c| 
Some(c.id)).collect::(); - let partition_key = chunks - .iter() - .map(|c| Some(c.partition_key.as_ref())) - .collect::(); - let table_name = chunks - .iter() - .map(|c| Some(c.table_name.as_ref())) - .collect::(); - let storage = chunks - .iter() - .map(|c| Some(c.storage.as_str())) - .collect::(); - let lifecycle_action = chunks - .iter() - .map(|c| c.lifecycle_action.map(|a| a.name())) - .collect::(); - let memory_bytes = chunks - .iter() - .map(|c| Some(c.memory_bytes as u64)) - .collect::(); - let object_store_bytes = chunks - .iter() - .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0)) - .collect::(); - let row_counts = chunks - .iter() - .map(|c| Some(c.row_count as u64)) - .collect::(); - let time_of_first_write = chunks - .iter() - .map(|c| c.time_of_first_write) - .map(time_to_ts) - .collect::(); - let time_of_last_write = chunks - .iter() - .map(|c| c.time_of_last_write) - .map(time_to_ts) - .collect::(); - let time_closed = chunks - .iter() - .map(|c| c.time_closed) - .map(time_to_ts) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(id), - Arc::new(partition_key), - Arc::new(table_name), - Arc::new(storage), - Arc::new(lifecycle_action), - Arc::new(memory_bytes), - Arc::new(object_store_bytes), - Arc::new(row_counts), - Arc::new(time_of_first_write), - Arc::new(time_of_last_write), - Arc::new(time_closed), - ], - ) -} - -/// Implementation of `system.columns` system table -#[derive(Debug)] -struct ColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: partition_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - fn batch(&self) -> Result { - from_partition_summaries(self.schema(), self.catalog.partition_summaries()) - .log_if_error("system.columns table") - } -} - -fn partition_summaries_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("column_type", DataType::Utf8, false), - Field::new("influxdb_type", DataType::Utf8, true), - ])) -} - -fn from_partition_summaries( - schema: SchemaRef, - partitions: Vec, -) -> Result { - // Assume each partition has roughly 5 tables with 5 columns - let row_estimate = partitions.len() * 25; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = StringBuilder::new(row_estimate); - let mut column_type = StringBuilder::new(row_estimate); - let mut influxdb_type = StringBuilder::new(row_estimate); - - // Note no rows are produced for partitions with no tabes, or - // tables with no columns: There are other tables to list tables - // and columns - for partition in partitions { - let table = partition.table; - for column in table.columns { - partition_key.append_value(&partition.key)?; - table_name.append_value(&table.name)?; - column_name.append_value(&column.name)?; - column_type.append_value(column.type_name())?; - if let Some(t) = &column.influxdb_type { - influxdb_type.append_value(t.as_str())?; - } else { - influxdb_type.append_null()?; - } - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(column_type.finish()), - Arc::new(influxdb_type.finish()), - ], - ) 
-} - -/// Implementation of system.column_chunks table -#[derive(Debug)] -struct ChunkColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunkColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_columns_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunkColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) - .log_if_error("system.column_chunks table") - } -} - -fn chunk_columns_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("chunk_id", DataType::UInt32, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("row_count", DataType::UInt64, true), - Field::new("min_value", DataType::Utf8, true), - Field::new("max_value", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, true), - ])) -} - -fn assemble_chunk_columns( - schema: SchemaRef, - chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, -) -> Result { - /// Builds an index from column_name -> size - fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { - summary - .columns - .iter() - .map(|column_summary| { - ( - column_summary.name.as_ref(), - column_summary.memory_bytes as u64, - ) - }) - .collect() - } - - // Assume each chunk has roughly 5 columns - let row_estimate = chunk_summaries.len() * 5; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut chunk_id = UInt32Builder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = StringBuilder::new(row_estimate); - let mut storage = StringBuilder::new(row_estimate); - let mut row_count = UInt64Builder::new(row_estimate); - let mut min_values = StringBuilder::new(row_estimate); - let mut max_values = StringBuilder::new(row_estimate); - let mut memory_bytes = UInt64Builder::new(row_estimate); - - // Note no rows are produced for partitions with no chunks, or - // tables with no partitions: There are other tables to list tables - // and columns - for (table_summary, chunk_summary) in chunk_summaries { - let mut column_index = make_column_index(&chunk_summary); - let storage_value = chunk_summary.inner.storage.as_str(); - - for column in &table_summary.columns { - partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; - chunk_id.append_value(chunk_summary.inner.id)?; - table_name.append_value(&chunk_summary.inner.table_name)?; - column_name.append_value(&column.name)?; - storage.append_value(storage_value)?; - row_count.append_value(column.count())?; - if let Some(v) = column.stats.min_as_str() { - min_values.append_value(v)?; - } else { - min_values.append(false)?; - } - if let Some(v) = column.stats.max_as_str() { - max_values.append_value(v)?; - } else { - max_values.append(false)?; - } - - let size = column_index.remove(column.name.as_str()); - - memory_bytes.append_option(size)?; - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(chunk_id.finish()), - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(storage.finish()), - Arc::new(row_count.finish()), - Arc::new(min_values.finish()), - Arc::new(max_values.finish()), - Arc::new(memory_bytes.finish()), - ], - ) -} - -/// Implementation of system.operations table 
-#[derive(Debug)] -struct OperationsTable { - schema: SchemaRef, - db_name: String, - jobs: Arc, -} - -impl OperationsTable { - fn new(db_name: String, jobs: Arc) -> Self { - Self { - schema: operations_schema(), - db_name, - jobs, - } - } -} - -impl IoxSystemTable for OperationsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) - .log_if_error("system.operations table") - } -} - -fn operations_schema() -> SchemaRef { - let ts = DataType::Time64(TimeUnit::Nanosecond); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Utf8, false), - Field::new("status", DataType::Utf8, true), - Field::new("cpu_time_used", ts.clone(), true), - Field::new("wall_time_used", ts, true), - Field::new("partition_key", DataType::Utf8, true), - Field::new("chunk_id", DataType::UInt32, true), - Field::new("description", DataType::Utf8, true), - ])) -} - -fn from_task_trackers( - schema: SchemaRef, - db_name: &str, - jobs: Vec>, -) -> Result { - let jobs = jobs - .into_iter() - .filter(|job| job.metadata().db_name() == Some(db_name)) - .collect::>(); - - let ids = jobs - .iter() - .map(|job| Some(job.id().to_string())) - .collect::(); - let statuses = jobs - .iter() - .map(|job| Some(job.get_status().name())) - .collect::(); - let cpu_time_used = jobs - .iter() - .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) - .collect::(); - let wall_time_used = jobs - .iter() - .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) - .collect::(); - let partition_keys = jobs - .iter() - .map(|job| job.metadata().partition_key()) - .collect::(); - let chunk_ids = jobs - .iter() - .map(|job| job.metadata().chunk_id()) - .collect::(); - let descriptions = jobs - .iter() - .map(|job| Some(job.metadata().description())) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(ids) as ArrayRef, - Arc::new(statuses), - Arc::new(cpu_time_used), - Arc::new(wall_time_used), - Arc::new(partition_keys), - Arc::new(chunk_ids), - Arc::new(descriptions), - ], - ) -} - /// Creates a DataFusion ExecutionPlan node that scans a single batch /// of records. 
fn scan_batch( @@ -605,141 +196,10 @@ fn scan_batch( #[cfg(test)] mod tests { - use super::*; + use arrow::array::{ArrayRef, UInt64Array}; use arrow_util::assert_batches_eq; - use chrono::NaiveDateTime; - use data_types::{ - chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage}, - partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary}, - }; - #[test] - fn test_from_chunk_summaries() { - let chunks = vec![ - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 0, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: None, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(10, 0), - Utc, - )), - time_of_last_write: None, - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 1, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: Some(ChunkLifecycleAction::Persisting), - memory_bytes: 23455, - object_store_bytes: 0, - row_count: 22, - time_of_first_write: None, - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(80, 0), - Utc, - )), - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 2, - storage: ChunkStorage::ObjectStoreOnly, - lifecycle_action: None, - memory_bytes: 1234, - object_store_bytes: 5678, - row_count: 33, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(100, 0), - Utc, - )), - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(200, 0), - Utc, - )), - time_closed: None, - }, - ]; - - let expected = vec![ - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", - "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", - "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - ]; - - let schema = chunk_summaries_schema(); - let batch = from_chunk_summaries(schema, chunks).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } - - #[test] - fn test_from_partition_summaries() { - let partitions = vec![ - PartitionSummary { - key: "p1".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Tag), - stats: Statistics::I64(StatValues::new_with_value(23)), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ColumnSummary { - name: "c3".to_string(), - influxdb_type: None, - 
stats: Statistics::String(StatValues::new_with_value( - "foo".to_string(), - )), - }, - ColumnSummary { - name: "time".to_string(), - influxdb_type: Some(InfluxDbType::Timestamp), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ], - }, - }, - PartitionSummary { - key: "p3".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![], - }, - }, - ]; - - let expected = vec![ - "+---------------+------------+-------------+-------------+---------------+", - "| partition_key | table_name | column_name | column_type | influxdb_type |", - "+---------------+------------+-------------+-------------+---------------+", - "| p1 | t1 | c1 | I64 | Tag |", - "| p1 | t1 | c2 | I64 | Field |", - "| p1 | t1 | c3 | String | |", - "| p1 | t1 | time | I64 | Timestamp |", - "+---------------+------------+-------------+-------------+---------------+", - ]; - - let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } + use super::*; fn seq_array(start: u64, end: u64) -> ArrayRef { Arc::new(UInt64Array::from_iter_values(start..end)) @@ -820,130 +280,4 @@ mod tests { err_string ); } - - #[test] - fn test_assemble_chunk_columns() { - let lifecycle_action = None; - - let summaries = vec![ - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::String(StatValues::new( - Some("bar".to_string()), - Some("foo".to_string()), - 55, - )), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), - }, - ], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p1".into(), - table_name: "t1".into(), - id: 42, - storage: ChunkStorage::ReadBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ - ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 11, - }, - ChunkColumnSummary { - name: "c2".into(), - memory_bytes: 12, - }, - ], - }, - ), - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t1".into(), - id: 43, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 100, - }], - }, - ), - ( - Arc::new(TableSummary { - name: "t2".to_string(), - columns: vec![ColumnSummary { - name: "c3".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t2".into(), - id: 44, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c3".into(), - memory_bytes: 200, - 
}], - }, - ), - ]; - - let expected = vec![ - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", - "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", - "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", - "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - ]; - - let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } } diff --git a/server/src/db/system_tables/chunks.rs b/server/src/db/system_tables/chunks.rs new file mode 100644 index 0000000000..90acda0629 --- /dev/null +++ b/server/src/db/system_tables/chunks.rs @@ -0,0 +1,201 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::ChunkSummary; +use data_types::error::ErrorLogger; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::{time_to_ts, IoxSystemTable}; + +/// Implementation of system.chunks table +#[derive(Debug)] +pub(super) struct ChunksTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunksTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunksTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) + .log_if_error("system.chunks table") + } +} + +fn chunk_summaries_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt32, false), + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("lifecycle_action", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, false), + Field::new("object_store_bytes", DataType::UInt64, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), true), + Field::new("time_of_last_write", ts.clone(), true), + Field::new("time_closed", ts, true), + ])) +} + +fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { + let id = chunks.iter().map(|c| Some(c.id)).collect::(); + let partition_key = chunks + .iter() + .map(|c| Some(c.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|c| Some(c.table_name.as_ref())) + .collect::(); + let storage = chunks + .iter() + .map(|c| Some(c.storage.as_str())) + .collect::(); + let lifecycle_action = chunks + .iter() + .map(|c| c.lifecycle_action.map(|a| a.name())) + .collect::(); + let memory_bytes = chunks + .iter() + .map(|c| Some(c.memory_bytes as u64)) + .collect::(); + let object_store_bytes = chunks + .iter() + .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 
0)) + .collect::(); + let row_counts = chunks + .iter() + .map(|c| Some(c.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|c| c.time_of_first_write) + .map(time_to_ts) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|c| c.time_of_last_write) + .map(time_to_ts) + .collect::(); + let time_closed = chunks + .iter() + .map(|c| c.time_closed) + .map(time_to_ts) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(id), + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(storage), + Arc::new(lifecycle_action), + Arc::new(memory_bytes), + Arc::new(object_store_bytes), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(time_closed), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{DateTime, NaiveDateTime, Utc}; + + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage}; + + use super::*; + + #[test] + fn test_from_chunk_summaries() { + let chunks = vec![ + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 0, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: None, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(10, 0), + Utc, + )), + time_of_last_write: None, + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 1, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: Some(ChunkLifecycleAction::Persisting), + memory_bytes: 23455, + object_store_bytes: 0, + row_count: 22, + time_of_first_write: None, + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(80, 0), + Utc, + )), + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 2, + storage: ChunkStorage::ObjectStoreOnly, + lifecycle_action: None, + memory_bytes: 1234, + object_store_bytes: 5678, + row_count: 33, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(100, 0), + Utc, + )), + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(200, 0), + Utc, + )), + time_closed: None, + }, + ]; + + let expected = vec![ + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", + "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", + "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + ]; + + let schema = chunk_summaries_schema(); + let batch = from_chunk_summaries(schema, chunks).unwrap(); + 
assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/columns.rs b/server/src/db/system_tables/columns.rs new file mode 100644 index 0000000000..5f0b8f6fdd --- /dev/null +++ b/server/src/db/system_tables/columns.rs @@ -0,0 +1,404 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::DetailedChunkSummary; +use data_types::error::ErrorLogger; +use data_types::partition_metadata::{PartitionSummary, TableSummary}; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of `system.columns` system table +#[derive(Debug)] +pub(super) struct ColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: partition_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + fn batch(&self) -> Result { + from_partition_summaries(self.schema(), self.catalog.partition_summaries()) + .log_if_error("system.columns table") + } +} + +fn partition_summaries_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("column_type", DataType::Utf8, false), + Field::new("influxdb_type", DataType::Utf8, true), + ])) +} + +fn from_partition_summaries( + schema: SchemaRef, + partitions: Vec, +) -> Result { + // Assume each partition has roughly 5 tables with 5 columns + let row_estimate = partitions.len() * 25; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut column_type = StringBuilder::new(row_estimate); + let mut influxdb_type = StringBuilder::new(row_estimate); + + // Note no rows are produced for partitions with no tabes, or + // tables with no columns: There are other tables to list tables + // and columns + for partition in partitions { + let table = partition.table; + for column in table.columns { + partition_key.append_value(&partition.key)?; + table_name.append_value(&table.name)?; + column_name.append_value(&column.name)?; + column_type.append_value(column.type_name())?; + if let Some(t) = &column.influxdb_type { + influxdb_type.append_value(t.as_str())?; + } else { + influxdb_type.append_null()?; + } + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(column_type.finish()), + Arc::new(influxdb_type.finish()), + ], + ) +} + +/// Implementation of system.column_chunks table +#[derive(Debug)] +pub(super) struct ChunkColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunkColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_columns_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunkColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) + .log_if_error("system.column_chunks table") + } +} + +fn 
chunk_columns_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("chunk_id", DataType::UInt32, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, true), + Field::new("min_value", DataType::Utf8, true), + Field::new("max_value", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, true), + ])) +} + +fn assemble_chunk_columns( + schema: SchemaRef, + chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, +) -> Result { + /// Builds an index from column_name -> size + fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { + summary + .columns + .iter() + .map(|column_summary| { + ( + column_summary.name.as_ref(), + column_summary.memory_bytes as u64, + ) + }) + .collect() + } + + // Assume each chunk has roughly 5 columns + let row_estimate = chunk_summaries.len() * 5; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut chunk_id = UInt32Builder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut storage = StringBuilder::new(row_estimate); + let mut row_count = UInt64Builder::new(row_estimate); + let mut min_values = StringBuilder::new(row_estimate); + let mut max_values = StringBuilder::new(row_estimate); + let mut memory_bytes = UInt64Builder::new(row_estimate); + + // Note no rows are produced for partitions with no chunks, or + // tables with no partitions: There are other tables to list tables + // and columns + for (table_summary, chunk_summary) in chunk_summaries { + let mut column_index = make_column_index(&chunk_summary); + let storage_value = chunk_summary.inner.storage.as_str(); + + for column in &table_summary.columns { + partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; + chunk_id.append_value(chunk_summary.inner.id)?; + table_name.append_value(&chunk_summary.inner.table_name)?; + column_name.append_value(&column.name)?; + storage.append_value(storage_value)?; + row_count.append_value(column.count())?; + if let Some(v) = column.stats.min_as_str() { + min_values.append_value(v)?; + } else { + min_values.append(false)?; + } + if let Some(v) = column.stats.max_as_str() { + max_values.append_value(v)?; + } else { + max_values.append(false)?; + } + + let size = column_index.remove(column.name.as_str()); + + memory_bytes.append_option(size)?; + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(chunk_id.finish()), + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(storage.finish()), + Arc::new(row_count.finish()), + Arc::new(min_values.finish()), + Arc::new(max_values.finish()), + Arc::new(memory_bytes.finish()), + ], + ) +} + +#[cfg(test)] +mod tests { + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary}; + use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics}; + + use super::*; + + #[test] + fn test_from_partition_summaries() { + let partitions = vec![ + PartitionSummary { + key: "p1".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Tag), + stats: Statistics::I64(StatValues::new_with_value(23)), + }, + 
ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ColumnSummary { + name: "c3".to_string(), + influxdb_type: None, + stats: Statistics::String(StatValues::new_with_value( + "foo".to_string(), + )), + }, + ColumnSummary { + name: "time".to_string(), + influxdb_type: Some(InfluxDbType::Timestamp), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ], + }, + }, + PartitionSummary { + key: "p3".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![], + }, + }, + ]; + + let expected = vec![ + "+---------------+------------+-------------+-------------+---------------+", + "| partition_key | table_name | column_name | column_type | influxdb_type |", + "+---------------+------------+-------------+-------------+---------------+", + "| p1 | t1 | c1 | I64 | Tag |", + "| p1 | t1 | c2 | I64 | Field |", + "| p1 | t1 | c3 | String | |", + "| p1 | t1 | time | I64 | Timestamp |", + "+---------------+------------+-------------+-------------+---------------+", + ]; + + let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } + + #[test] + fn test_assemble_chunk_columns() { + let lifecycle_action = None; + + let summaries = vec![ + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::String(StatValues::new( + Some("bar".to_string()), + Some("foo".to_string()), + 55, + )), + }, + ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), + }, + ], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p1".into(), + table_name: "t1".into(), + id: 42, + storage: ChunkStorage::ReadBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ + ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 11, + }, + ChunkColumnSummary { + name: "c2".into(), + memory_bytes: 12, + }, + ], + }, + ), + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t1".into(), + id: 43, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 100, + }], + }, + ), + ( + Arc::new(TableSummary { + name: "t2".to_string(), + columns: vec![ColumnSummary { + name: "c3".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t2".into(), + id: 44, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: 
vec![ChunkColumnSummary { + name: "c3".into(), + memory_bytes: 200, + }], + }, + ), + ]; + + let expected = vec![ + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", + "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", + "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", + "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + ]; + + let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/operations.rs b/server/src/db/system_tables/operations.rs new file mode 100644 index 0000000000..d8b2af0ac2 --- /dev/null +++ b/server/src/db/system_tables/operations.rs @@ -0,0 +1,108 @@ +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::job::Job; +use tracker::TaskTracker; + +use crate::db::system_tables::IoxSystemTable; +use crate::JobRegistry; + +/// Implementation of system.operations table +#[derive(Debug)] +pub(super) struct OperationsTable { + schema: SchemaRef, + db_name: String, + jobs: Arc, +} + +impl OperationsTable { + pub(super) fn new(db_name: String, jobs: Arc) -> Self { + Self { + schema: operations_schema(), + db_name, + jobs, + } + } +} + +impl IoxSystemTable for OperationsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) + .log_if_error("system.operations table") + } +} + +fn operations_schema() -> SchemaRef { + let ts = DataType::Time64(TimeUnit::Nanosecond); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("cpu_time_used", ts.clone(), true), + Field::new("wall_time_used", ts, true), + Field::new("partition_key", DataType::Utf8, true), + Field::new("chunk_id", DataType::UInt32, true), + Field::new("description", DataType::Utf8, true), + ])) +} + +fn from_task_trackers( + schema: SchemaRef, + db_name: &str, + jobs: Vec>, +) -> Result { + let jobs = jobs + .into_iter() + .filter(|job| job.metadata().db_name() == Some(db_name)) + .collect::>(); + + let ids = jobs + .iter() + .map(|job| Some(job.id().to_string())) + .collect::(); + let statuses = jobs + .iter() + .map(|job| Some(job.get_status().name())) + .collect::(); + let cpu_time_used = jobs + .iter() + .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) + .collect::(); + let wall_time_used = jobs + .iter() + .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) + .collect::(); + let partition_keys = jobs + .iter() + .map(|job| job.metadata().partition_key()) + .collect::(); + let chunk_ids = jobs + .iter() + .map(|job| job.metadata().chunk_id()) + .collect::(); + let descriptions = jobs + .iter() + .map(|job| 
Some(job.metadata().description())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(statuses), + Arc::new(cpu_time_used), + Arc::new(wall_time_used), + Arc::new(partition_keys), + Arc::new(chunk_ids), + Arc::new(descriptions), + ], + ) +} From e4d2c51e8b7c2a0883afeb3838b558b62c701938 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 13:44:47 +0100 Subject: [PATCH 13/27] fix: update PersistenceWindows on rules update (#2018) (#2060) * fix: update PersistenceWindows on rules update (#2018) * chore: review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- .../src/persistence_windows.rs | 6 + server/src/config.rs | 2 +- server/src/db.rs | 35 +++++- server/src/lib.rs | 4 +- tests/end_to_end_cases/persistence.rs | 109 ++++++++++++++---- 5 files changed, 129 insertions(+), 27 deletions(-) diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 1b5ae73d29..5b66a593f8 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -133,6 +133,12 @@ impl PersistenceWindows { } } + /// Updates the late arrival period of this `PersistenceWindows` instance + pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) { + self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD); + self.late_arrival_period = late_arrival_period; + } + /// Updates the windows with the information from a batch of rows from a single sequencer /// to the same partition. The min and max times are the times on the row data. The `received_at` /// Instant is when the data was received. Taking it in this function is really just about diff --git a/server/src/config.rs b/server/src/config.rs index 4554e8e912..f869319a87 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -224,7 +224,7 @@ impl Config { .db_initialized(db_name) .context(DatabaseNotFound { db_name })?; - db.update_db_rules(update).map_err(UpdateError::Closure) + db.update_rules(update).map_err(UpdateError::Closure) } /// Get all registered remote servers. 
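A minimal standalone sketch of the clamping relationship established by the new `set_late_arrival_period` method above: the closed window period follows the late arrival period but is capped at the default. The concrete 30-second value of DEFAULT_CLOSED_WINDOW_PERIOD is assumed here for illustration; only the `min`-with-default behaviour comes from the patch.

use std::time::Duration;

// Assumed default, for illustration only; the real constant lives in
// persistence_windows/src/persistence_windows.rs.
const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30);

// Mirrors set_late_arrival_period: the closed window period tracks the
// late arrival period but never exceeds the default.
fn closed_window_period_for(late_arrival_period: Duration) -> Duration {
    late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD)
}

fn main() {
    // A short late arrival window (1s) also shortens the closed window period.
    assert_eq!(
        closed_window_period_for(Duration::from_secs(1)),
        Duration::from_secs(1)
    );
    // A long late arrival window (1000s) leaves it at the default.
    assert_eq!(
        closed_window_period_for(Duration::from_secs(1000)),
        DEFAULT_CLOSED_WINDOW_PERIOD
    );
}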
diff --git a/server/src/db.rs b/server/src/db.rs index dc198aa292..f94eb4b28c 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -333,13 +333,40 @@ impl Db { } /// Updates the database rules - pub fn update_db_rules(&self, update: F) -> Result, E> + pub fn update_rules(&self, update: F) -> Result, E> where F: FnOnce(DatabaseRules) -> Result, { - let mut rules = self.rules.write(); - let new_rules = Arc::new(update(rules.as_ref().clone())?); - *rules = Arc::clone(&new_rules); + let (late_arrive_window_updated, new_rules) = { + let mut rules = self.rules.write(); + info!(db_name=%rules.name, "updating rules for database"); + let new_rules = Arc::new(update(rules.as_ref().clone())?); + let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds + != new_rules.lifecycle_rules.late_arrive_window_seconds; + + *rules = Arc::clone(&new_rules); + (late_arrive_window_updated, new_rules) + }; + + if late_arrive_window_updated { + // Hold a read lock to prevent concurrent modification and + // use values from re-acquired read guard + let current = self.rules.read(); + + // Update windows + let partitions = self.catalog.partitions(); + for partition in &partitions { + let mut partition = partition.write(); + let addr = partition.addr().clone(); + if let Some(windows) = partition.persistence_windows_mut() { + info!(partition=%addr, "updating persistence windows"); + windows.set_late_arrival_period(Duration::from_secs( + current.lifecycle_rules.late_arrive_window_seconds.get() as u64, + )) + } + } + } + Ok(new_rules) } diff --git a/server/src/lib.rs b/server/src/lib.rs index 48246918d1..e2dc829e31 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -1733,7 +1733,7 @@ mod tests { let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2]; let db = server.db(&db_name).unwrap(); - db.update_db_rules(|mut rules| { + db.update_rules(|mut rules| { let shard_config = ShardConfig { hash_ring: Some(HashRing { shards: vec![TEST_SHARD_ID].into(), @@ -1976,7 +1976,7 @@ mod tests { let db_name = DatabaseName::new("foo").unwrap(); let db = server.db(&db_name).unwrap(); let rules = db - .update_db_rules(|mut rules| { + .update_rules(|mut rules| { rules.lifecycle_rules.buffer_size_hard = Some(std::num::NonZeroUsize::new(10).unwrap()); Ok::<_, Infallible>(rules) diff --git a/tests/end_to_end_cases/persistence.rs b/tests/end_to_end_cases/persistence.rs index 634c506856..af4dccfa61 100644 --- a/tests/end_to_end_cases/persistence.rs +++ b/tests/end_to_end_cases/persistence.rs @@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() { assert_eq!(chunks[0].row_count, 1_000); } +async fn write_data( + write_client: &mut influxdb_iox_client::write::Client, + db_name: &str, + num_payloads: u64, + num_duplicates: u64, + payload_size: u64, +) { + let payloads: Vec<_> = (0..num_payloads) + .map(|x| { + (0..payload_size) + .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) + .join("\n") + }) + .collect(); + + for payload in &payloads { + // Writing the same data multiple times should be compacted away + for _ in 0..=num_duplicates { + let num_lines_written = write_client + .write(db_name, payload) + .await + .expect("successful write"); + assert_eq!(num_lines_written, payload_size as usize); + } + } +} + #[tokio::test] async fn test_full_lifecycle() { let fixture = ServerFixture::create_shared().await; let mut write_client = fixture.write_client(); let num_payloads = 10; - let num_duplicates = 2; + let num_duplicates = 1; let payload_size = 1_000; - let 
total_rows = num_payloads * num_duplicates * payload_size; + let total_rows = num_payloads * (1 + num_duplicates) * payload_size; let db_name = rand_name(); DatabaseBuilder::new(db_name.clone()) @@ -73,24 +100,14 @@ async fn test_full_lifecycle() { .build(fixture.grpc_channel()) .await; - let payloads: Vec<_> = (0..num_payloads) - .map(|x| { - (0..payload_size) - .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) - .join("\n") - }) - .collect(); - - for payload in &payloads { - // Writing the same data multiple times should be compacted away - for _ in 0..num_duplicates { - let num_lines_written = write_client - .write(&db_name, payload) - .await - .expect("successful write"); - assert_eq!(num_lines_written, payload_size as usize); - } - } + write_data( + &mut write_client, + &db_name, + num_payloads, + num_duplicates, + payload_size, + ) + .await; wait_for_exact_chunk_states( &fixture, @@ -123,6 +140,58 @@ async fn test_full_lifecycle() { assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize) } +#[tokio::test] +async fn test_update_late_arrival() { + let fixture = ServerFixture::create_shared().await; + let mut write_client = fixture.write_client(); + + let payload_size = 100; + + let db_name = rand_name(); + DatabaseBuilder::new(db_name.clone()) + .persist(true) + // Don't close MUB automatically + .mub_row_threshold(payload_size * 2) + .persist_row_threshold(payload_size) + .persist_age_threshold_seconds(1000) + // Initially set to be a large value + .late_arrive_window_seconds(1000) + .build(fixture.grpc_channel()) + .await; + + write_data(&mut write_client, &db_name, 1, 0, payload_size).await; + + let mut management = fixture.management_client(); + + let chunks = management.list_chunks(&db_name).await.unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!( + chunks[0].storage, + influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32 + ); + + let mut rules = management.get_database(&db_name).await.unwrap(); + rules + .lifecycle_rules + .as_mut() + .unwrap() + .late_arrive_window_seconds = 1; + + fixture + .management_client() + .update_database(rules) + .await + .unwrap(); + + wait_for_exact_chunk_states( + &fixture, + &db_name, + vec![ChunkStorage::ReadBufferAndObjectStore], + std::time::Duration::from_secs(5), + ) + .await; +} + #[tokio::test] async fn test_query_chunk_after_restart() { // fixtures From 091837420f5fe83fba63a56b01a81aff041970e6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 14:10:57 +0100 Subject: [PATCH 14/27] feat: add PersistenceWindows sytem table (#2030) (#2062) * feat: add PersistenceWindows sytem table (#2030) * chore: update log Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- .../cases/in/all_chunks_dropped.expected | 46 +++--- query_tests/src/sql.rs | 25 +-- server/src/db/catalog.rs | 20 ++- server/src/db/system_tables.rs | 11 +- server/src/db/system_tables/persistence.rs | 154 ++++++++++++++++++ 5 files changed, 220 insertions(+), 36 deletions(-) create mode 100644 server/src/db/system_tables/persistence.rs diff --git a/query_tests/cases/in/all_chunks_dropped.expected b/query_tests/cases/in/all_chunks_dropped.expected index 5febb4d2e9..65e17df50a 100644 --- a/query_tests/cases/in/all_chunks_dropped.expected +++ b/query_tests/cases/in/all_chunks_dropped.expected @@ -1,25 +1,27 @@ -- Test Setup: OneMeasurementAllChunksDropped -- SQL: SELECT * from information_schema.tables; 
-+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ -- SQL: SHOW TABLES; -+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 58072c5a97..6fc2c13550 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() { // validate we have access to information schema for listing table // names let expected = vec![ - "+---------------+--------------------+---------------+------------+", - "| table_catalog | table_schema | table_name | table_type |", - "+---------------+--------------------+---------------+------------+", - "| public | information_schema | columns | VIEW |", - "| public | information_schema | tables | VIEW |", - "| public | iox | h2o | BASE TABLE |", - "| public | iox | o2 | BASE TABLE |", - "| public | system | chunk_columns | BASE TABLE |", - "| public | system | chunks | BASE TABLE |", - "| public | system | columns | BASE TABLE |", - "| public | system | operations | BASE TABLE |", - "+---------------+--------------------+---------------+------------+", + 
"+---------------+--------------------+---------------------+------------+", + "| table_catalog | table_schema | table_name | table_type |", + "+---------------+--------------------+---------------------+------------+", + "| public | information_schema | columns | VIEW |", + "| public | information_schema | tables | VIEW |", + "| public | iox | h2o | BASE TABLE |", + "| public | iox | o2 | BASE TABLE |", + "| public | system | chunk_columns | BASE TABLE |", + "| public | system | chunks | BASE TABLE |", + "| public | system | columns | BASE TABLE |", + "| public | system | operations | BASE TABLE |", + "| public | system | persistence_windows | BASE TABLE |", + "+---------------+--------------------+---------------------+------------+", ]; run_sql_test_case!( TwoMeasurementsManyFields {}, diff --git a/server/src/db/catalog.rs b/server/src/db/catalog.rs index c2953a3de6..dff3c37b6b 100644 --- a/server/src/db/catalog.rs +++ b/server/src/db/catalog.rs @@ -6,7 +6,7 @@ use hashbrown::{HashMap, HashSet}; use data_types::chunk_metadata::ChunkSummary; use data_types::chunk_metadata::DetailedChunkSummary; -use data_types::partition_metadata::{PartitionSummary, TableSummary}; +use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary}; use internal_types::schema::Schema; use snafu::{OptionExt, Snafu}; use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; @@ -15,6 +15,7 @@ use self::chunk::CatalogChunk; use self::metrics::CatalogMetrics; use self::partition::Partition; use self::table::Table; +use data_types::write_summary::WriteSummary; pub mod chunk; mod metrics; @@ -225,6 +226,23 @@ impl Catalog { .collect() } + /// Returns a list of persistence window summaries for each partition + pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> { + let mut summaries = Vec::new(); + let tables = self.tables.read(); + for table in tables.values() { + for partition in table.partitions() { + let partition = partition.read(); + if let Some(w) = partition.persistence_windows() { + for summary in w.summaries() { + summaries.push((partition.addr().clone(), summary)) + } + } + } + } + summaries + } + pub fn chunk_summaries(&self) -> Vec { let partition_key = None; let table_names = TableNameFilter::AllTables; diff --git a/server/src/db/system_tables.rs b/server/src/db/system_tables.rs index f83c793fa5..bcc474e230 100644 --- a/server/src/db/system_tables.rs +++ b/server/src/db/system_tables.rs @@ -31,6 +31,7 @@ use super::catalog::Catalog; mod chunks; mod columns; mod operations; +mod persistence; // The IOx system schema pub const SYSTEM_SCHEMA: &str = "system"; @@ -39,12 +40,14 @@ const CHUNKS: &str = "chunks"; const COLUMNS: &str = "columns"; const CHUNK_COLUMNS: &str = "chunk_columns"; const OPERATIONS: &str = "operations"; +const PERSISTENCE_WINDOWS: &str = "persistence_windows"; pub struct SystemSchemaProvider { chunks: Arc, columns: Arc, chunk_columns: Arc, operations: Arc, + persistence_windows: Arc, } impl std::fmt::Debug for SystemSchemaProvider { @@ -65,16 +68,20 @@ impl SystemSchemaProvider { inner: columns::ColumnsTable::new(Arc::clone(&catalog)), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: columns::ChunkColumnsTable::new(catalog), + inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)), }); let operations = Arc::new(SystemTableProvider { inner: operations::OperationsTable::new(db_name, jobs), }); + let persistence_windows = Arc::new(SystemTableProvider { + inner: persistence::PersistenceWindowsTable::new(catalog), + }); 
Self { chunks, columns, chunk_columns, operations, + persistence_windows, } } } @@ -90,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS.to_string(), CHUNK_COLUMNS.to_string(), OPERATIONS.to_string(), + PERSISTENCE_WINDOWS.to_string(), ] } @@ -99,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS => Some(Arc::clone(&self.columns)), CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)), OPERATIONS => Some(Arc::clone(&self.operations)), + PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)), _ => None, } } diff --git a/server/src/db/system_tables/persistence.rs b/server/src/db/system_tables/persistence.rs new file mode 100644 index 0000000000..3392ff5032 --- /dev/null +++ b/server/src/db/system_tables/persistence.rs @@ -0,0 +1,154 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::partition_metadata::PartitionAddr; +use data_types::write_summary::WriteSummary; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of system.persistence_windows table +#[derive(Debug)] +pub(super) struct PersistenceWindowsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl PersistenceWindowsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: persistence_windows_schema(), + catalog, + } + } +} + +impl IoxSystemTable for PersistenceWindowsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_write_summaries(self.schema(), self.catalog.persistence_summaries()) + .log_if_error("system.persistence_windows table") + } +} + +fn persistence_windows_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), false), + Field::new("time_of_last_write", ts.clone(), false), + Field::new("min_timestamp", ts.clone(), false), + Field::new("max_timestamp", ts, false), + ])) +} + +fn from_write_summaries( + schema: SchemaRef, + chunks: Vec<(PartitionAddr, WriteSummary)>, +) -> Result { + let partition_key = chunks + .iter() + .map(|(addr, _)| Some(addr.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|(addr, _)| Some(addr.table_name.as_ref())) + .collect::(); + let row_counts = chunks + .iter() + .map(|(_, w)| Some(w.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos())) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos())) + .collect::(); + let min_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.min_timestamp.timestamp_nanos())) + .collect::(); + let max_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.max_timestamp.timestamp_nanos())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(min_timestamp), + Arc::new(max_timestamp), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{TimeZone, Utc}; + + use 
arrow_util::assert_batches_eq; + + use super::*; + + #[test] + fn test_from_write_summaries() { + let addr = PartitionAddr { + db_name: Arc::from("db"), + table_name: Arc::from("table"), + partition_key: Arc::from("partition"), + }; + + let summaries = vec![ + ( + addr.clone(), + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(0), + time_of_last_write: Utc.timestamp_nanos(20), + min_timestamp: Utc.timestamp_nanos(50), + max_timestamp: Utc.timestamp_nanos(60), + row_count: 320, + }, + ), + ( + addr, + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(6), + time_of_last_write: Utc.timestamp_nanos(21), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(2), + row_count: 2, + }, + ), + ]; + + let expected = vec![ + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |", + "| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + ]; + + let schema = persistence_windows_schema(); + let batch = from_write_summaries(schema, summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} From 61da0fe4dfe218c24da92395937323c710b18e85 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 17:38:28 +0100 Subject: [PATCH 15/27] fix: update last_instant when rotating into persistable window (#2067) --- .../src/persistence_windows.rs | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 5b66a593f8..957d034d06 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -499,6 +499,10 @@ impl Window { /// Add one window to another. Used to collapse closed windows into persisted. 
fn add_window(&mut self, other: Self) { + assert!(self.last_instant <= other.created_at); + assert!(self.last_instant <= other.last_instant); + + self.last_instant = other.last_instant; self.row_count += other.row_count; if self.min_time > other.min_time { self.min_time = other.min_time; @@ -1327,7 +1331,8 @@ mod tests { #[test] fn test_summaries() { - let mut w = make_windows(Duration::from_secs(100)); + let late_arrival_period = Duration::from_secs(100); + let mut w = make_windows(late_arrival_period); let instant = w.created_at_instant; // Window 1 @@ -1407,6 +1412,34 @@ mod tests { row_count: 8 }, ] - ) + ); + + // Rotate first and second windows into persistable + w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 2); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 24 + }, + WriteSummary { + time_of_first_write: w.created_at_time + closed_duration * 3, + time_of_last_write: w.created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ); } } From 297e0590859e90311b75b8095bd0556ee532bf65 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Mon, 19 Jul 2021 17:52:40 -0400 Subject: [PATCH 16/27] feat: add parquet cache size setting to database rules --- data_types/src/database_rules.rs | 4 ++++ .../iox/management/v1/database_rules.proto | 7 +++++++ generated_types/src/database_rules/lifecycle.rs | 16 ++++++++++++++++ src/commands/database.rs | 6 ++++++ 4 files changed, 33 insertions(+) diff --git a/data_types/src/database_rules.rs b/data_types/src/database_rules.rs index 6cff31a510..ddde203c93 100644 --- a/data_types/src/database_rules.rs +++ b/data_types/src/database_rules.rs @@ -166,6 +166,9 @@ pub struct LifecycleRules { /// Maximum number of rows to buffer in a MUB chunk before compacting it pub mub_row_threshold: NonZeroUsize, + + /// Use up to this amount of space in bytes for caching Parquet files + pub parquet_cache_limit: Option, } impl LifecycleRules { @@ -195,6 +198,7 @@ impl Default for LifecycleRules { persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS) .unwrap(), mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(), + parquet_cache_limit: None, } } } diff --git a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto index 530c95bca6..68d9cbf6f9 100644 --- a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto +++ b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto @@ -82,6 +82,13 @@ message LifecycleRules { // If 0, compactions are limited to the default number. 
// See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS uint32 max_active_compactions = 16; + + // Use up to this amount of space in bytes for caching Parquet files + ParquetCacheLimit parquet_cache_limit = 17; +} + +message ParquetCacheLimit { + uint64 value = 1; } message DatabaseRules { diff --git a/generated_types/src/database_rules/lifecycle.rs b/generated_types/src/database_rules/lifecycle.rs index b9612bf1b6..9a62c54df6 100644 --- a/generated_types/src/database_rules/lifecycle.rs +++ b/generated_types/src/database_rules/lifecycle.rs @@ -10,6 +10,7 @@ use data_types::database_rules::{ use crate::google::FieldViolation; use crate::influxdata::iox::management::v1 as management; +use crate::influxdata::iox::management::v1::ParquetCacheLimit; impl From for management::LifecycleRules { fn from(config: LifecycleRules) -> Self { @@ -35,6 +36,9 @@ impl From for management::LifecycleRules { persist_row_threshold: config.persist_row_threshold.get() as u64, persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(), mub_row_threshold: config.mub_row_threshold.get() as u64, + parquet_cache_limit: config.parquet_cache_limit.map(|x| ParquetCacheLimit { + value: x.get() as u64, + }), } } } @@ -43,6 +47,11 @@ impl TryFrom for LifecycleRules { type Error = FieldViolation; fn try_from(proto: management::LifecycleRules) -> Result { + let parquet_cache_limit = match proto.parquet_cache_limit { + Some(l) => (l.value as usize).try_into().ok(), + None => None, + }; + Ok(Self { buffer_size_soft: (proto.buffer_size_soft as usize).try_into().ok(), buffer_size_hard: (proto.buffer_size_hard as usize).try_into().ok(), @@ -69,6 +78,7 @@ impl TryFrom for LifecycleRules { .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()), mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize) .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()), + parquet_cache_limit, }) } } @@ -93,6 +103,7 @@ mod tests { persist_row_threshold: 57, persist_age_threshold_seconds: 23, mub_row_threshold: 3454, + parquet_cache_limit: Some(ParquetCacheLimit { value: 10 }), }; let config: LifecycleRules = protobuf.clone().try_into().unwrap(); @@ -125,6 +136,11 @@ mod tests { protobuf.persist_age_threshold_seconds ); assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold); + assert_eq!( + config.parquet_cache_limit.unwrap().get(), + protobuf.parquet_cache_limit.as_ref().unwrap().value as usize + ); + assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit); } #[test] diff --git a/src/commands/database.rs b/src/commands/database.rs index d7d4ddf81d..77f25471a5 100644 --- a/src/commands/database.rs +++ b/src/commands/database.rs @@ -13,6 +13,7 @@ use influxdb_iox_client::{ }, write::{self, WriteError}, }; +use std::num::NonZeroUsize; mod catalog; mod chunk; @@ -119,6 +120,10 @@ struct Create { /// Maximum number of rows to buffer in a MUB chunk before compacting it #[structopt(long, default_value = "100000")] mub_row_threshold: u64, + + /// Use up to this amount of space in bytes for caching Parquet files + #[structopt(long, parse(try_from_str))] + pub parquet_cache_limit: Option, } /// Get list of databases @@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> { persist_row_threshold: command.persist_row_threshold, persist_age_threshold_seconds: command.persist_age_threshold_seconds, mub_row_threshold: command.mub_row_threshold, + parquet_cache_limit: command.parquet_cache_limit.map(|l| ParquetCacheLimit{value: 
l.get() as u64}), }), // Default to hourly partitions From a4704dd165a816aa5beb9c78a1cb5ea6a3684981 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Tue, 20 Jul 2021 15:40:50 -0400 Subject: [PATCH 17/27] chore: update parquet_cache_limit to u64 and 0 for default --- data_types/src/database_rules.rs | 5 +++-- .../iox/management/v1/database_rules.proto | 9 +++------ .../src/database_rules/lifecycle.rs | 19 +++++++------------ src/commands/database.rs | 10 +++++----- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/data_types/src/database_rules.rs b/data_types/src/database_rules.rs index ddde203c93..86a71be778 100644 --- a/data_types/src/database_rules.rs +++ b/data_types/src/database_rules.rs @@ -167,8 +167,9 @@ pub struct LifecycleRules { /// Maximum number of rows to buffer in a MUB chunk before compacting it pub mub_row_threshold: NonZeroUsize, - /// Use up to this amount of space in bytes for caching Parquet files - pub parquet_cache_limit: Option, + /// Use up to this amount of space in bytes for caching Parquet files. None + /// will disable Parquet file caching. + pub parquet_cache_limit: Option, } impl LifecycleRules { diff --git a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto index 68d9cbf6f9..b1ad761dbe 100644 --- a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto +++ b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto @@ -83,12 +83,9 @@ message LifecycleRules { // See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS uint32 max_active_compactions = 16; - // Use up to this amount of space in bytes for caching Parquet files - ParquetCacheLimit parquet_cache_limit = 17; -} - -message ParquetCacheLimit { - uint64 value = 1; + // Use up to this amount of space in bytes for caching Parquet files. 
+ // A value of 0 disables Parquet caching + uint64 parquet_cache_limit = 17; } message DatabaseRules { diff --git a/generated_types/src/database_rules/lifecycle.rs b/generated_types/src/database_rules/lifecycle.rs index 9a62c54df6..ab71e38de5 100644 --- a/generated_types/src/database_rules/lifecycle.rs +++ b/generated_types/src/database_rules/lifecycle.rs @@ -10,7 +10,6 @@ use data_types::database_rules::{ use crate::google::FieldViolation; use crate::influxdata::iox::management::v1 as management; -use crate::influxdata::iox::management::v1::ParquetCacheLimit; impl From for management::LifecycleRules { fn from(config: LifecycleRules) -> Self { @@ -36,9 +35,10 @@ impl From for management::LifecycleRules { persist_row_threshold: config.persist_row_threshold.get() as u64, persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(), mub_row_threshold: config.mub_row_threshold.get() as u64, - parquet_cache_limit: config.parquet_cache_limit.map(|x| ParquetCacheLimit { - value: x.get() as u64, - }), + parquet_cache_limit: config + .parquet_cache_limit + .map(|v| v.get()) + .unwrap_or_default(), } } } @@ -47,11 +47,6 @@ impl TryFrom for LifecycleRules { type Error = FieldViolation; fn try_from(proto: management::LifecycleRules) -> Result { - let parquet_cache_limit = match proto.parquet_cache_limit { - Some(l) => (l.value as usize).try_into().ok(), - None => None, - }; - Ok(Self { buffer_size_soft: (proto.buffer_size_soft as usize).try_into().ok(), buffer_size_hard: (proto.buffer_size_hard as usize).try_into().ok(), @@ -78,7 +73,7 @@ impl TryFrom for LifecycleRules { .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()), mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize) .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()), - parquet_cache_limit, + parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit), }) } } @@ -103,7 +98,7 @@ mod tests { persist_row_threshold: 57, persist_age_threshold_seconds: 23, mub_row_threshold: 3454, - parquet_cache_limit: Some(ParquetCacheLimit { value: 10 }), + parquet_cache_limit: 10, }; let config: LifecycleRules = protobuf.clone().try_into().unwrap(); @@ -138,7 +133,7 @@ mod tests { assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold); assert_eq!( config.parquet_cache_limit.unwrap().get(), - protobuf.parquet_cache_limit.as_ref().unwrap().value as usize + protobuf.parquet_cache_limit ); assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit); } diff --git a/src/commands/database.rs b/src/commands/database.rs index 77f25471a5..3ff2c2bbf9 100644 --- a/src/commands/database.rs +++ b/src/commands/database.rs @@ -13,7 +13,6 @@ use influxdb_iox_client::{ }, write::{self, WriteError}, }; -use std::num::NonZeroUsize; mod catalog; mod chunk; @@ -121,9 +120,10 @@ struct Create { #[structopt(long, default_value = "100000")] mub_row_threshold: u64, - /// Use up to this amount of space in bytes for caching Parquet files - #[structopt(long, parse(try_from_str))] - pub parquet_cache_limit: Option, + /// Use up to this amount of space in bytes for caching Parquet files. A + /// value of zero disables Parquet file caching. 
+ #[structopt(long, default_value = "0")] + parquet_cache_limit: u64, } /// Get list of databases @@ -198,7 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> { persist_row_threshold: command.persist_row_threshold, persist_age_threshold_seconds: command.persist_age_threshold_seconds, mub_row_threshold: command.mub_row_threshold, - parquet_cache_limit: command.parquet_cache_limit.map(|l| ParquetCacheLimit{value: l.get() as u64}), + parquet_cache_limit: command.parquet_cache_limit, }), // Default to hourly partitions From 387667330a4822f7af21c05c4e08fbb6be875def Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Jul 2021 04:27:03 -0400 Subject: [PATCH 18/27] chore: Update datafusion deps (#2073) * chore: Update datafusion deps * fix: update tests --- Cargo.lock | 18 +- datafusion/Cargo.toml | 2 +- query_tests/cases/in/pushdown.expected | 288 ++++++++++++------------- 3 files changed, 154 insertions(+), 154 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec611883bf..cbc1081590 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,9 +769,9 @@ dependencies = [ [[package]] name = "crypto-mac" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" +checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" dependencies = [ "generic-array", "subtle", @@ -843,7 +843,7 @@ dependencies = [ [[package]] name = "datafusion" version = "4.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=3fb600df48ab1e53903b1a9bb12ebde33ad0856b#3fb600df48ab1e53903b1a9bb12ebde33ad0856b" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3" dependencies = [ "ahash 0.7.4", "arrow", @@ -4330,9 +4330,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" +checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0" dependencies = [ "tinyvec_macros", ] @@ -4345,9 +4345,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985" +checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207" dependencies = [ "autocfg", "bytes", @@ -4984,9 +4984,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" [[package]] name = "zeroize" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18" +checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd" [[package]] name = "zstd" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index f3a735d307..f969251e6b 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version" # Rename to workaround doctest bug # Turn off optional datafusion features (function packages) -upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="3fb600df48ab1e53903b1a9bb12ebde33ad0856b", default-features = false, 
package = "datafusion" } +upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" } diff --git a/query_tests/cases/in/pushdown.expected b/query_tests/cases/in/pushdown.expected index dd58342994..e4e84b8b9e 100644 --- a/query_tests/cases/in/pushdown.expected +++ b/query_tests/cases/in/pushdown.expected @@ -10,158 +10,158 @@ | | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | +---------------+---------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] | ++---------------+--------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Float64) > 200 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
-+---------------+---------------------------------------------------------------------------------------------+ ++---------------+----------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Float64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Float64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] | ++---------------+----------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 4.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 4 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] | ++---------------+---------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, 
#restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 
++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 
++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | +| | 
TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 4 AND system@1 < 7 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| 
physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND system@1 < 7 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | 
+| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq 
Int64(632) Or #restaurant.town Eq Utf8("reading")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ From 5df88c70aa91709d35dc820af890fcbf10be0d19 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 10:34:08 +0200 Subject: [PATCH 19/27] feat: add ability to fetch watermarks from write buffer --- write_buffer/src/core.rs | 191 +++++++++++++++++++++++++++++++++----- write_buffer/src/kafka.rs | 75 +++++++++++---- write_buffer/src/mock.rs | 42 ++++++++- 3 files changed, 264 insertions(+), 44 deletions(-) diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index fdefc76746..f604b80862 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -1,6 +1,8 @@ +use std::fmt::Debug; + use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::stream::BoxStream; +use futures::{future::BoxFuture, stream::BoxStream}; /// Generic boxed error type that is used in this crate. /// @@ -10,7 +12,7 @@ pub type WriteBufferError = Box; /// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading /// entries from the Write Buffer at a later time. #[async_trait] -pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { +pub trait WriteBufferWriting: Sync + Send + Debug + 'static { /// Send an `Entry` to the write buffer using the specified sequencer ID. /// /// Returns information that can be used to restore entries at a later time. @@ -21,17 +23,42 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { ) -> Result; } +pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result>; +pub type FetchHighWatermark<'a> = Box FetchHighWatermarkFut<'a>) + Send + Sync>; + /// Output stream of [`WriteBufferReading`]. 
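[Editorial note — not part of this patch: a minimal usage sketch for the `EntryStream` struct and its `fetch_high_watermark` callback introduced in the hunk below. The function name `watch_sequencers`, the error handling, and the `println!` reporting are illustrative assumptions; only `WriteBufferReading::streams`, the `stream` / `fetch_high_watermark` fields, `SequencedEntry::sequence`, and `Sequence::number` are taken from the crate. The lag calculation mirrors the `watermark - sequence_number - 1` formula used by the ingest metrics added later in this series.]

    use futures::StreamExt;
    use write_buffer::core::{WriteBufferError, WriteBufferReading};

    // Read every sequencer's stream once and report how far behind the
    // sequencer's high watermark each consumed entry is.
    async fn watch_sequencers(
        reader: &mut impl WriteBufferReading,
    ) -> Result<(), WriteBufferError> {
        for (sequencer_id, mut entry_stream) in reader.streams() {
            // The watermark is the next sequence number the sequencer will hand
            // out, so it starts at 0 for an empty sequencer.
            let watermark = (entry_stream.fetch_high_watermark)().await?;

            while let Some(sequenced_entry) = entry_stream.stream.next().await {
                let sequenced_entry = sequenced_entry?;
                let sequence = sequenced_entry
                    .sequence()
                    .expect("entry from write buffer must be sequenced");

                // Entries still waiting in the sequencer behind the one just read.
                let lag = watermark
                    .saturating_sub(sequence.number)
                    .saturating_sub(1);
                println!(
                    "sequencer {}: consumed {}, lag {}",
                    sequencer_id, sequence.number, lag
                );
            }
        }
        Ok(())
    }

[End of editorial note. Because `streams()` takes `&mut self`, only one such set of streams can exist per reader at a time, as the trait docs below point out.]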
-pub type EntryStream<'a> = BoxStream<'a, Result>; +pub struct EntryStream<'a> { + /// Stream that produces entries. + pub stream: BoxStream<'a, Result>, + + /// Get high watermark (= what we believe is the next sequence number to be added). + /// + /// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts + /// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1. + pub fetch_high_watermark: FetchHighWatermark<'a>, +} + +impl<'a> Debug for EntryStream<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("EntryStream").finish_non_exhaustive() + } +} /// Produce streams (one per sequencer) of [`SequencedEntry`]s. #[async_trait] -pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static { +pub trait WriteBufferReading: Sync + Send + Debug + 'static { /// Returns a stream per sequencer. + /// + /// Note that due to the mutable borrow, it is not possible to have multiple streams from the same + /// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last + /// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either + /// create a new [`WriteBufferReading`] or use [`seek`](Self::seek). fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>; /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). + /// + /// Note that due to the mutable borrow, it is not possible to seek while streams exists. async fn seek( &mut self, sequencer_id: u32, @@ -68,9 +95,11 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; + test_multi_stream_io(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; + test_watermark(&adapter).await; } async fn test_single_stream_io(adapter: &T) @@ -94,23 +123,90 @@ pub mod test_utils { let mut cx = futures::task::Context::from_waker(&waker); // empty stream is pending - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding content allows us to get results writer.store_entry(&entry_1, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding more data unblocks the stream writer.store_entry(&entry_2, sequencer_id).await.unwrap(); writer.store_entry(&entry_3, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + } + + async fn test_multi_stream_io(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(1).await; + + let entry_1 = lp_to_entry("upc user=1 100"); + let 
entry_2 = lp_to_entry("upc user=2 200"); + let entry_3 = lp_to_entry("upc user=3 300"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + writer.store_entry(&entry_1, 0).await.unwrap(); + writer.store_entry(&entry_2, 0).await.unwrap(); + writer.store_entry(&entry_3, 0).await.unwrap(); + + // creating stream, drop stream, re-create it => still starts at first entry + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, stream) = streams.pop().unwrap(); + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + + // re-creating stream after reading remembers offset + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); + + // re-creating stream after reading everything makes it pending + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_sequencer_io(adapter: &T) @@ -136,25 +232,34 @@ pub mod test_utils { let mut cx = futures::task::Context::from_waker(&waker); // empty streams are pending - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); // entries arrive at the right target stream writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); writer.store_entry(&entry_2, sequencer_id_2).await.unwrap(); - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_2.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); // streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_writer_multi_reader(adapter: &T) @@ -239,8 +344,8 @@ pub mod test_utils { assert_eq!(streams.len(), 2); let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); let 
(_sequencer_id, mut stream_2) = streams.pop().unwrap(); - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); drop(stream_1); drop(stream_2); drop(streams); @@ -249,6 +354,47 @@ pub mod test_utils { reader_1.seek(0, 42).await.unwrap(); } + async fn test_watermark(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let mut streams = reader.streams(); + assert_eq!(streams.len(), 2); + let (sequencer_id_1, stream_1) = streams.pop().unwrap(); + let (sequencer_id_2, stream_2) = streams.pop().unwrap(); + + // start at watermark 0 + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0); + + // high water mark moves + writer + .store_entry(&entry_east_1, sequencer_id_1) + .await + .unwrap(); + let mark_1 = writer + .store_entry(&entry_east_2, sequencer_id_1) + .await + .unwrap() + .number; + let mark_2 = writer + .store_entry(&entry_west_1, sequencer_id_2) + .await + .unwrap() + .number; + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1); + } + async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, @@ -264,6 +410,7 @@ pub mod test_utils { // we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever let mut results: Vec<_> = actual_stream + .stream .take(expected_entries.len()) .try_collect() .await diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index a32769ae2c..c786f93066 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -8,7 +8,7 @@ use std::{ use async_trait::async_trait; use data_types::server_id::ServerId; use entry::{Entry, Sequence, SequencedEntry}; -use futures::StreamExt; +use futures::{FutureExt, StreamExt}; use observability_deps::tracing::{debug, info}; use rdkafka::{ consumer::{BaseConsumer, Consumer, StreamConsumer}, @@ -18,7 +18,10 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; pub struct KafkaBufferProducer { conn: String, @@ -112,25 +115,59 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { - self.consumers - .iter() - .map(|(sequencer_id, consumer)| { - let stream = consumer - .stream() - .map(move |message| { - let message = message?; - let entry = Entry::try_from(message.payload().unwrap().to_vec())?; - let sequence = Sequence { - id: message.partition().try_into()?, - number: message.offset().try_into()?, - }; + let mut streams = vec![]; - Ok(SequencedEntry::new_from_sequence(sequence, entry)?) 
+ for (sequencer_id, consumer) in &self.consumers { + let sequencer_id = *sequencer_id; + let consumer_cloned = Arc::clone(consumer); + let database_name = self.database_name.clone(); + + let stream = consumer + .stream() + .map(move |message| { + let message = message?; + let entry = Entry::try_from(message.payload().unwrap().to_vec())?; + let sequence = Sequence { + id: message.partition().try_into()?, + number: message.offset().try_into()?, + }; + + Ok(SequencedEntry::new_from_sequence(sequence, entry)?) + }) + .boxed(); + + let fetch_high_watermark = move || { + let consumer_cloned = Arc::clone(&consumer_cloned); + let database_name = database_name.clone(); + + let fut = async move { + let (_low, high) = tokio::task::spawn_blocking(move || { + consumer_cloned.fetch_watermarks( + &database_name, + sequencer_id as i32, + Duration::from_secs(60), + ) }) - .boxed(); - (*sequencer_id, stream) - }) - .collect() + .await + .expect("subtask failed")?; + + Ok(high as u64) + }; + + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); + } + + streams } async fn seek( diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index 37659ba05b..a67000633d 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll}; use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::{stream, StreamExt}; +use futures::{stream, FutureExt, StreamExt}; use parking_lot::Mutex; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; type EntryResVec = Vec>; @@ -244,7 +247,40 @@ impl WriteBufferReading for MockBufferForReading { Poll::Pending }) .boxed(); - streams.push((sequencer_id, stream)); + + let shared_state = self.shared_state.clone(); + + let fetch_high_watermark = move || { + let shared_state = shared_state.clone(); + + let fut = async move { + let entries = shared_state.entries.lock(); + let entry_vec = entries.get(&sequencer_id).unwrap(); + let watermark = entry_vec + .iter() + .filter_map(|entry_res| { + entry_res + .as_ref() + .ok() + .map(|entry| entry.sequence().unwrap().number) + }) + .max() + .map(|n| n + 1) + .unwrap_or(0); + + Ok(watermark) + }; + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); } streams From fb931bb1ca5116a8186e922fc7044d2a95e6de5e Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 11:59:38 +0200 Subject: [PATCH 20/27] feat: write buffer ingestion metrics --- server/src/db.rs | 308 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 277 insertions(+), 31 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index f94eb4b28c..4a0aa076b8 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -50,7 +50,7 @@ use std::{ time::{Duration, Instant}, }; use write_buffer::config::WriteBufferConfig; -use write_buffer::core::WriteBufferError; +use write_buffer::core::{FetchHighWatermark, WriteBufferError}; pub mod access; pub mod catalog; @@ -144,6 +144,94 @@ pub enum Error { pub type Result = 
std::result::Result; +/// Metrics for data ingest. +#[derive(Debug)] +struct IngestMetrics { + /// Metrics domain + domain: Arc, +} + +impl IngestMetrics { + fn new(domain: Arc) -> Self { + Self { domain } + } + + fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics { + let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())]; + + let red = self + .domain + .register_red_metric_with_labels(Some("write_buffer"), labels.clone()); + let bytes_read = self.domain.register_counter_metric_with_labels( + "read", + Some("bytes"), + "Bytes read from sequencer", + labels.clone(), + ); + let watermark_iox = self.domain.register_gauge_metric_with_labels( + "watermark_iox", + None, + "High watermark of IOx (aka next sequence number that will be ingested)", + &labels, + ); + let watermark_sequencer = self.domain.register_gauge_metric_with_labels( + "watermark_sequencer", + None, + "High watermark of the sequencer (aka next sequence number that will be added)", + &labels, + ); + let last_min_ts = self.domain.register_gauge_metric_with_labels( + "last_min_ts", + None, + "Minimum unix timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + let last_max_ts = self.domain.register_gauge_metric_with_labels( + "last_max_ts", + None, + "Maximum unix timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + + SequencerMetrics { + red, + bytes_read, + watermark_iox, + watermark_sequencer, + last_min_ts, + last_max_ts, + } + } +} + +/// Metrics for a single sequencer. +#[derive(Debug)] +struct SequencerMetrics { + /// Metrics for tracking ingest. + red: metrics::RedMetric, + + /// Bytes read from sequencer. + /// + /// This metrics is independent of the success / error state of the entries. + bytes_read: metrics::Counter, + + /// Watermark of ingested data. + /// + /// This represents the next sequence number that will be ingested. + watermark_iox: metrics::Gauge, + + /// Watermark of to-be-ingested data. + /// + /// This represents the next sequence number that will be added to the sequencer. + watermark_sequencer: metrics::Gauge, + + /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. + last_min_ts: metrics::Gauge, + + /// Maximum unix timestamp of last write as unix timestamp in nanoseconds. + last_max_ts: metrics::Gauge, +} + /// This is the main IOx Database object. 
It is the root object of any /// specific InfluxDB IOx instance /// @@ -248,8 +336,8 @@ pub struct Db { /// Metric labels metric_labels: Vec, - /// Metrics for tracking the number of errors that occur while ingesting data - ingest_errors: metrics::Counter, + /// Ingest metrics + ingest_metrics: IngestMetrics, /// Optionally connect to a write buffer for either buffering writes or reading buffered writes write_buffer: Option, @@ -286,8 +374,7 @@ impl Db { let ingest_domain = metrics_registry.register_domain_with_labels("ingest", metric_labels.clone()); - let ingest_errors = - ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest"); + let ingest_metrics = IngestMetrics::new(Arc::new(ingest_domain)); let catalog = Arc::new(database_to_commit.catalog); @@ -316,7 +403,7 @@ impl Db { worker_iterations_lifecycle: AtomicUsize::new(0), worker_iterations_cleanup: AtomicUsize::new(0), metric_labels, - ingest_errors, + ingest_metrics, write_buffer: database_to_commit.write_buffer, cleanup_lock: Default::default(), } @@ -687,8 +774,13 @@ impl Db { .try_lock() .expect("no streams should exist at this point"); let mut futures = vec![]; - for (_sequencer_id, stream) in write_buffer.streams() { - let fut = self.stream_in_sequenced_entries(stream); + for (sequencer_id, stream) in write_buffer.streams() { + let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id); + let fut = self.stream_in_sequenced_entries( + stream.stream, + stream.fetch_high_watermark, + metrics, + ); futures.push(fut); } @@ -705,32 +797,116 @@ impl Db { /// This is used to take entries from a `Stream` and put them in the mutable buffer, such as /// streaming entries from a write buffer. - async fn stream_in_sequenced_entries( - &self, - stream: BoxStream<'_, Result>, + async fn stream_in_sequenced_entries<'a>( + &'a self, + mut stream: BoxStream<'a, Result>, + f_mark: FetchHighWatermark<'a>, + mut metrics: SequencerMetrics, ) { - stream - .for_each(|sequenced_entry_result| async { - let sequenced_entry = match sequenced_entry_result { - Ok(sequenced_entry) => sequenced_entry, - Err(e) => { - debug!(?e, "Error converting write buffer data to SequencedEntry"); - self.ingest_errors.add(1); - return; - } - }; + let mut last_watermark_update: Option = None; - let sequenced_entry = Arc::new(sequenced_entry); + while let Some(sequenced_entry_result) = stream.next().await { + let red_observation = metrics.red.observation(); - if let Err(e) = self.store_sequenced_entry(sequenced_entry) { + // get entry from sequencer + let sequenced_entry = match sequenced_entry_result { + Ok(sequenced_entry) => sequenced_entry, + Err(e) => { + debug!(?e, "Error converting write buffer data to SequencedEntry"); + red_observation.client_error(); + continue; + } + }; + let sequenced_entry = Arc::new(sequenced_entry); + + // store entry + match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) { + Ok(_) => { + red_observation.ok(); + } + Err(e) => { debug!( ?e, "Error storing SequencedEntry from write buffer in database" ); - self.ingest_errors.add(1); + red_observation.error(); } - }) - .await + } + + // update: + // - bytes read + // - iox watermark + // - min ts + // - max ts + let sequence = sequenced_entry + .sequence() + .expect("entry from write buffer must be sequenced"); + let entry = sequenced_entry.entry(); + metrics + .watermark_iox + .set((sequence.number + 1) as usize, &[]); + metrics.bytes_read.add(entry.data().len() as u64); + if let Some(min_ts) = entry + .partition_writes() + 
.map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(min, _max)| min) + .max() + }) + .min() + }) + .flatten() + { + metrics + .last_min_ts + .set(min_ts.timestamp_nanos() as usize, &[]); + } + if let Some(max_ts) = entry + .partition_writes() + .map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(_min, max)| max) + .max() + }) + .max() + }) + .flatten() + { + metrics + .last_max_ts + .set(max_ts.timestamp_nanos() as usize, &[]); + } + + // maybe update sequencer watermark + // We are not updating this watermark every round because asking the sequencer for that watermark can be + // quite expensive. + if last_watermark_update + .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .unwrap_or(true) + { + match f_mark().await { + Ok(watermark) => { + metrics.watermark_sequencer.set(watermark as usize, &[]); + } + Err(e) => { + debug!(%e, "Error while reading sequencer watermark") + } + } + last_watermark_update = Some(Instant::now()); + } + } } async fn cleanup_unreferenced_parquet_files( @@ -1244,13 +1420,13 @@ mod tests { .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); let write_buffer = MockBufferForReading::new(write_buffer_state); - let db = TestDb::builder() + let test_db = TestDb::builder() .write_buffer(WriteBufferConfig::Reading(Arc::new( tokio::sync::Mutex::new(Box::new(write_buffer) as _), ))) .build() - .await - .db; + .await; + let db = test_db.db; // do: start background task loop let shutdown: CancellationToken = Default::default(); @@ -1279,6 +1455,71 @@ mod tests { tokio::time::sleep(Duration::from_millis(100)).await; } + // check: metrics + // We need to do that BEFORE shutting down the background loop because gauges would be dropped and resetted otherwise + let metrics = test_db.metric_registry; + metrics + .has_metric_family("ingest_write_buffer_requests_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "ok"), + ]) + .counter() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_read_bytes_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .counter() + .eq(256.0) + .unwrap(); + metrics + .has_metric_family("ingest_watermark_iox") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_watermark_sequencer") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_last_min_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(10.0) + .unwrap(); + metrics + .has_metric_family("ingest_last_max_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(10.0) + .unwrap(); + // do: stop background task loop shutdown.cancel(); join_handle.await.unwrap(); @@ -1325,11 +1566,16 @@ mod tests { // check: after a while the error should be reported in the database's metrics let t_0 = Instant::now(); loop { - let family = metrics.try_has_metric_family("ingest_errors_total"); + 
let family = metrics.try_has_metric_family("ingest_write_buffer_requests_total"); if let Ok(metric) = family { if metric - .with_labels(&[("db_name", "placeholder"), ("svr_id", "1")]) + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "client_error"), + ]) .counter() .eq(1.0) .is_ok() From ffe6e62aeef5540ccf4cbca15a04cd92008b7635 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Jul 2021 12:43:27 +0100 Subject: [PATCH 21/27] feat: add instant to datetime conversion (#2078) * feat: add instant to datetime conversion * chore: review feedback --- Cargo.lock | 1 + data_types/Cargo.toml | 1 + data_types/src/instant.rs | 53 ++++++++++++++++ data_types/src/lib.rs | 3 +- .../src/persistence_windows.rs | 62 +++++++------------ 5 files changed, 78 insertions(+), 42 deletions(-) create mode 100644 data_types/src/instant.rs diff --git a/Cargo.lock b/Cargo.lock index cbc1081590..ec6ccbe445 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -826,6 +826,7 @@ dependencies = [ "influxdb_line_protocol", "num_cpus", "observability_deps", + "once_cell", "percent-encoding", "regex", "serde", diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index 197174f14e..fd145052f9 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -15,6 +15,7 @@ regex = "1.4" serde = { version = "1.0", features = ["rc", "derive"] } snafu = "0.6" observability_deps = { path = "../observability_deps" } +once_cell = { version = "1.4.0", features = ["parking_lot"] } [dev-dependencies] # In alphabetical order test_helpers = { path = "../test_helpers" } diff --git a/data_types/src/instant.rs b/data_types/src/instant.rs new file mode 100644 index 0000000000..807bcbba49 --- /dev/null +++ b/data_types/src/instant.rs @@ -0,0 +1,53 @@ +use chrono::{DateTime, Utc}; +use once_cell::sync::OnceCell; +use std::time::Instant; + +/// Stores an Instant and DateTime captured as close as possible together +static INSTANCE: OnceCell<(DateTime, Instant)> = OnceCell::new(); + +/// Provides a conversion from Instant to DateTime for display purposes +/// +/// It is an approximation as if the system clock changes, the returned DateTime will not be +/// the same as the DateTime that would have been recorded at the time the Instant was created. +/// +/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger +/// Instant will have a larger returned DateTime. 
+/// +/// This should ONLY be used for display purposes, the results should not be used to +/// drive logic, nor persisted +pub fn to_approximate_datetime(instant: Instant) -> DateTime { + let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now())); + + if ref_instant > instant { + ref_date + - chrono::Duration::from_std(ref_instant.duration_since(instant)) + .expect("date overflow") + } else { + ref_date + + chrono::Duration::from_std(instant.duration_since(ref_instant)) + .expect("date overflow") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_to_datetime() { + // Seed global state + to_approximate_datetime(Instant::now()); + + let (ref_date, ref_instant) = *INSTANCE.get().unwrap(); + + assert_eq!( + to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)), + ref_date + chrono::Duration::nanoseconds(78) + ); + + assert_eq!( + to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)), + ref_date - chrono::Duration::nanoseconds(23) + ); + } +} diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index 76d7ca0306..f222aad0ff 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -13,13 +13,14 @@ pub mod chunk_metadata; pub mod consistent_hasher; mod database_name; -pub use database_name::*; pub mod database_rules; pub mod database_state; pub mod error; +pub mod instant; pub mod job; pub mod names; pub mod partition_metadata; pub mod server_id; pub mod timestamp; pub mod write_summary; +pub use database_name::*; diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 957d034d06..8cb97d0694 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -13,6 +13,7 @@ use internal_types::guard::{ReadGuard, ReadLock}; use crate::checkpoint::PartitionCheckpoint; use crate::min_max_sequence::MinMaxSequence; +use data_types::instant::to_approximate_datetime; const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30); @@ -45,15 +46,8 @@ pub struct PersistenceWindows { late_arrival_period: Duration, closed_window_period: Duration, - /// The datetime this PersistenceWindows was created - /// - /// `PersistenceWindows` internally uses monotonic `Instant`, however, - /// these cannot be rendered. 
To provide a stable rendering of Wall timestamp, - /// a single timestamp is recorded at creation time - created_at_time: DateTime, - /// The instant this PersistenceWindows was created - created_at_instant: Instant, + created_at: Instant, /// The last instant passed to PersistenceWindows::add_range last_instant: Instant, @@ -116,7 +110,6 @@ impl PersistenceWindows { let closed_window_count = late_arrival_seconds / closed_window_seconds; - let created_at_time = Utc::now(); let created_at_instant = Instant::now(); Self { @@ -126,8 +119,7 @@ impl PersistenceWindows { addr, late_arrival_period, closed_window_period, - created_at_time, - created_at_instant, + created_at: created_at_instant, last_instant: created_at_instant, max_sequence_numbers: Default::default(), } @@ -362,25 +354,12 @@ impl PersistenceWindows { /// These are approximate because persistence may partially flush a window, which will /// update the min row timestamp but not the row count pub fn summaries(&self) -> impl Iterator + '_ { - self.windows().map(move |window| { - let window_age = chrono::Duration::from_std( - window.created_at.duration_since(self.created_at_instant), - ) - .expect("duration overflow"); - - let time_of_first_write = self.created_at_time + window_age; - - let window_duration = - chrono::Duration::from_std(window.last_instant.duration_since(window.created_at)) - .expect("duration overflow"); - - WriteSummary { - time_of_first_write, - time_of_last_write: time_of_first_write + window_duration, - min_timestamp: window.min_time, - max_timestamp: window.max_time, - row_count: window.row_count, - } + self.windows().map(move |window| WriteSummary { + time_of_first_write: to_approximate_datetime(window.created_at), + time_of_last_write: to_approximate_datetime(window.last_instant), + min_timestamp: window.min_time, + max_timestamp: window.max_time, + row_count: window.row_count, }) } @@ -1333,7 +1312,8 @@ mod tests { fn test_summaries() { let late_arrival_period = Duration::from_secs(100); let mut w = make_windows(late_arrival_period); - let instant = w.created_at_instant; + let instant = w.created_at; + let created_at_time = to_approximate_datetime(w.created_at); // Window 1 w.add_range( @@ -1387,17 +1367,17 @@ mod tests { summaries, vec![ WriteSummary { - time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), - time_of_last_write: w.created_at_time + chrono::Duration::milliseconds(50), + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + chrono::Duration::milliseconds(50), min_timestamp: Utc.timestamp_nanos(1), max_timestamp: Utc.timestamp_nanos(340), row_count: 21 }, WriteSummary { - time_of_first_write: w.created_at_time + time_of_first_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), - time_of_last_write: w.created_at_time + time_of_last_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), min_timestamp: Utc.timestamp_nanos(89), @@ -1405,8 +1385,8 @@ mod tests { row_count: 3 }, WriteSummary { - time_of_first_write: w.created_at_time + closed_duration * 3, - time_of_last_write: w.created_at_time + closed_duration * 3, + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, min_timestamp: Utc.timestamp_nanos(3), max_timestamp: Utc.timestamp_nanos(4), row_count: 8 @@ -1424,8 +1404,8 @@ mod tests { summaries, vec![ WriteSummary { - time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), - 
time_of_last_write: w.created_at_time + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), min_timestamp: Utc.timestamp_nanos(1), @@ -1433,8 +1413,8 @@ mod tests { row_count: 24 }, WriteSummary { - time_of_first_write: w.created_at_time + closed_duration * 3, - time_of_last_write: w.created_at_time + closed_duration * 3, + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, min_timestamp: Utc.timestamp_nanos(3), max_timestamp: Utc.timestamp_nanos(4), row_count: 8 From 7d597d1d5c7ec5989429d0cc4c0b0beee3ed35ff Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 13:57:53 +0200 Subject: [PATCH 22/27] refactor: make ingest metrics easier to understand --- server/src/db.rs | 116 ++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 4a0aa076b8..8f74114b08 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -168,16 +168,16 @@ impl IngestMetrics { "Bytes read from sequencer", labels.clone(), ); - let watermark_iox = self.domain.register_gauge_metric_with_labels( - "watermark_iox", + let last_sequence_number = self.domain.register_gauge_metric_with_labels( + "last_sequence_number", None, - "High watermark of IOx (aka next sequence number that will be ingested)", + "Last consumed sequence number (e.g. Kafka offset)", &labels, ); - let watermark_sequencer = self.domain.register_gauge_metric_with_labels( - "watermark_sequencer", + let sequence_number_lag = self.domain.register_gauge_metric_with_labels( + "sequence_number_lag", None, - "High watermark of the sequencer (aka next sequence number that will be added)", + "The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number available", &labels, ); let last_min_ts = self.domain.register_gauge_metric_with_labels( @@ -196,8 +196,8 @@ impl IngestMetrics { SequencerMetrics { red, bytes_read, - watermark_iox, - watermark_sequencer, + last_sequence_number, + sequence_number_lag, last_min_ts, last_max_ts, } @@ -215,15 +215,12 @@ struct SequencerMetrics { /// This metrics is independent of the success / error state of the entries. bytes_read: metrics::Counter, - /// Watermark of ingested data. - /// - /// This represents the next sequence number that will be ingested. - watermark_iox: metrics::Gauge, + /// Last consumed sequence number (e.g. Kafka offset). + last_sequence_number: metrics::Gauge, - /// Watermark of to-be-ingested data. - /// - /// This represents the next sequence number that will be added to the sequencer. - watermark_sequencer: metrics::Gauge, + /// The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number + /// available. + sequence_number_lag: metrics::Gauge, /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. 
last_min_ts: metrics::Gauge, @@ -803,7 +800,8 @@ impl Db { f_mark: FetchHighWatermark<'a>, mut metrics: SequencerMetrics, ) { - let mut last_watermark_update: Option = None; + let mut watermark_last_updated: Option = None; + let mut watermark = 0; while let Some(sequenced_entry_result) = stream.next().await { let red_observation = metrics.red.observation(); @@ -833,19 +831,42 @@ impl Db { } } + // maybe update sequencer watermark + // We are not updating this watermark every round because asking the sequencer for that watermark can be + // quite expensive. + if watermark_last_updated + .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .unwrap_or(true) + { + match f_mark().await { + Ok(w) => { + watermark = w; + } + Err(e) => { + debug!(%e, "Error while reading sequencer watermark") + } + } + watermark_last_updated = Some(Instant::now()); + } + // update: // - bytes read - // - iox watermark + // - last sequence number + // - lag // - min ts // - max ts let sequence = sequenced_entry .sequence() .expect("entry from write buffer must be sequenced"); let entry = sequenced_entry.entry(); - metrics - .watermark_iox - .set((sequence.number + 1) as usize, &[]); metrics.bytes_read.add(entry.data().len() as u64); + metrics + .last_sequence_number + .set(sequence.number as usize, &[]); + metrics.sequence_number_lag.set( + watermark.saturating_sub(sequence.number).saturating_sub(1) as usize, + &[], + ); if let Some(min_ts) = entry .partition_writes() .map(|partition_writes| { @@ -888,24 +909,6 @@ impl Db { .last_max_ts .set(max_ts.timestamp_nanos() as usize, &[]); } - - // maybe update sequencer watermark - // We are not updating this watermark every round because asking the sequencer for that watermark can be - // quite expensive. - if last_watermark_update - .map(|ts| ts.elapsed() > Duration::from_secs(60)) - .unwrap_or(true) - { - match f_mark().await { - Ok(watermark) => { - metrics.watermark_sequencer.set(watermark as usize, &[]); - } - Err(e) => { - debug!(%e, "Error while reading sequencer watermark") - } - } - last_watermark_update = Some(Instant::now()); - } } } @@ -1414,10 +1417,18 @@ mod tests { #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { - let entry = lp_to_entry("cpu bar=1 10"); let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1); - write_buffer_state - .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10")) + .unwrap(), + ); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence( + Sequence::new(0, 7), + lp_to_entry("cpu bar=2 20\ncpu bar=3 30"), + ) + .unwrap(), + ); let write_buffer = MockBufferForReading::new(write_buffer_state); let test_db = TestDb::builder() @@ -1467,7 +1478,7 @@ mod tests { ("status", "ok"), ]) .counter() - .eq(1.0) + .eq(2.0) .unwrap(); metrics .has_metric_family("ingest_read_bytes_total") @@ -1477,27 +1488,27 @@ mod tests { ("sequencer_id", "0"), ]) .counter() - .eq(256.0) + .eq(528.0) .unwrap(); metrics - .has_metric_family("ingest_watermark_iox") + .has_metric_family("ingest_last_sequence_number") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), ("sequencer_id", "0"), ]) .gauge() - .eq(1.0) + .eq(7.0) .unwrap(); metrics - .has_metric_family("ingest_watermark_sequencer") + .has_metric_family("ingest_sequence_number_lag") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), ("sequencer_id", "0"), ]) .gauge() - .eq(1.0) + 
.eq(0.0) .unwrap(); metrics .has_metric_family("ingest_last_min_ts") @@ -1507,7 +1518,7 @@ mod tests { ("sequencer_id", "0"), ]) .gauge() - .eq(10.0) + .eq(20.0) .unwrap(); metrics .has_metric_family("ingest_last_max_ts") @@ -1517,7 +1528,7 @@ mod tests { ("sequencer_id", "0"), ]) .gauge() - .eq(10.0) + .eq(30.0) .unwrap(); // do: stop background task loop @@ -1525,13 +1536,14 @@ mod tests { join_handle.await.unwrap(); // check: the expected results should be there - let batches = run_query(db, "select * from cpu").await; + let batches = run_query(db, "select * from cpu order by time").await; let expected = vec![ "+-----+-------------------------------+", "| bar | time |", "+-----+-------------------------------+", - "| 1 | 1970-01-01 00:00:00.000000010 |", + "| 2 | 1970-01-01 00:00:00.000000020 |", + "| 3 | 1970-01-01 00:00:00.000000030 |", "+-----+-------------------------------+", ]; assert_batches_eq!(expected, &batches); From 4d5f2090306fd737a026d1bb53ee3c347553f408 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 14:59:07 +0200 Subject: [PATCH 23/27] docs: do not repeat unix that often --- server/src/db.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 8f74114b08..eff1e589d2 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -183,13 +183,13 @@ impl IngestMetrics { let last_min_ts = self.domain.register_gauge_metric_with_labels( "last_min_ts", None, - "Minimum unix timestamp of last write as unix timestamp in nanoseconds", + "Minimum timestamp of last write as unix timestamp in nanoseconds", &labels, ); let last_max_ts = self.domain.register_gauge_metric_with_labels( "last_max_ts", None, - "Maximum unix timestamp of last write as unix timestamp in nanoseconds", + "Maximum timestamp of last write as unix timestamp in nanoseconds", &labels, ); @@ -222,10 +222,10 @@ struct SequencerMetrics { /// available. sequence_number_lag: metrics::Gauge, - /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. + /// Minimum timestamp of last write as unix timestamp in nanoseconds. last_min_ts: metrics::Gauge, - /// Maximum unix timestamp of last write as unix timestamp in nanoseconds. + /// Maximum timestamp of last write as unix timestamp in nanoseconds. last_max_ts: metrics::Gauge, } From 2f1efcf517f7504d728f6faddbf296bdc0e5ea26 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:00:53 +0200 Subject: [PATCH 24/27] docs: clarify difference --- server/src/db.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index eff1e589d2..949ba5225d 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -177,7 +177,7 @@ impl IngestMetrics { let sequence_number_lag = self.domain.register_gauge_metric_with_labels( "sequence_number_lag", None, - "The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number available", + "The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed sequence number", &labels, ); let last_min_ts = self.domain.register_gauge_metric_with_labels( @@ -218,8 +218,8 @@ struct SequencerMetrics { /// Last consumed sequence number (e.g. Kafka offset). last_sequence_number: metrics::Gauge, - /// The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number - /// available. + // The difference between the the last sequence number available (e.g. 
Kafka offset) and (= minus) last consumed + // sequence number. sequence_number_lag: metrics::Gauge, /// Minimum timestamp of last write as unix timestamp in nanoseconds. From fd00206fbbd43cbcff5e743a7ca8fcbb798c733d Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:02:48 +0200 Subject: [PATCH 25/27] refactor: increase watermark update frequence to once per 10s --- server/src/db.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/db.rs b/server/src/db.rs index 949ba5225d..216b99effb 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -835,7 +835,7 @@ impl Db { // We are not updating this watermark every round because asking the sequencer for that watermark can be // quite expensive. if watermark_last_updated - .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .map(|ts| ts.elapsed() > Duration::from_secs(10)) .unwrap_or(true) { match f_mark().await { From cddf94653cb741649f347c128e4b6759d485899c Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:07:59 +0200 Subject: [PATCH 26/27] refactor: use `write_buffer` subsystem for ingest metrics --- server/src/db.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 216b99effb..6dee5446b1 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -144,14 +144,14 @@ pub enum Error { pub type Result = std::result::Result; -/// Metrics for data ingest. +/// Metrics for data ingest via write buffer. #[derive(Debug)] -struct IngestMetrics { +struct WriteBufferIngestMetrics { /// Metrics domain domain: Arc, } -impl IngestMetrics { +impl WriteBufferIngestMetrics { fn new(domain: Arc) -> Self { Self { domain } } @@ -161,7 +161,7 @@ impl IngestMetrics { let red = self .domain - .register_red_metric_with_labels(Some("write_buffer"), labels.clone()); + .register_red_metric_with_labels(Some("ingest"), labels.clone()); let bytes_read = self.domain.register_counter_metric_with_labels( "read", Some("bytes"), @@ -334,7 +334,7 @@ pub struct Db { metric_labels: Vec, /// Ingest metrics - ingest_metrics: IngestMetrics, + ingest_metrics: WriteBufferIngestMetrics, /// Optionally connect to a write buffer for either buffering writes or reading buffered writes write_buffer: Option, @@ -370,8 +370,8 @@ impl Db { let metric_labels = database_to_commit.catalog.metric_labels.clone(); let ingest_domain = - metrics_registry.register_domain_with_labels("ingest", metric_labels.clone()); - let ingest_metrics = IngestMetrics::new(Arc::new(ingest_domain)); + metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone()); + let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain)); let catalog = Arc::new(database_to_commit.catalog); @@ -1470,7 +1470,7 @@ mod tests { // We need to do that BEFORE shutting down the background loop because gauges would be dropped and resetted otherwise let metrics = test_db.metric_registry; metrics - .has_metric_family("ingest_write_buffer_requests_total") + .has_metric_family("write_buffer_ingest_requests_total") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1481,7 +1481,7 @@ mod tests { .eq(2.0) .unwrap(); metrics - .has_metric_family("ingest_read_bytes_total") + .has_metric_family("write_buffer_read_bytes_total") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1491,7 +1491,7 @@ mod tests { .eq(528.0) .unwrap(); metrics - .has_metric_family("ingest_last_sequence_number") + 
.has_metric_family("write_buffer_last_sequence_number") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1501,7 +1501,7 @@ mod tests { .eq(7.0) .unwrap(); metrics - .has_metric_family("ingest_sequence_number_lag") + .has_metric_family("write_buffer_sequence_number_lag") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1511,7 +1511,7 @@ mod tests { .eq(0.0) .unwrap(); metrics - .has_metric_family("ingest_last_min_ts") + .has_metric_family("write_buffer_last_min_ts") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1521,7 +1521,7 @@ mod tests { .eq(20.0) .unwrap(); metrics - .has_metric_family("ingest_last_max_ts") + .has_metric_family("write_buffer_last_max_ts") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1578,7 +1578,7 @@ mod tests { // check: after a while the error should be reported in the database's metrics let t_0 = Instant::now(); loop { - let family = metrics.try_has_metric_family("ingest_write_buffer_requests_total"); + let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total"); if let Ok(metric) = family { if metric From 55490c279a0e93f9199f7debf9b8aaf5f837cc57 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:21:52 +0200 Subject: [PATCH 27/27] fix: Kafka watermark error for new partitions --- write_buffer/src/kafka.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index c786f93066..9f971ef9ac 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -14,6 +14,7 @@ use rdkafka::{ consumer::{BaseConsumer, Consumer, StreamConsumer}, error::KafkaError, producer::{FutureProducer, FutureRecord}, + types::RDKafkaErrorCode, util::Timeout, ClientConfig, Message, Offset, TopicPartitionList, }; @@ -141,7 +142,7 @@ impl WriteBufferReading for KafkaBufferConsumer { let database_name = database_name.clone(); let fut = async move { - let (_low, high) = tokio::task::spawn_blocking(move || { + match tokio::task::spawn_blocking(move || { consumer_cloned.fetch_watermarks( &database_name, sequencer_id as i32, @@ -149,9 +150,12 @@ impl WriteBufferReading for KafkaBufferConsumer { ) }) .await - .expect("subtask failed")?; - - Ok(high as u64) + .expect("subtask failed") + { + Ok((_low, high)) => Ok(high as u64), + Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0), + Err(e) => Err(Box::new(e) as Box), + } }; fut.boxed() as FetchHighWatermarkFut<'_>