// influxdb/server/src/db.rs (scraped page header: 4031 lines, 155 KiB, Rust)
//! This module contains the main IOx Database object which has the
//! instances of the mutable buffer, read buffer, and object store
use std::{
any::Any,
collections::{HashMap, HashSet},
num::NonZeroUsize,
sync::{
atomic::{AtomicUsize, Ordering},
Arc,
},
time::Duration,
};
use ::lifecycle::select_persistable_chunks;
use async_trait::async_trait;
use parking_lot::{Mutex, RwLock};
use rand_distr::{Distribution, Poisson};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
pub use ::lifecycle::{LifecycleChunk, LockableChunk, LockablePartition};
use data_types::partition_metadata::PartitionAddr;
use data_types::{
chunk_metadata::{ChunkId, ChunkLifecycleAction, ChunkOrder, ChunkSummary},
database_rules::DatabaseRules,
partition_metadata::{PartitionSummary, TableSummary},
sequence::Sequence,
server_id::ServerId,
};
use datafusion::catalog::{catalog::CatalogProvider, schema::SchemaProvider};
use entry::{Entry, SequencedEntry, TableBatch};
use iox_object_store::IoxObjectStore;
use mutable_buffer::chunk::{ChunkMetrics as MutableBufferChunkMetrics, MBChunk};
use observability_deps::tracing::{debug, error, info, warn};
use parquet_file::catalog::{
cleanup::{delete_files as delete_parquet_files, get_unreferenced_parquet_files},
core::PreservedCatalog,
interface::{CatalogParquetInfo, CheckpointData, ChunkAddrWithoutDatabase},
prune::prune_history as prune_catalog_transaction_history,
};
use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::PersistenceWindows};
use predicate::{delete_predicate::DeletePredicate, predicate::Predicate};
use query::{
exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext},
QueryDatabase,
};
use schema::Schema;
use time::{Time, TimeProvider};
use trace::ctx::SpanContext;
use write_buffer::core::{WriteBufferReading, WriteBufferWriting};
pub(crate) use crate::db::chunk::DbChunk;
use crate::{
db::{
access::QueryCatalogAccess,
catalog::{
chunk::{CatalogChunk, ChunkStage},
partition::Partition,
table::TableSchemaUpsertHandle,
Catalog, Error as CatalogError, TableNameFilter,
},
lifecycle::{LockableCatalogChunk, LockableCatalogPartition, WeakDb},
},
JobRegistry,
};
pub mod access;
pub mod catalog;
mod chunk;
mod lifecycle;
pub mod load;
pub mod pred;
mod replay;
mod streams;
mod system_tables;
/// Errors that can arise from the top-level [`Db`] operations in this module.
///
/// Variants marked `#[snafu(context(false))]` are converted automatically
/// from their `source` error via `?`; the rest are built through snafu
/// context selectors (e.g. `FreezingChunk`, `LifecycleError`) at call sites.
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Snafu)]
pub enum Error {
/// Auto-converted from a catalog error (no context selector needed).
#[snafu(context(false))]
CatalogError { source: catalog::Error },
/// Auto-converted from a partition error (no context selector needed).
#[snafu(context(false))]
PartitionError { source: catalog::partition::Error },
#[snafu(display("Lifecycle error: {}", source))]
LifecycleError { source: lifecycle::Error },
#[snafu(display("Error freezing chunk while rolling over partition: {}", source))]
FreezingChunk { source: catalog::chunk::Error },
#[snafu(display("Error sending entry to write buffer"))]
WriteBufferWritingError {
source: Box<dyn std::error::Error + Sync + Send>,
},
/// Raised when writing to a database whose lifecycle rules mark it immutable.
#[snafu(display("Cannot write to this database: no mutable buffer configured"))]
DatabaseNotWriteable {},
/// Raised when catalog memory usage exceeds `buffer_size_hard`.
#[snafu(display("Hard buffer size limit reached"))]
HardLimitReached {},
#[snafu(display("Can not write entry {}:{} : {}", partition_key, chunk_id.get(), source))]
WriteEntry {
partition_key: String,
chunk_id: ChunkId,
source: mutable_buffer::chunk::Error,
},
#[snafu(display("Cannot write entry to new open chunk {}: {}", partition_key, source))]
WriteEntryInitial {
partition_key: String,
source: mutable_buffer::chunk::Error,
},
#[snafu(display(
"Cannot delete data from non-existing table, {}: {}",
table_name,
source
))]
DeleteFromTable {
table_name: String,
source: CatalogError,
},
/// Aggregates a bounded number of per-batch errors collected while storing
/// a sequenced entry (see `MAX_ERRORS_PER_SEQUENCED_ENTRY`).
#[snafu(display(
"Storing sequenced entry failed with the following error(s), and possibly more: {}",
errors.iter().map(ToString::to_string).collect::<Vec<_>>().join(", ")
))]
StoreSequencedEntryFailures { errors: Vec<Error> },
/// The awaited background lifecycle job was aborted before completion.
#[snafu(display("background task cancelled: {}", source))]
TaskCancelled { source: futures::future::Aborted },
#[snafu(display("error computing time summary on table batch: {}", source))]
TableBatchTimeError { source: entry::Error },
#[snafu(display("error batch had null times"))]
TableBatchMissingTimes {},
#[snafu(display("Table batch has invalid schema: {}", source))]
TableBatchSchemaExtractError { source: schema::builder::Error },
#[snafu(display("Table batch has mismatching schema: {}", source))]
TableBatchSchemaMergeError { source: schema::merge::Error },
/// Nothing is flushable for this partition right now as per lifecycle rules.
#[snafu(display(
"Unable to flush partition at the moment {}:{}",
table_name,
partition_key,
))]
CannotFlushPartition {
table_name: String,
partition_key: String,
},
#[snafu(display("Partition {} has no open chunk", addr))]
NoOpenChunk { addr: PartitionAddr },
#[snafu(display("Cannot create replay plan: {}", source))]
ReplayPlanError {
source: persistence_windows::checkpoint::Error,
},
#[snafu(display("Cannot replay: {}", source))]
ReplayError { source: crate::db::replay::Error },
#[snafu(display(
"Error while commiting delete predicate on preserved catalog: {}",
source
))]
CommitDeletePredicateError {
source: parquet_file::catalog::core::Error,
},
}
/// Convenience result type defaulting the error to this module's [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// `Db` is an instance-local, queryable, possibly persisted, and possibly mutable data store
///
/// It is responsible for:
///
/// * Receiving new writes for this IOx instance
/// * Exposing APIs for the lifecycle policy to compact/persist data
/// * Exposing APIs for the query engine to use to query data
///
/// The data in a `Db` is structured in this way:
///
/// ┌───────────────────────────────────────────────┐
/// │ │
/// │ ┌────────────────┐ │
/// │ │ Database │ │
/// │ └────────────────┘ │
/// │ │ multiple Tables (measurements) │
/// │ ▼ │
/// │ ┌────────────────┐ │
/// │ │ Table │ │
/// │ └────────────────┘ │
/// │ │ one partition per │
/// │ │ partition_key │
/// │ ▼ │
/// │ ┌────────────────┐ │
/// │ │ Partition │ │
/// │ └────────────────┘ │
/// │ │ one open Chunk │
/// │ │ zero or more closed │
/// │ ▼ Chunks │
/// │ ┌────────────────┐ │
/// │ │ Chunk │ │
/// │ └────────────────┘ │
/// │ │ multiple Columns │
/// │ ▼ │
/// │ ┌────────────────┐ │
/// │ │ Column │ │
/// │ └────────────────┘ │
/// │ │
/// └───────────────────────────────────────────────┘
///
/// Each row of data is routed into a particular partitions based on
/// column values in that row. The partition's open chunk is updated
/// with the new data.
///
/// The currently open chunk in a partition can be rolled over. When
/// this happens, the chunk is closed (becomes read-only) and stops
/// taking writes. Any new writes to the same partition will create a
/// new active open chunk.
///
/// Catalog Usage: the state of the catalog and the state of the `Db`
/// must remain in sync. If they are ever out of sync, the IOx system
/// should be shutdown and forced through a "recovery" to correctly
/// reconcile the state.
///
/// Ensuring the Catalog and Db remain in sync is accomplished by
/// manipulating the catalog state alongside the state in the `Db`
/// itself. The catalog state can be observed (but not mutated) by things
/// outside of the Db
#[derive(Debug)]
pub struct Db {
/// Current database rules; replaced wholesale by [`Db::update_rules`].
rules: RwLock<Arc<DatabaseRules>>,
/// Database name (taken from `rules.name` at construction time).
name: Arc<str>,
server_id: ServerId, // this is also the Query Server ID
/// Interface to use for persistence
iox_object_store: Arc<IoxObjectStore>,
/// Executor for running queries
exec: Arc<Executor>,
/// Preserved catalog (data in object store).
preserved_catalog: Arc<PreservedCatalog>,
/// The catalog holds chunks of data under partitions for the database.
/// The underlying chunks may be backed by different execution engines
/// depending on their stage in the data lifecycle. Currently there are
/// three backing engines for Chunks:
///
/// - The Mutable Buffer where chunks are mutable but also queryable;
/// - The Read Buffer where chunks are immutable and stored in an optimised
/// compressed form for small footprint and fast query execution; and
/// - The Parquet Buffer where chunks are backed by Parquet file data.
catalog: Arc<Catalog>,
/// A handle to the global jobs registry for long running tasks
jobs: Arc<JobRegistry>,
/// The global metric registry
metric_registry: Arc<metric::Registry>,
/// Catalog interface for query
catalog_access: Arc<QueryCatalogAccess>,
/// Number of iterations of the worker lifecycle loop for this Db
worker_iterations_lifecycle: AtomicUsize,
/// Number of iterations of the worker cleanup loop for this Db
worker_iterations_cleanup: AtomicUsize,
/// Number of iterations of the worker delete predicate preservation loop for this Db
worker_iterations_delete_predicate_preservation: AtomicUsize,
/// Optional write buffer producer
/// TODO: Move onto Database
write_buffer_producer: Option<Arc<dyn WriteBufferWriting>>,
/// Lock that prevents the cleanup job from deleting files that are written but not yet added to the preserved
/// catalog.
///
/// The cleanup job needs exclusive access and hence will acquire a write-guard. Creating parquet files and creating
/// catalog transaction only needs shared access and hence will acquire a read-guard.
cleanup_lock: Arc<tokio::sync::RwLock<()>>,
/// Lifecycle policy.
///
/// Optional because it will be created after `Arc<Self>`.
///
/// This is stored here for the following reasons:
/// - to control the persistence suppression via a [`Db::unsuppress_persistence`]
/// - to keep the lifecycle state (e.g. the number of running compactions) around
lifecycle_policy: Mutex<Option<::lifecycle::LifecyclePolicy<WeakDb>>>,
/// Source of "now" for lifecycle checks and catalog transaction pruning.
time_provider: Arc<dyn TimeProvider>,
/// To-be-written delete predicates.
delete_predicates_mailbox: Mutex<Vec<(Arc<DeletePredicate>, Vec<ChunkAddrWithoutDatabase>)>>,
/// TESTING ONLY: Override of IDs for persisted chunks.
persisted_chunk_id_override: Mutex<Option<ChunkId>>,
}
/// All the information needed to commit a database
///
/// The fields mirror the corresponding fields of [`Db`]; the whole struct is
/// consumed by [`Db::new`].
#[derive(Debug)]
pub(crate) struct DatabaseToCommit {
pub(crate) server_id: ServerId,
pub(crate) iox_object_store: Arc<IoxObjectStore>,
pub(crate) exec: Arc<Executor>,
pub(crate) preserved_catalog: PreservedCatalog,
pub(crate) catalog: Catalog,
pub(crate) rules: Arc<DatabaseRules>,
pub(crate) time_provider: Arc<dyn TimeProvider>,
/// TODO: Move onto Database
pub(crate) write_buffer_producer: Option<Arc<dyn WriteBufferWriting>>,
pub(crate) metric_registry: Arc<metric::Registry>,
}
impl Db {
/// Builds a [`Db`] from its parts and installs the lifecycle policy.
///
/// Construction is two-phase: the `Db` is first created with
/// `lifecycle_policy: None`, wrapped in an `Arc`, and only then is the
/// policy — which needs a [`WeakDb`] back-reference to that `Arc` —
/// put in place. Persistence starts suppressed
/// (`new_suppress_persistence`) until [`Db::unsuppress_persistence`] is
/// called.
pub(crate) fn new(database_to_commit: DatabaseToCommit, jobs: Arc<JobRegistry>) -> Arc<Self> {
let name = Arc::from(database_to_commit.rules.name.as_str());
let rules = RwLock::new(database_to_commit.rules);
let server_id = database_to_commit.server_id;
let iox_object_store = Arc::clone(&database_to_commit.iox_object_store);
let catalog = Arc::new(database_to_commit.catalog);
let catalog_access = QueryCatalogAccess::new(
&*name,
Arc::clone(&catalog),
Arc::clone(&jobs),
database_to_commit.metric_registry.as_ref(),
);
let catalog_access = Arc::new(catalog_access);
let this = Self {
rules,
name,
server_id,
iox_object_store,
exec: database_to_commit.exec,
preserved_catalog: Arc::new(database_to_commit.preserved_catalog),
catalog,
jobs,
metric_registry: database_to_commit.metric_registry,
catalog_access,
worker_iterations_lifecycle: AtomicUsize::new(0),
worker_iterations_cleanup: AtomicUsize::new(0),
worker_iterations_delete_predicate_preservation: AtomicUsize::new(0),
write_buffer_producer: database_to_commit.write_buffer_producer,
cleanup_lock: Default::default(),
lifecycle_policy: Mutex::new(None),
time_provider: database_to_commit.time_provider,
delete_predicates_mailbox: Default::default(),
persisted_chunk_id_override: Default::default(),
};
let this = Arc::new(this);
// `this` has not been shared with anyone yet, so the mutex cannot be
// contended and `try_lock` cannot fail here.
*this.lifecycle_policy.try_lock().expect("not used yet") = Some(
::lifecycle::LifecyclePolicy::new_suppress_persistence(WeakDb(Arc::downgrade(&this))),
);
this
}
/// Lists the names of all tables known to this database's catalog.
pub fn table_names(&self) -> Vec<String> {
    self.catalog.as_ref().table_names()
}
/// Allow persistence if database rules allow it.
pub fn unsuppress_persistence(&self) {
let mut guard = self.lifecycle_policy.lock();
// The policy is `Option` only because it is installed after `Arc<Self>`
// creation in `Db::new`; by the time this is called it must exist.
let policy = guard
.as_mut()
.expect("lifecycle policy should be initialized");
policy.unsuppress_persistence();
}
/// Return a handle to the executor used to run queries
pub fn executor(&self) -> Arc<Executor> {
    let exec = &self.exec;
    Arc::clone(exec)
}
/// Returns a snapshot of the current database rules.
pub fn rules(&self) -> Arc<DatabaseRules> {
    let guard = self.rules.read();
    Arc::clone(&*guard)
}
/// Returns the name of this database.
pub fn name(&self) -> Arc<str> {
Arc::clone(&self.name)
}
/// Updates the database rules
///
/// If the late-arrival window changed, the persistence windows of all
/// partitions are updated to the new value.
pub fn update_rules(&self, new_rules: Arc<DatabaseRules>) {
let late_arrive_window_updated = {
let mut rules = self.rules.write();
info!(db_name=%rules.name, "updating rules for database");
// Detect a change of the late-arrival window before overwriting.
let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds
!= new_rules.lifecycle_rules.late_arrive_window_seconds;
*rules = new_rules;
late_arrive_window_updated
};
if late_arrive_window_updated {
// Hold a read lock to prevent concurrent modification and
// use values from re-acquired read guard
//
// NOTE(review): the write lock above is released before this read lock
// is taken, so another `update_rules` call could interleave; the windows
// below are then set from whatever rules are current at this point —
// confirm this is the intended semantics.
let current = self.rules.read();
// Update windows
let partitions = self.catalog.partitions();
for partition in &partitions {
let mut partition = partition.write();
let addr = partition.addr().clone();
if let Some(windows) = partition.persistence_windows_mut() {
info!(partition=%addr, "updating persistence windows");
windows.set_late_arrival_period(Duration::from_secs(
current.lifecycle_rules.late_arrive_window_seconds.get() as u64,
))
}
}
}
}
/// Return the current database's object storage
pub fn iox_object_store(&self) -> Arc<IoxObjectStore> {
    let store = &self.iox_object_store;
    Arc::clone(store)
}
/// Rolls over the active chunk in the database's specified
/// partition. Returns the previously open (now closed) Chunk if
/// there was any.
///
/// NOTE: this function is only used in tests and can be invoked
/// by the management API. It is not called automatically by the
/// lifecycle manager during normal operation.
pub async fn rollover_partition(
&self,
table_name: &str,
partition_key: &str,
) -> Result<Option<Arc<DbChunk>>> {
// Look up the currently open chunk (if any) under a partition read lock.
let chunk = self
.partition(table_name, partition_key)?
.read()
.open_chunk();
info!(%table_name, %partition_key, found_chunk=chunk.is_some(), "rolling over a partition");
if let Some(chunk) = chunk {
// Freeze the chunk so it stops taking writes; subsequent writes to
// this partition will go to a fresh open chunk.
let mut chunk = chunk.write();
chunk.freeze().context(FreezingChunk)?;
// Return a read-only snapshot of the now-closed chunk.
Ok(Some(DbChunk::snapshot(&chunk)))
} else {
Ok(None)
}
}
/// Looks up the partition for `(table_name, partition_key)` in the catalog.
pub fn partition(
    &self,
    table_name: &str,
    partition_key: &str,
) -> catalog::Result<Arc<tracker::RwLock<Partition>>> {
    self.catalog
        .partition(table_name, partition_key)
        .map(|partition| Arc::clone(&partition))
}
/// Looks up the chunk with the given ID within the given table/partition.
///
/// Returns the chunk together with its [`ChunkOrder`].
pub fn chunk(
&self,
table_name: &str,
partition_key: &str,
chunk_id: ChunkId,
) -> catalog::Result<(Arc<tracker::RwLock<CatalogChunk>>, ChunkOrder)> {
self.catalog.chunk(table_name, partition_key, chunk_id)
}
/// Wraps the addressed chunk in a [`LockableCatalogChunk`] so that
/// lifecycle actions can be applied to it.
pub fn lockable_chunk(
    self: &Arc<Self>,
    table_name: &str,
    partition_key: &str,
    chunk_id: ChunkId,
) -> catalog::Result<LockableCatalogChunk> {
    let (chunk, order) = self.chunk(table_name, partition_key, chunk_id)?;
    let db = Arc::clone(self);
    Ok(LockableCatalogChunk {
        db,
        chunk,
        id: chunk_id,
        order,
    })
}
/// Wraps the addressed partition in a [`LockableCatalogPartition`] so that
/// lifecycle actions can be applied to it.
pub fn lockable_partition(
    self: &Arc<Self>,
    table_name: &str,
    partition_key: &str,
) -> catalog::Result<LockableCatalogPartition> {
    self.partition(table_name, partition_key)
        .map(|partition| LockableCatalogPartition::new(Arc::clone(self), partition))
}
/// Drops the specified chunk from the catalog and all storage systems
pub async fn drop_chunk(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
chunk_id: ChunkId,
) -> Result<()> {
// Use explicit scope to ensure the async generator doesn't
// assume the locks have to possibly live across the `await`
let fut = {
let partition = self.lockable_partition(table_name, partition_key)?;
// Do lock dance to get a write lock on the partition as well
// as on the to-be-dropped chunk.
let partition = partition.read();
// Verify the chunk exists while holding the partition read lock; the
// `?` converts `ChunkNotFound` through the auto-converting
// `CatalogError` variant.
LockablePartition::chunk(&partition, chunk_id).ok_or(
catalog::Error::ChunkNotFound {
chunk_id,
partition: partition_key.to_string(),
table: table_name.to_string(),
},
)?;
let chunk = self.lockable_chunk(table_name, partition_key, chunk_id)?;
// Upgrade the partition lock to a write lock and take the chunk's write
// lock before handing both to the lifecycle action.
let partition = partition.upgrade();
let (_, fut) =
lifecycle::drop_chunk(partition, chunk.write()).context(LifecycleError)?;
fut
};
// Await the abortable background future; cancellation surfaces as `TaskCancelled`.
fut.await.context(TaskCancelled)?.context(LifecycleError)
}
/// Drops the specified partition from the catalog and all storage systems
pub async fn drop_partition(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
) -> Result<()> {
// Use explicit scope to ensure the async generator doesn't
// assume the locks have to possibly live across the `await`
let fut = {
let partition = self.lockable_partition(table_name, partition_key)?;
// The drop action needs exclusive access to the whole partition.
let partition = partition.write();
let (_, fut) = lifecycle::drop_partition(partition).context(LifecycleError)?;
fut
};
// Await the abortable background future; cancellation surfaces as `TaskCancelled`.
fut.await.context(TaskCancelled)?.context(LifecycleError)
}
/// Delete data from a table on a specified predicate
///
/// The predicate is attached to every chunk of the table in memory right
/// away; for persisted (or currently-persisting) chunks it is additionally
/// queued in `delete_predicates_mailbox` so the background worker can
/// preserve it in the catalog later.
pub async fn delete(
self: &Arc<Self>,
table_name: &str,
delete_predicate: Arc<DeletePredicate>,
) -> Result<()> {
// collect delete predicates on preserved partitions for a catalog transaction
let mut affected_persisted_chunks = vec![];
// get all partitions of this table
// Note: we need an additional scope here to convince rustc that the future produced by this function is sendable.
{
let table = self
.catalog
.table(table_name)
.context(DeleteFromTable { table_name })?;
let partitions = table.partitions();
for partition in partitions {
let partition = partition.write();
let chunks = partition.chunks();
for chunk in chunks {
// save the delete predicate in the chunk
let mut chunk = chunk.write();
chunk.add_delete_predicate(Arc::clone(&delete_predicate));
// We should only report persisted chunks or chunks that are currently being persisted, because the
// preserved catalog does not care about purely in-mem chunks.
if matches!(chunk.stage(), ChunkStage::Persisted { .. })
|| chunk.is_in_lifecycle(ChunkLifecycleAction::Persisting)
{
affected_persisted_chunks.push(ChunkAddrWithoutDatabase {
table_name: Arc::clone(&chunk.addr().table_name),
partition_key: Arc::clone(&chunk.addr().partition_key),
chunk_id: chunk.addr().chunk_id,
});
}
}
}
}
// Queue the predicate for preservation only if it touches persisted data.
if !affected_persisted_chunks.is_empty() {
let mut guard = self.delete_predicates_mailbox.lock();
guard.push((delete_predicate, affected_persisted_chunks));
}
Ok(())
}
/// Compacts the open chunk to the read buffer
pub async fn compact_open_chunk(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
) -> Result<Option<Arc<DbChunk>>> {
// This is somewhat inefficient as it will acquire write locks on all chunks in the
// partition, however, it is currently only used for tests
self.compact_chunks(table_name, partition_key, |chunk| chunk.stage().is_open())
.await
}
/// Compacts all chunks in a partition to create a new chunk
///
/// This code does not do any checking of the read buffer against
/// memory limits, etc
///
/// This (async) function returns when this process is complete,
/// but the process may take a long time
///
/// Returns a handle to the newly created chunk in the read buffer
pub async fn compact_partition(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
) -> Result<Option<Arc<DbChunk>>> {
self.compact_chunks(table_name, partition_key, |_| true)
.await
}
/// Compacts all chunks within a partition passing a predicate
///
/// There is no lock gap between predicate evaluation and creation of the lifecycle action
pub async fn compact_chunks(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
predicate: impl Fn(&CatalogChunk) -> bool + Send,
) -> Result<Option<Arc<DbChunk>>> {
// Use explicit scope to ensure the async generator doesn't
// assume the locks have to possibly live across the `await`
let fut = {
let partition = self.partition(table_name, partition_key)?;
let partition = LockableCatalogPartition::new(Arc::clone(self), partition);
// Do lock dance to get a write lock on the partition as well
// as on all of the chunks
let partition = partition.read();
// Get a list of all the chunks to compact
let chunks = LockablePartition::chunks(&partition);
let partition = partition.upgrade();
// Write-lock every chunk first and only then evaluate the predicate on
// the already-locked chunks — this is what guarantees the "no lock gap"
// property promised in the doc comment above.
let chunks = chunks
.iter()
.map(|chunk| chunk.write())
.filter(|chunk| predicate(&*chunk))
.collect();
let (_, fut) = lifecycle::compact_chunks(partition, chunks).context(LifecycleError)?;
fut
};
// Await the abortable background future; cancellation surfaces as `TaskCancelled`.
fut.await.context(TaskCancelled)?.context(LifecycleError)
}
/// Persist given partition.
///
/// If `force` is `true` will persist all unpersisted data regardless of arrival time
///
/// Errors if there is nothing to persist at the moment as per the lifecycle rules. If successful it returns the
/// chunk that contains the persisted data.
///
pub async fn persist_partition(
self: &Arc<Self>,
table_name: &str,
partition_key: &str,
force: bool,
) -> Result<Option<Arc<DbChunk>>> {
// Use explicit scope to ensure the async generator doesn't
// assume the locks have to possibly live across the `await`
let fut = {
let partition = self.lockable_partition(table_name, partition_key)?;
let partition = partition.read();
let chunks = LockablePartition::chunks(&partition);
let mut partition = partition.upgrade();
// get flush handle
let flush_handle = partition
.persistence_windows_mut()
.map(|window| match force {
true => window.flush_all_handle(),
false => window.flush_handle(),
})
.flatten()
.context(CannotFlushPartition {
table_name,
partition_key,
})?;
let chunks = match select_persistable_chunks(&chunks, flush_handle.timestamp()) {
Ok(chunks) => chunks,
Err(_) => {
return Err(Error::CannotFlushPartition {
table_name: table_name.to_string(),
partition_key: partition_key.to_string(),
});
}
};
let (_, fut) = lifecycle::persist_chunks(partition, chunks, flush_handle)
.context(LifecycleError)?;
fut
};
fut.await.context(TaskCancelled)?.context(LifecycleError)
}
/// Unload chunk from read buffer but keep it in object store
pub fn unload_read_buffer(
    self: &Arc<Self>,
    table_name: &str,
    partition_key: &str,
    chunk_id: ChunkId,
) -> Result<Arc<DbChunk>> {
    let lockable = self.lockable_chunk(table_name, partition_key, chunk_id)?;
    let write_guard = lockable.write();
    lifecycle::unload_read_buffer_chunk(write_guard).context(LifecycleError)
}
/// Return chunk summary information for all chunks in the specified
/// partition across all storage systems
pub fn partition_chunk_summaries(&self, partition_key: &str) -> Vec<ChunkSummary> {
    self.catalog.filtered_chunks(
        TableNameFilter::AllTables,
        Some(partition_key),
        CatalogChunk::summary,
    )
}
/// Return Summary information for all columns in all chunks in the
/// partition across all storage systems
pub fn partition_summary(
    &self,
    table_name: &str,
    partition_key: &str,
) -> Option<PartitionSummary> {
    let partition = self.catalog.partition(table_name, partition_key).ok()?;
    partition.read().summary()
}
/// Return table summary information for the given chunk in the specified
/// partition
pub fn table_summary(
    &self,
    table_name: &str,
    partition_key: &str,
    chunk_id: ChunkId,
) -> Option<Arc<TableSummary>> {
    match self.chunk(table_name, partition_key, chunk_id) {
        Ok((chunk, _order)) => Some(chunk.read().table_summary()),
        Err(_) => None,
    }
}
/// Returns the number of iterations of the background worker lifecycle loop
pub fn worker_iterations_lifecycle(&self) -> usize {
    let counter = &self.worker_iterations_lifecycle;
    counter.load(Ordering::Relaxed)
}
/// Returns the number of iterations of the background worker object store
/// cleanup loop (the original comment said "lifecycle loop", which was a
/// copy-paste of the method above).
pub fn worker_iterations_cleanup(&self) -> usize {
self.worker_iterations_cleanup.load(Ordering::Relaxed)
}
/// Returns the number of iterations of the background worker delete predicate preservation loop
pub fn worker_iterations_delete_predicate_preservation(&self) -> usize {
    let counter = &self.worker_iterations_delete_predicate_preservation;
    counter.load(Ordering::Relaxed)
}
/// Perform sequencer-driven replay for this DB.
///
/// When `replay_plan` is `None` then no real replay will be performed. Instead the write buffer streams will be set
/// to the current high watermark and normal playback will continue from there.
pub async fn perform_replay(
    &self,
    replay_plan: Option<&ReplayPlan>,
    consumer: &mut dyn WriteBufferReading,
) -> Result<()> {
    use crate::db::replay::{perform_replay, seek_to_end};
    match replay_plan {
        // Real replay according to the plan.
        Some(plan) => perform_replay(self, plan, consumer)
            .await
            .context(ReplayError),
        // No plan: fast-forward the streams to the high watermark.
        None => seek_to_end(self, consumer).await.context(ReplayError),
    }
}
/// Background worker function
///
/// Runs three independent loops (lifecycle, object store cleanup, delete
/// predicate persistence) concurrently until any of them exits or the
/// `shutdown` token fires.
pub async fn background_worker(
self: &Arc<Self>,
shutdown: tokio_util::sync::CancellationToken,
) {
info!("started background worker");
// Loop that drives the lifecycle for this database
let lifecycle_loop = async {
loop {
self.worker_iterations_lifecycle
.fetch_add(1, Ordering::Relaxed);
// Hold the policy mutex only while computing the future; the
// returned future is awaited after the guard has been dropped.
let fut = {
let mut guard = self.lifecycle_policy.lock();
let policy = guard
.as_mut()
.expect("lifecycle policy should be initialized");
policy.check_for_work(self.time_provider.now().date_time())
};
fut.await
}
};
// object store cleanup loop
let object_store_cleanup_loop = async {
loop {
self.worker_iterations_cleanup
.fetch_add(1, Ordering::Relaxed);
// read relevant parts of the db rules
let (avg_sleep_secs, catalog_transaction_prune_age) = {
let guard = self.rules.read();
let avg_sleep_secs = guard.worker_cleanup_avg_sleep.as_secs_f32().max(1.0);
let catalog_transaction_prune_age =
guard.lifecycle_rules.catalog_transaction_prune_age;
(avg_sleep_secs, catalog_transaction_prune_age)
};
// Sleep for a duration drawn from a poisson distribution to de-correlate workers.
// Perform this sleep BEFORE the actual clean-up so that we don't immediately run a clean-up
// on startup.
let dist =
Poisson::new(avg_sleep_secs).expect("parameter should be positive and finite");
let duration = Duration::from_secs_f32(dist.sample(&mut rand::thread_rng()));
debug!(?duration, "cleanup worker sleeps");
tokio::time::sleep(duration).await;
// Cleanup failures are logged but never abort the loop.
if let Err(e) = prune_catalog_transaction_history(
self.iox_object_store(),
self.time_provider.now() - catalog_transaction_prune_age,
)
.await
{
error!(%e, "error while pruning catalog transactions");
}
if let Err(e) = self.cleanup_unreferenced_parquet_files().await {
error!(%e, "error while cleaning unreferenced parquet files");
}
}
};
// worker loop to persist delete predicates
let delete_predicate_persistence_loop = async {
loop {
// Snapshot the mailbox so the mutex is not held across the await below.
let todo: Vec<_> = {
let guard = self.delete_predicates_mailbox.lock();
guard.clone()
};
if !todo.is_empty() {
match self.preserve_delete_predicates(&todo).await {
Ok(()) => {
let mut guard = self.delete_predicates_mailbox.lock();
// TODO: we could also run a de-duplication here once
// https://github.com/influxdata/influxdb_iox/issues/2626 is implemented
// Drain only the entries that were just preserved; anything
// appended concurrently stays queued for the next iteration.
guard.drain(0..todo.len());
}
Err(e) => {
error!(%e, "cannot preserve delete predicates");
}
}
}
self.worker_iterations_delete_predicate_preservation
.fetch_add(1, Ordering::Relaxed);
tokio::time::sleep(Duration::from_secs(1)).await;
}
};
// None of the futures need to perform drain logic on shutdown.
// When the first one finishes, all of them are dropped
tokio::select! {
_ = lifecycle_loop => error!("lifecycle loop exited - db worker bailing out"),
_ = object_store_cleanup_loop => error!("object store cleanup loop exited - db worker bailing out"),
_ = delete_predicate_persistence_loop => error!("delete predicate persistence loop exited - db worker bailing out"),
_ = shutdown.cancelled() => info!("db worker shutting down"),
}
info!("finished db background worker");
}
/// Deletes parquet files in object storage that are no longer referenced by
/// the preserved catalog.
async fn cleanup_unreferenced_parquet_files(
self: &Arc<Self>,
) -> std::result::Result<(), parquet_file::catalog::cleanup::Error> {
// Take the exclusive side of the cleanup lock: this blocks concurrent
// parquet-file creation / catalog transactions (which hold the shared
// side) while the set of unreferenced files is determined.
let guard = self.cleanup_lock.write().await;
let files =
get_unreferenced_parquet_files(&self.name(), &self.preserved_catalog, 1_000).await?;
// NOTE(review): the guard is released before the (potentially slow)
// deletion — presumably safe because the listed files are already
// unreferenced and cannot become referenced again; confirm against the
// cleanup-lock contract documented on the `Db` struct.
drop(guard);
delete_parquet_files(&self.preserved_catalog, &files).await
}
/// Writes the given delete predicates into a single preserved-catalog
/// transaction.
///
/// After the commit, a catalog checkpoint is additionally created whenever
/// the revision counter is a multiple of
/// `catalog_transactions_until_checkpoint`; checkpoint failure is only
/// logged because the predicates themselves are already preserved.
async fn preserve_delete_predicates(
self: &Arc<Self>,
predicates: &[(Arc<DeletePredicate>, Vec<ChunkAddrWithoutDatabase>)],
) -> Result<(), parquet_file::catalog::core::Error> {
let mut transaction = self.preserved_catalog.open_transaction().await;
for (predicate, chunks) in predicates {
transaction.delete_predicate(predicate, chunks);
}
let ckpt_handle = transaction.commit().await?;
let catalog_transactions_until_checkpoint = self
.rules
.read()
.lifecycle_rules
.catalog_transactions_until_checkpoint
.get();
let create_checkpoint =
ckpt_handle.revision_counter() % catalog_transactions_until_checkpoint == 0;
if create_checkpoint {
// Commit is already done, so we can just scan the catalog for the state.
//
// NOTE: There can only be a single transaction in this section because the checkpoint handle holds
// transaction lock. Therefore we don't need to worry about concurrent modifications of
// preserved chunks.
if let Err(e) = ckpt_handle
.create_checkpoint(checkpoint_data_from_catalog(&self.catalog))
.await
{
warn!(%e, "cannot create catalog checkpoint");
// That's somewhat OK. Don't fail the entire task, because the actual preservation was completed
// (both in-mem and within the preserved catalog).
}
}
Ok(())
}
/// Stores an entry based on the configuration.
///
/// Routing depends on whether a write buffer producer is configured and on
/// the `immutable` lifecycle rule; see the individual match arms below.
pub async fn store_entry(&self, entry: Entry) -> Result<()> {
let immutable = {
let rules = self.rules.read();
rules.lifecycle_rules.immutable
};
debug!(%immutable, has_write_buffer_producer=self.write_buffer_producer.is_some(), "storing entry");
match (self.write_buffer_producer.as_ref(), immutable) {
(Some(write_buffer), true) => {
// If only the write buffer is configured, this is passing the data through to
// the write buffer, and it's not an error. We ignore the returned metadata; it
// will get picked up when data is read from the write buffer.
// TODO: be smarter than always using sequencer 0
let _ = write_buffer
.store_entry(&entry, 0)
.await
.context(WriteBufferWritingError)?;
Ok(())
}
(Some(write_buffer), false) => {
// If using both write buffer and mutable buffer, we want to wait for the write
// buffer to return success before adding the entry to the mutable buffer.
// TODO: be smarter than always using sequencer 0
let (sequence, producer_wallclock_timestamp) = write_buffer
.store_entry(&entry, 0)
.await
.context(WriteBufferWritingError)?;
let sequenced_entry = Arc::new(SequencedEntry::new_from_sequence(
sequence,
producer_wallclock_timestamp,
entry,
));
self.store_sequenced_entry(sequenced_entry, filter_table_batch_keep_all)
}
// Only `(None, true)` can reach this arm — `(Some(_), true)` is matched above.
(_, true) => {
// If not configured to send entries to the write buffer and the database is
// immutable, trying to store an entry is an error and we don't need to build a
// `SequencedEntry`.
DatabaseNotWriteable {}.fail()
}
(None, false) => {
// If no write buffer is configured, nothing is
// sequencing entries so skip doing so here
let sequenced_entry = Arc::new(SequencedEntry::new_unsequenced(entry));
self.store_sequenced_entry(sequenced_entry, filter_table_batch_keep_all)
}
}
}
/// Given a `SequencedEntry`, if the mutable buffer is configured, the `SequencedEntry` is then
/// written into the mutable buffer.
///
/// # Filtering
/// `filter_table_batch` can be used to filter out table batches. It gets:
///
/// 1. the current sequence
/// 2. the partition key
/// 3. the table batch (which also contains the table name)
///
/// It shall return `(true, _)` if the batch should be stored and `(false, _)` otherwise. In the first case the
/// second element in the tuple is a row-wise mask. If it is provided only rows marked with `true` are stored.
///
/// # Errors
/// Fails fast with `DatabaseNotWriteable` / `HardLimitReached` before anything is written.
/// Per-batch failures are collected (capped at `MAX_ERRORS_PER_SEQUENCED_ENTRY`) and reported
/// together as `StoreSequencedEntryFailures`, so one bad batch does not prevent later batches
/// in the same entry from being stored.
pub fn store_sequenced_entry<F>(
    &self,
    sequenced_entry: Arc<SequencedEntry>,
    filter_table_batch: F,
) -> Result<()>
where
    F: Fn(Option<&Sequence>, &str, &TableBatch<'_>) -> (bool, Option<Vec<bool>>),
{
    // Get all needed database rule values, then release the lock
    let rules = self.rules.read();
    let immutable = rules.lifecycle_rules.immutable;
    let buffer_size_hard = rules.lifecycle_rules.buffer_size_hard;
    let late_arrival_window = rules.lifecycle_rules.late_arrive_window();
    let mub_row_threshold = rules.lifecycle_rules.mub_row_threshold;
    std::mem::drop(rules);
    // We may have gotten here through `store_entry`, in which case this is checking the
    // configuration again unnecessarily, but we may have come here by consuming records from
    // the write buffer, so this check is necessary in that case.
    if immutable {
        return DatabaseNotWriteable {}.fail();
    }
    // Reject the write outright if the catalog already exceeds the hard memory limit.
    if let Some(hard_limit) = buffer_size_hard {
        if self.catalog.metrics().memory().total() > hard_limit.get() {
            return HardLimitReached {}.fail();
        }
    }
    // Note: as `time_of_write` is taken before any synchronisation writes may arrive to a chunk
    // out of order w.r.t this timestamp. As DateTime<Utc> isn't monotonic anyway
    // this isn't an issue
    if let Some(partitioned_writes) = sequenced_entry.partition_writes() {
        let sequence = sequenced_entry.as_ref().sequence();
        // Protect against DoS by limiting the number of errors we might collect
        const MAX_ERRORS_PER_SEQUENCED_ENTRY: usize = 10;
        let mut errors = Vec::with_capacity(MAX_ERRORS_PER_SEQUENCED_ENTRY);
        for write in partitioned_writes {
            let partition_key = write.key();
            for table_batch in write.table_batches() {
                // Skip completely empty batches.
                let row_count = match NonZeroUsize::new(table_batch.row_count()) {
                    Some(row_count) => row_count,
                    None => continue,
                };
                // Let the caller-provided filter drop the batch entirely or mask out rows.
                let (store_batch, mask) =
                    filter_table_batch(sequence, partition_key, &table_batch);
                if !store_batch {
                    continue;
                }
                let (partition, table_schema) = self
                    .catalog
                    .get_or_create_partition(table_batch.name(), partition_key);
                // Extract the batch schema; on failure record the error and continue with
                // the next batch instead of aborting the whole entry.
                let batch_schema =
                    match table_batch.schema().context(TableBatchSchemaExtractError) {
                        Ok(batch_schema) => batch_schema,
                        Err(e) => {
                            if errors.len() < MAX_ERRORS_PER_SEQUENCED_ENTRY {
                                errors.push(e);
                            }
                            continue;
                        }
                    };
                // Validate the batch schema merges with the table schema; the merge is only
                // committed (below) once the actual write has succeeded.
                let schema_handle =
                    match TableSchemaUpsertHandle::new(&table_schema, &batch_schema)
                        .context(TableBatchSchemaMergeError)
                    {
                        Ok(schema_handle) => schema_handle,
                        Err(e) => {
                            if errors.len() < MAX_ERRORS_PER_SEQUENCED_ENTRY {
                                errors.push(e);
                            }
                            continue;
                        }
                    };
                let timestamp_summary = table_batch
                    .timestamp_summary()
                    .context(TableBatchTimeError)?;
                // At this point this should not be possible
                ensure!(
                    timestamp_summary.stats.total_count == row_count.get() as u64,
                    TableBatchMissingTimes {}
                );
                let mut partition = partition.write();
                // After a successful write: record write statistics on the chunk and freeze
                // it once it reaches the configured MUB row threshold.
                let handle_chunk_write = |chunk: &mut CatalogChunk| {
                    chunk.record_write(&timestamp_summary);
                    if chunk.storage().0 >= mub_row_threshold.get() {
                        chunk.freeze().expect("freeze mub chunk");
                    }
                };
                match partition.open_chunk() {
                    Some(chunk) => {
                        // Append the batch to the partition's currently open MUB chunk.
                        let mut chunk = chunk.write();
                        let chunk_id = chunk.id();
                        let mb_chunk =
                            chunk.mutable_buffer().expect("cannot mutate open chunk");
                        if let Err(e) = mb_chunk
                            .write_table_batch(table_batch, mask.as_ref().map(|x| x.as_ref()))
                            .context(WriteEntry {
                                partition_key,
                                chunk_id,
                            })
                        {
                            if errors.len() < MAX_ERRORS_PER_SEQUENCED_ENTRY {
                                errors.push(e);
                            }
                            continue;
                        };
                        handle_chunk_write(&mut *chunk)
                    }
                    None => {
                        // No open chunk: create a fresh MUB chunk seeded with this batch.
                        let chunk_result = MBChunk::new(
                            MutableBufferChunkMetrics::new(self.metric_registry.as_ref()),
                            table_batch,
                            mask.as_ref().map(|x| x.as_ref()),
                        )
                        .context(WriteEntryInitial { partition_key });
                        match chunk_result {
                            Ok(mb_chunk) => {
                                let chunk = partition.create_open_chunk(mb_chunk);
                                let mut chunk = chunk
                                    .try_write()
                                    .expect("partition lock should prevent contention");
                                handle_chunk_write(&mut *chunk)
                            }
                            Err(e) => {
                                if errors.len() < MAX_ERRORS_PER_SEQUENCED_ENTRY {
                                    errors.push(e);
                                }
                                continue;
                            }
                        }
                    }
                };
                partition.update_last_write_at();
                // The write succeeded, so commit the (possibly extended) table schema.
                schema_handle.commit();
                // TODO: PersistenceWindows use TimestampSummary
                let min_time = Time::from_timestamp_nanos(timestamp_summary.stats.min.unwrap());
                let max_time = Time::from_timestamp_nanos(timestamp_summary.stats.max.unwrap());
                // Track the written time range in the partition's persistence windows,
                // creating them lazily on the first write to this partition.
                match partition.persistence_windows_mut() {
                    Some(windows) => {
                        windows.add_range(sequence, row_count, min_time, max_time);
                    }
                    None => {
                        let mut windows = PersistenceWindows::new(
                            partition.addr().clone(),
                            late_arrival_window,
                            Arc::clone(&self.time_provider),
                        );
                        windows.add_range(sequence, row_count, min_time, max_time);
                        partition.set_persistence_windows(windows);
                    }
                }
            }
        }
        ensure!(errors.is_empty(), StoreSequencedEntryFailures { errors });
    }
    Ok(())
}
}
/// A table-batch filter for [`store_sequenced_entry`](Db::store_sequenced_entry) that accepts
/// every batch (`true`) and applies no row mask (`None`), i.e. all rows are stored.
pub fn filter_table_batch_keep_all(
    _sequence: Option<&Sequence>,
    _partition_key: &str,
    _batch: &TableBatch<'_>,
) -> (bool, Option<Vec<bool>>) {
    (true, None)
}
#[async_trait]
/// Convenience implementation of `Database` so the rest of the code
/// can just use Db as a `Database` even though the implementation
/// lives in `catalog_access`
impl QueryDatabase for Db {
    type Error = Error;
    type Chunk = DbChunk;

    /// Return the chunks that may contain data matching `predicate`.
    fn chunks(&self, predicate: &Predicate) -> Vec<Arc<Self::Chunk>> {
        self.catalog_access.chunks(predicate)
    }

    /// Return the partition keys known to this database.
    fn partition_keys(&self) -> Result<Vec<String>, Self::Error> {
        self.catalog_access.partition_keys()
    }

    /// Return summaries for all chunks currently in the catalog.
    fn chunk_summaries(&self) -> Result<Vec<ChunkSummary>> {
        self.catalog_access.chunk_summaries()
    }

    /// Return the schema for `table_name`, if that table exists.
    fn table_schema(&self, table_name: &str) -> Option<Arc<Schema>> {
        self.catalog_access.table_schema(table_name)
    }
}
/// Provide per-query execution contexts whose default catalog is this database.
impl ExecutionContextProvider for Db {
    fn new_query_context(self: &Arc<Self>, span_ctx: Option<SpanContext>) -> IOxExecutionContext {
        // Register this `Db` as the default catalog so queries resolve its tables.
        let default_catalog = Arc::<Self>::clone(self);
        let config = self
            .exec
            .new_execution_config(ExecutorType::Query)
            .with_default_catalog(default_catalog)
            .with_span_context(span_ctx);
        config.build()
    }
}
/// Convenience implementation of `CatalogProvider` so the rest of the
/// code can use Db as a `CatalogProvider` (e.g. for running
/// SQL), even though the implementation lives in `catalog_access`.
impl CatalogProvider for Db {
    /// Allow callers to downcast back to the concrete `Db` type.
    fn as_any(&self) -> &dyn Any {
        self as &dyn Any
    }

    /// Delegate schema-name listing to `catalog_access`.
    fn schema_names(&self) -> Vec<String> {
        self.catalog_access.schema_names()
    }

    /// Delegate schema lookup to `catalog_access`.
    fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
        self.catalog_access.schema(name)
    }
}
/// Assemble [`CheckpointData`] (parquet files and delete predicates) for the preserved
/// catalog from the current in-memory catalog state.
pub(crate) fn checkpoint_data_from_catalog(catalog: &Catalog) -> CheckpointData {
    let mut files = HashMap::new();
    let mut delete_predicates: HashMap<Arc<DeletePredicate>, HashSet<ChunkAddrWithoutDatabase>> =
        Default::default();

    for chunk in catalog.chunks() {
        let chunk = chunk.read();

        // Record the parquet file backing every persisted chunk.
        if let ChunkStage::Persisted { parquet, .. } = chunk.stage() {
            let path = parquet.path().clone();
            let info = CatalogParquetInfo {
                path: path.clone(),
                file_size_bytes: parquet.file_size_bytes(),
                metadata: parquet.parquet_metadata(),
            };
            files.insert(path, info);
        }

        // Delete predicates are only relevant for chunks that are persisted or currently
        // being persisted; the preserved catalog does not track purely in-memory chunks.
        let is_relevant = matches!(chunk.stage(), ChunkStage::Persisted { .. })
            || chunk.is_in_lifecycle(ChunkLifecycleAction::Persisting);
        if is_relevant {
            for predicate in chunk.delete_predicates() {
                delete_predicates
                    .entry(Arc::clone(predicate))
                    .or_default()
                    .insert(chunk.addr().clone().into());
            }
        }
    }

    CheckpointData {
        files,
        delete_predicates,
    }
}
/// Helpers shared by `Db` tests: write line protocol and run SQL queries.
pub mod test_helpers {
    use std::collections::HashSet;
    use arrow::record_batch::RecordBatch;
    use entry::test_helpers::lp_to_entries;
    use query::frontend::sql::SqlQueryPlanner;
    use super::*;

    /// Try to write lineprotocol data and return all tables that were written.
    pub async fn try_write_lp(db: &Db, lp: &str) -> Result<Vec<String>> {
        // Convert the line protocol into entries using the database's partition template.
        let entries = {
            let partitioner = &db.rules.read().partition_template;
            lp_to_entries(lp, partitioner)
        };
        let mut tables = HashSet::new();
        for entry in entries {
            if let Some(writes) = entry.partition_writes() {
                // Collect the table names this entry touches ...
                for write in writes {
                    for batch in write.table_batches() {
                        tables.insert(batch.name().to_string());
                    }
                }
                // ... then store the entry itself.
                db.store_entry(entry).await?;
            }
        }
        // Return the touched tables in a deterministic (sorted) order.
        let mut tables: Vec<_> = tables.into_iter().collect();
        tables.sort();
        Ok(tables)
    }

    /// Same as [`try_write_lp`](try_write_lp) but will panic on failure.
    pub async fn write_lp(db: &Db, lp: &str) -> Vec<String> {
        try_write_lp(db, lp).await.unwrap()
    }

    /// Convenience macro to test if an [`db::Error`](crate::db::Error) is a
    /// [StoreSequencedEntryFailures](crate::db::Error::StoreSequencedEntryFailures) and then check for errors contained
    /// in it.
    #[macro_export]
    macro_rules! assert_store_sequenced_entry_failures {
        ($e:expr, [$($sub:pat),*]) => {
            {
                // bind $e to variable so we don't evaluate it twice
                let e = $e;
                if let $crate::db::Error::StoreSequencedEntryFailures{errors} = e {
                    assert!(matches!(&errors[..], [$($sub),*]));
                } else {
                    panic!("Expected StoreSequencedEntryFailures but got {}", e);
                }
            }
        };
    }

    /// Run a sql query against the database, returning the results as record batches.
    pub async fn run_query(db: Arc<Db>, query: &str) -> Vec<RecordBatch> {
        let planner = SqlQueryPlanner::default();
        let ctx = db.new_query_context(None);
        let physical_plan = planner.query(query, &ctx).await.unwrap();
        ctx.collect(physical_plan).await.unwrap()
    }
}
#[cfg(test)]
mod tests {
use std::{
convert::TryFrom,
iter::Iterator,
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
ops::Deref,
str,
time::{Duration, Instant},
};
use arrow::record_batch::RecordBatch;
use bytes::Bytes;
use futures::{stream, StreamExt, TryStreamExt};
use predicate::delete_expr::DeleteExpr;
use tokio_util::sync::CancellationToken;
use ::test_helpers::{assert_contains, maybe_start_logging};
use arrow_util::{assert_batches_eq, assert_batches_sorted_eq};
use data_types::{
chunk_metadata::{ChunkAddr, ChunkStorage},
database_rules::{LifecycleRules, PartitionTemplate, TemplatePart},
partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary},
timestamp::TimestampRange,
write_summary::TimestampSummary,
};
use entry::test_helpers::lp_to_entry;
use iox_object_store::ParquetFilePath;
use metric::{Attributes, CumulativeGauge, Metric, Observation};
use object_store::ObjectStore;
use parquet_file::{
catalog::test_helpers::load_ok,
metadata::IoxParquetMetaData,
test_utils::{load_parquet_from_store_for_path, read_data_from_parquet_data},
};
use persistence_windows::min_max_sequence::MinMaxSequence;
use query::{QueryChunk, QueryDatabase};
use schema::selection::Selection;
use schema::Schema;
use time::Time;
use write_buffer::mock::{
MockBufferForWriting, MockBufferForWritingThatAlwaysErrors, MockBufferSharedState,
};
use crate::utils::make_db_time;
use crate::{
assert_store_sequenced_entry_failures,
db::{
catalog::chunk::ChunkStage,
test_helpers::{run_query, try_write_lp, write_lp},
},
utils::{make_db, TestDb},
};
use super::*;
type TestError = Box<dyn std::error::Error + Send + Sync + 'static>;
type Result<T, E = TestError> = std::result::Result<T, E>;
/// Build a test database whose lifecycle rules mark it immutable, so direct writes
/// are rejected.
async fn immutable_db() -> Arc<Db> {
    let rules = LifecycleRules {
        immutable: true,
        ..Default::default()
    };
    TestDb::builder().lifecycle_rules(rules).build().await.db
}
#[tokio::test]
async fn write_no_mutable_buffer() {
    // Writes must be rejected when the database has no mutable buffer.
    let db = immutable_db().await;

    let entry = lp_to_entry("cpu bar=1 10");
    let result = db.store_entry(entry).await;

    let message = result.unwrap_err().to_string();
    assert_contains!(
        message,
        "Cannot write to this database: no mutable buffer configured"
    );
}
#[tokio::test]
async fn write_with_write_buffer_no_mutable_buffer() {
    // With a write buffer configured but no mutable buffer, writes must be forwarded to
    // the write buffer rather than rejected.
    let shared_state =
        MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap());
    let time_provider = Arc::new(time::MockProvider::new(Time::from_timestamp_nanos(0)));
    let producer =
        Arc::new(MockBufferForWriting::new(shared_state.clone(), None, time_provider).unwrap());

    let lifecycle_rules = LifecycleRules {
        immutable: true,
        ..Default::default()
    };
    let db = TestDb::builder()
        .write_buffer_producer(producer)
        .lifecycle_rules(lifecycle_rules)
        .build()
        .await
        .db;

    db.store_entry(lp_to_entry("cpu bar=1 10")).await.unwrap();

    // Exactly one message must have reached sequencer 0 of the write buffer.
    assert_eq!(shared_state.get_messages(0).len(), 1);
}
#[tokio::test]
async fn write_to_write_buffer_and_mutable_buffer() {
    // When both a write buffer and a mutable buffer are configured, a write must land
    // in both.
    let shared_state =
        MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap());
    let time_provider = Arc::new(time::MockProvider::new(Time::from_timestamp_nanos(0)));
    let producer =
        Arc::new(MockBufferForWriting::new(shared_state.clone(), None, time_provider).unwrap());
    let db = TestDb::builder()
        .write_buffer_producer(producer)
        .build()
        .await
        .db;

    db.store_entry(lp_to_entry("cpu bar=1 10")).await.unwrap();

    // One message reached the write buffer ...
    assert_eq!(shared_state.get_messages(0).len(), 1);

    // ... and the row is queryable from the mutable buffer.
    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(db, "select * from cpu").await;
    assert_batches_eq!(expected, &batches);
}
#[tokio::test]
async fn write_buffer_errors_propagated() {
let write_buffer = Arc::new(MockBufferForWritingThatAlwaysErrors {});
let db = TestDb::builder()
.write_buffer_producer(write_buffer)
.build()
.await
.db;
let entry = lp_to_entry("cpu bar=1 10");
let res = db.store_entry(entry).await;
assert!(
matches!(res, Err(Error::WriteBufferWritingError { .. })),
"Expected Err(Error::WriteBufferWritingError {{ .. }}), got: {:?}",
res
);
}
#[tokio::test]
async fn cant_write_when_reading_from_write_buffer() {
    // A database that consumes from the write buffer is immutable to direct writes.
    let db = immutable_db().await;

    let error = db
        .store_entry(lp_to_entry("cpu bar=1 10"))
        .await
        .unwrap_err();
    assert_contains!(
        error.to_string(),
        "Cannot write to this database: no mutable buffer configured"
    );
}
#[tokio::test]
async fn read_write() {
    // Round-trip a single point; this also exercises the path without a write buffer.
    let db = make_db().await.db;
    write_lp(&db, "cpu bar=1 10").await;

    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(db, "select * from cpu").await;
    assert_batches_eq!(expected, &batches);
}
#[tokio::test]
async fn try_all_partition_writes_when_some_fail() {
    // Verifies that a schema error in one table batch does not abort the remaining
    // batches of the same entry: the good batches are stored and the error is reported.
    let db = make_db().await.db;

    let nanoseconds_per_hour = 60 * 60 * 1_000_000_000u64;

    // 3 lines that will go into 3 hour partitions and start new chunks.
    let lp = format!(
        "foo,t1=alpha iv=1i {}
foo,t1=bravo iv=1i {}
foo,t1=charlie iv=1i {}",
        0,
        nanoseconds_per_hour,
        nanoseconds_per_hour * 2,
    );
    let entry = lp_to_entry(&lp);

    // This should succeed and start chunks in the MUB
    db.store_entry(entry).await.unwrap();

    // 3 more lines that should go in the 3 partitions/chunks.
    // Line 1 has the same schema and should end up in the MUB.
    // Line 2 has a different schema than line 1 and should error
    // Line 3 has the same schema as line 1 and should end up in the MUB.
    let lp = format!(
        "foo,t1=delta iv=1i {}
foo t1=10i {}
foo,t1=important iv=1i {}",
        1,
        nanoseconds_per_hour + 1,
        nanoseconds_per_hour * 2 + 1,
    );
    let entry = lp_to_entry(&lp);

    // This should return an error because there was at least one error in the loop
    let result = db.store_entry(entry).await;
    assert_contains!(
        result.unwrap_err().to_string(),
        "Storing sequenced entry failed with the following error(s), and possibly more:"
    );

    // But 5 points should be returned, most importantly the last one after the line with
    // the mismatched schema
    let batches = run_query(db, "select t1 from foo").await;
    let expected = vec![
        "+-----------+",
        "| t1 |",
        "+-----------+",
        "| alpha |",
        "| bravo |",
        "| charlie |",
        "| delta |",
        "| important |",
        "+-----------+",
    ];
    assert_batches_sorted_eq!(expected, &batches);
}
/// Assert that the `catalog_chunks_mem_usage_bytes` gauge for the given storage
/// `location` currently reports `expected` bytes.
fn catalog_chunk_size_bytes_metric_eq(
    registry: &metric::Registry,
    location: &'static str,
    expected: u64,
) {
    let attributes = Attributes::from(&[("db_name", "placeholder"), ("location", location)]);
    let observed = registry
        .get_instrument::<Metric<CumulativeGauge>>("catalog_chunks_mem_usage_bytes")
        .unwrap()
        .get_observer(&attributes)
        .unwrap()
        .fetch();
    assert_eq!(observed, expected)
}
/// Assert that gauge `name` for storage `location` (always table `cpu`) reports
/// `expected`.
fn assert_storage_gauge(
    registry: &metric::Registry,
    name: &'static str,
    location: &'static str,
    expected: u64,
) {
    let attributes = Attributes::from(&[
        ("db_name", "placeholder"),
        ("location", location),
        ("table", "cpu"),
    ]);
    let observed = registry
        .get_instrument::<Metric<CumulativeGauge>>(name)
        .unwrap()
        .get_observer(&attributes)
        .unwrap()
        .fetch();
    assert_eq!(observed, expected)
}
#[tokio::test]
async fn metrics_during_rollover() {
    // Walks a chunk through its full lifecycle (MUB open -> closed -> read buffer ->
    // object store -> RB unloaded) and asserts the storage gauges and memory-usage
    // metrics at every stage. The exact byte sizes asserted below are implementation
    // details of the chunk encodings and will change if those encodings change.
    let time = Arc::new(time::MockProvider::new(Time::from_timestamp(11, 22)));
    let test_db = TestDb::builder()
        .time_provider(Arc::<time::MockProvider>::clone(&time))
        .build()
        .await;

    let db = Arc::clone(&test_db.db);
    write_lp(db.as_ref(), "cpu bar=1 10").await;

    let registry = test_db.metric_registry.as_ref();

    // A chunk has been opened
    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);

    // verify chunk size updated
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 700);

    // write into same chunk again.
    time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=2 20").await;
    time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=3 30").await;
    time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=4 40").await;
    time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=5 50").await;

    // verify chunk size updated
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 764);

    // Still only one chunk open
    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 5);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);

    db.rollover_partition("cpu", "1970-01-01T00").await.unwrap();

    // A chunk is now closed
    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 5);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 1295);

    db.compact_partition("cpu", "1970-01-01T00").await.unwrap();

    // A chunk is now in the read buffer
    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 5);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);

    // verify chunk size updated (chunk moved from closing to moving to moved)
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
    let expected_read_buffer_size = 1706;
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", expected_read_buffer_size);

    time.inc(Duration::from_secs(1));
    // Pin the chunk id so the persisted path is deterministic for this test.
    *db.persisted_chunk_id_override.lock() = Some(ChunkId::new_test(1337));
    let chunk_id = db
        .persist_partition("cpu", "1970-01-01T00", true)
        .await
        .unwrap()
        .unwrap()
        .id();

    // A chunk is now in the object store and still in read buffer
    let expected_parquet_size = 1233;
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", expected_read_buffer_size);
    // now also in OS
    catalog_chunk_size_bytes_metric_eq(registry, "object_store", expected_parquet_size);

    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 1);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 1);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 5);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 5);

    db.unload_read_buffer("cpu", "1970-01-01T00", chunk_id)
        .unwrap();

    // A chunk is now now in the "os-only" state.
    assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_chunks", "object_store", 1);
    assert_storage_gauge(registry, "catalog_loaded_rows", "mutable_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "read_buffer", 0);
    assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 5);

    // verify chunk size not increased for OS (it was in OS before unload)
    catalog_chunk_size_bytes_metric_eq(registry, "object_store", expected_parquet_size);
    // verify chunk size for RB has decreased
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 0);
}
#[tokio::test]
async fn write_metrics() {
    // Opt this table into per-row timestamp metrics via the env var the Db reads;
    // NOTE(review): this mutates process-wide state and could leak into other tests
    // running in the same process.
    std::env::set_var("INFLUXDB_IOX_ROW_TIMESTAMP_METRICS", "write_metrics_test");
    let test_db = make_db().await;
    let db = Arc::clone(&test_db.db);

    write_lp(db.as_ref(), "write_metrics_test foo=1 100000000000").await;
    write_lp(db.as_ref(), "write_metrics_test foo=2 180000000000").await;
    write_lp(db.as_ref(), "write_metrics_test foo=3 650000000000").await;
    write_lp(db.as_ref(), "write_metrics_test foo=3 650000000010").await;

    // Build the expected per-minute distribution from the same timestamps.
    let mut summary = TimestampSummary::default();
    summary.record(Time::from_timestamp_nanos(100000000000));
    summary.record(Time::from_timestamp_nanos(180000000000));
    summary.record(Time::from_timestamp_nanos(650000000000));
    summary.record(Time::from_timestamp_nanos(650000000010));

    let mut reporter = metric::RawReporter::default();
    test_db.metric_registry.report(&mut reporter);

    let observation = reporter
        .metric("catalog_row_time")
        .unwrap()
        .observation(&[("db_name", "placeholder"), ("table", "write_metrics_test")])
        .unwrap();

    let histogram = match observation {
        Observation::DurationHistogram(histogram) => histogram,
        _ => unreachable!(),
    };
    assert_eq!(histogram.buckets.len(), 60);

    // Each histogram bucket covers one minute; its count must match the summary's
    // count for that minute.
    for ((minute, count), observation) in
        summary.counts.iter().enumerate().zip(&histogram.buckets)
    {
        let minute = Duration::from_secs((minute * 60) as u64);
        assert_eq!(observation.le, minute);
        assert_eq!(*count as u64, observation.count)
    }
}
#[tokio::test]
async fn write_with_rollover() {
    // Data written before and after a partition rollover must both stay queryable, and
    // a second rollover must produce a chunk with a new id.
    let db = make_db().await.db;
    write_lp(db.as_ref(), "cpu bar=1 10").await;
    assert_eq!(vec!["1970-01-01T00"], db.partition_keys().unwrap());

    let mb_chunk = db
        .rollover_partition("cpu", "1970-01-01T00")
        .await
        .unwrap()
        .unwrap();

    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu").await;
    assert_batches_sorted_eq!(expected, &batches);

    // add new data
    write_lp(db.as_ref(), "cpu bar=2 20").await;
    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "| 2 | 1970-01-01T00:00:00.000000020Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu").await;
    assert_batches_sorted_eq!(&expected, &batches);

    // And expect that we still get the same thing when data is rolled over again
    let chunk = db
        .rollover_partition("cpu", "1970-01-01T00")
        .await
        .unwrap()
        .unwrap();
    // The second rollover must create a distinct chunk.
    assert_ne!(chunk.id(), mb_chunk.id());

    let batches = run_query(db, "select * from cpu").await;
    assert_batches_sorted_eq!(&expected, &batches);
}
#[tokio::test]
async fn write_with_missing_tags_are_null() {
    let db = Arc::new(make_db().await.db);
    // Note the `region` tag is introduced in the second line, so
    // the values in prior rows for the region column are
    // null. Likewise the `core` tag is introduced in the third
    // line so the prior columns are null
    let lp = [
        "cpu,region=west user=23.2 10",
        "cpu, user=10.0 11",
        "cpu,core=one user=10.0 11",
    ]
    .join("\n");
    write_lp(db.as_ref(), &lp).await;

    assert_eq!(vec!["1970-01-01T00"], db.partition_keys().unwrap());

    db.rollover_partition("cpu", "1970-01-01T00")
        .await
        .unwrap()
        .unwrap();

    let expected = vec![
        "+------+--------+--------------------------------+------+",
        "| core | region | time | user |",
        "+------+--------+--------------------------------+------+",
        "| | | 1970-01-01T00:00:00.000000011Z | 10 |",
        "| | west | 1970-01-01T00:00:00.000000010Z | 23.2 |",
        "| one | | 1970-01-01T00:00:00.000000011Z | 10 |",
        "+------+--------+--------------------------------+------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu").await;
    assert_batches_sorted_eq!(expected, &batches);
}
#[tokio::test]
async fn read_from_read_buffer() {
    // Test that data can be loaded into the ReadBuffer
    let test_db = make_db().await;
    let db = Arc::new(test_db.db);

    write_lp(db.as_ref(), "cpu bar=1 10").await;
    write_lp(db.as_ref(), "cpu bar=2 20").await;

    let partition_key = "1970-01-01T00";
    // Close the open MUB chunk ...
    let mb_chunk = db
        .rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();
    // ... and compact it into the read buffer.
    let rb_chunk = db
        .compact_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    // it should be a new chunk
    assert_ne!(mb_chunk.id(), rb_chunk.id());

    // we should have chunks in both the read buffer only
    assert!(mutable_chunk_ids(&db, partition_key).is_empty());
    assert_eq!(read_buffer_chunk_ids(&db, partition_key).len(), 1);

    // data should be readable
    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "| 2 | 1970-01-01T00:00:00.000000020Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu").await;
    assert_batches_eq!(&expected, &batches);

    let registry = test_db.metric_registry.as_ref();

    // A chunk is now in the read buffer
    assert_storage_gauge(registry, "catalog_loaded_chunks", "read_buffer", 1);

    // verify chunk size updated (chunk moved from moved to writing to written)
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 1700);

    // drop, the chunk from the read buffer
    db.drop_chunk("cpu", partition_key, rb_chunk.id())
        .await
        .unwrap();
    assert_eq!(
        read_buffer_chunk_ids(db.as_ref(), partition_key),
        vec![] as Vec<ChunkId>
    );

    // verify size is not accounted even though a reference to the RubChunk still exists
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 0);
    std::mem::drop(rb_chunk);

    // verify chunk size updated (chunk dropped from moved state)
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 0);

    // Currently this doesn't work (as we need to teach the stores how to
    // purge tables after data bas been dropped println!("running
    // query after all data dropped!"); let expected = vec![] as
    // Vec<&str>; let batches = run_query(&db, "select * from
    // cpu").await; assert_batches_eq!(expected, &batches);
}
#[tokio::test]
async fn compact() {
    // Test that data can be read after it is compacted
    let (db, time) = make_db_time().await;

    let t_write1 = time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=1 10").await;

    let partition_key = "1970-01-01T00";
    db.rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();
    let old_rb_chunk = db
        .compact_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    // The compacted chunk carries the write time of its single source write.
    let first_old_rb_write = old_rb_chunk.time_of_first_write();
    let last_old_rb_write = old_rb_chunk.time_of_last_write();
    assert_eq!(first_old_rb_write, last_old_rb_write);
    assert_eq!(first_old_rb_write, t_write1);

    // Put new data into the mutable buffer
    let t_write2 = time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=2 20").await;

    // now, compact it
    let compacted_rb_chunk = db
        .compact_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    // no other read buffer data should be present
    assert_eq!(
        read_buffer_chunk_ids(&db, partition_key),
        vec![compacted_rb_chunk.id()]
    );
    assert_ne!(old_rb_chunk.id(), compacted_rb_chunk.id());

    // Compacted first/last write times should be the min of the first writes and the max
    // of the last writes of the compacted chunks
    let first_compacted_write = compacted_rb_chunk.time_of_first_write();
    let last_compacted_write = compacted_rb_chunk.time_of_last_write();
    assert_eq!(first_old_rb_write, first_compacted_write);
    assert_ne!(last_old_rb_write, last_compacted_write);
    assert_eq!(last_compacted_write, t_write2);

    // data should be readable
    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "| 2 | 1970-01-01T00:00:00.000000020Z |",
        "+-----+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu").await;
    assert_batches_eq!(&expected, &batches);
}
/// Run an unfiltered `read_filter` over `chunk` and gather every record batch,
/// panicking on any error.
async fn collect_read_filter(chunk: &DbChunk) -> Vec<RecordBatch> {
    let stream = chunk
        .read_filter(&Default::default(), Selection::All, &[])
        .unwrap();
    let results = stream.collect::<Vec<_>>().await;
    results.into_iter().map(Result::unwrap).collect()
}
#[tokio::test]
async fn load_to_read_buffer_sorted() {
    // Verifies that compacting a MUB chunk into the read buffer sorts the data; the MUB
    // keeps insertion order while the RUB is sorted on the chosen sort key.
    let test_db = make_db().await;
    let db = Arc::new(test_db.db);

    write_lp(db.as_ref(), "cpu,tag1=cupcakes bar=1 10").await;
    write_lp(db.as_ref(), "cpu,tag1=asfd,tag2=foo bar=2 20").await;
    write_lp(db.as_ref(), "cpu,tag1=bingo,tag2=foo bar=2 10").await;
    write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 20").await;
    write_lp(db.as_ref(), "cpu,tag1=bongo,tag2=a bar=2 10").await;
    write_lp(db.as_ref(), "cpu,tag2=a bar=3 5").await;

    let partition_key = "1970-01-01T00";
    let mb_chunk = db
        .rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    let mb = collect_read_filter(&mb_chunk).await;

    let registry = test_db.metric_registry.as_ref();

    // MUB chunk size
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 3607);

    // With the above data, cardinality of tag2 is 2 and tag1 is 5. Hence, RUB is sorted on (tag2, tag1)
    let rb_chunk = db
        .compact_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    // MUB chunk size
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 3618);

    let rb = collect_read_filter(&rb_chunk).await;

    // Test that data on load into the read buffer is sorted
    assert_batches_eq!(
        &[
            "+-----+----------+------+--------------------------------+",
            "| bar | tag1 | tag2 | time |",
            "+-----+----------+------+--------------------------------+",
            "| 1 | cupcakes | | 1970-01-01T00:00:00.000000010Z |",
            "| 2 | asfd | foo | 1970-01-01T00:00:00.000000020Z |",
            "| 2 | bingo | foo | 1970-01-01T00:00:00.000000010Z |",
            "| 2 | bongo | a | 1970-01-01T00:00:00.000000020Z |",
            "| 2 | bongo | a | 1970-01-01T00:00:00.000000010Z |",
            "| 3 | | a | 1970-01-01T00:00:00.000000005Z |",
            "+-----+----------+------+--------------------------------+",
        ],
        &mb
    );

    assert_batches_eq!(
        &[
            "+-----+----------+------+--------------------------------+",
            "| bar | tag1 | tag2 | time |",
            "+-----+----------+------+--------------------------------+",
            "| 1 | cupcakes | | 1970-01-01T00:00:00.000000010Z |",
            "| 3 | | a | 1970-01-01T00:00:00.000000005Z |",
            "| 2 | bongo | a | 1970-01-01T00:00:00.000000010Z |",
            "| 2 | bongo | a | 1970-01-01T00:00:00.000000020Z |",
            "| 2 | asfd | foo | 1970-01-01T00:00:00.000000020Z |",
            "| 2 | bingo | foo | 1970-01-01T00:00:00.000000010Z |",
            "+-----+----------+------+--------------------------------+",
        ],
        &rb
    );
}
/// List all parquet file paths currently present in `iox_storage`.
///
/// The object-store listing yields pages of paths; this flattens the paginated stream
/// into a single `Vec`, propagating the first error encountered.
async fn parquet_files(iox_storage: &IoxObjectStore) -> Result<Vec<ParquetFilePath>> {
    iox_storage
        .parquet_files()
        .await?
        .map_ok(|v| stream::iter(v).map(Ok))
        .try_flatten()
        .try_collect()
        .await
}
#[tokio::test]
async fn write_one_chunk_to_parquet_file() {
    // Test that data can be written into parquet files:
    // MUB -> (rollover) closed MUB -> (compact) RB -> (persist) RB + object store,
    // then the parquet file is read back and its contents verified.
    let object_store = Arc::new(ObjectStore::new_in_memory());
    let time = Arc::new(time::MockProvider::new(Time::from_timestamp(11, 22)));
    // Create a DB given a server id, an object store and a db name
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            // short late-arrival window so the partition can be persisted promptly
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .object_store(Arc::clone(&object_store))
        .time_provider(Arc::<time::MockProvider>::clone(&time))
        .build()
        .await;
    let db = test_db.db;
    // Write some line protocols in Mutable buffer of the DB
    write_lp(db.as_ref(), "cpu bar=1 10").await;
    time.inc(Duration::from_secs(1));
    write_lp(db.as_ref(), "cpu bar=2 20").await;
    // Now mark the MB chunk closed
    let partition_key = "1970-01-01T00";
    let mb_chunk = db
        .rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();
    // Move that MB chunk to RB chunk and drop it from MB
    let rb_chunk = db
        .compact_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();
    // Write the RB chunk to Object Store but keep it in RB
    time.inc(Duration::from_secs(1));
    // Pin the id assigned to the persisted chunk so assertions are deterministic
    *db.persisted_chunk_id_override.lock() = Some(ChunkId::new_test(1337));
    let pq_chunk = db
        .persist_partition("cpu", partition_key, true)
        .await
        .unwrap()
        .unwrap();
    let registry = test_db.metric_registry.as_ref();
    // After persisting: nothing left in the MUB; data sits in RB and object store
    catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
    catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 1700);
    catalog_chunk_size_bytes_metric_eq(registry, "object_store", 1231);
    // All the chunks should have different IDs
    assert_ne!(mb_chunk.id(), rb_chunk.id());
    assert_ne!(mb_chunk.id(), pq_chunk.id());
    // Exactly one chunk remains and it is present in both read buffer and parquet
    assert!(mutable_chunk_ids(&db, partition_key).is_empty());
    assert_eq!(read_buffer_chunk_ids(&db, partition_key).len(), 1);
    assert_eq!(parquet_file_chunk_ids(&db, partition_key).len(), 1);
    // Verify data written to the parquet file in object store
    //
    // First, there must be one path of object store in the catalog
    let path = pq_chunk.object_store_path().unwrap();
    // Check that the path must exist in the object store
    let path_list = parquet_files(&db.iox_object_store).await.unwrap();
    assert_eq!(path_list.len(), 1);
    assert_eq!(&path_list[0], path);
    // Now read data from that path
    let parquet_data =
        load_parquet_from_store_for_path(&path_list[0], Arc::clone(&db.iox_object_store))
            .await
            .unwrap();
    let parquet_metadata = IoxParquetMetaData::from_file_bytes(parquet_data.clone()).unwrap();
    // Read metadata at file level
    let schema = parquet_metadata.decode().unwrap().read_schema().unwrap();
    // Read data
    let record_batches =
        read_data_from_parquet_data(Arc::clone(&schema.as_arrow()), parquet_data);
    let expected = vec![
        "+-----+--------------------------------+",
        "| bar | time |",
        "+-----+--------------------------------+",
        "| 1 | 1970-01-01T00:00:00.000000010Z |",
        "| 2 | 1970-01-01T00:00:00.000000020Z |",
        "+-----+--------------------------------+",
    ];
    assert_batches_eq!(expected, &record_batches);
}
#[tokio::test]
async fn unload_chunk_from_read_buffer() {
// Test that data can be written into parquet files and then
// remove it from read buffer and make sure we are still
// be able to read data from object store
// Create an object store in memory
let object_store = Arc::new(ObjectStore::new_in_memory());
let time = Arc::new(time::MockProvider::new(Time::from_timestamp(11, 22)));
// Create a DB given a server id, an object store and a db name
let test_db = TestDb::builder()
.lifecycle_rules(LifecycleRules {
late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
..Default::default()
})
.object_store(Arc::clone(&object_store))
.time_provider(Arc::<time::MockProvider>::clone(&time))
.build()
.await;
let db = test_db.db;
// Write some line protocols in Mutable buffer of the DB
write_lp(db.as_ref(), "cpu bar=1 10").await;
time.inc(Duration::from_secs(1));
write_lp(db.as_ref(), "cpu bar=2 20").await;
// Now mark the MB chunk close
let partition_key = "1970-01-01T00";
let mb_chunk = db
.rollover_partition("cpu", "1970-01-01T00")
.await
.unwrap()
.unwrap();
// Move that MB chunk to RB chunk and drop it from MB
let rb_chunk = db
.compact_partition("cpu", partition_key)
.await
.unwrap()
.unwrap();
// Write the RB chunk to Object Store but keep it in RB
time.inc(Duration::from_secs(1));
*db.persisted_chunk_id_override.lock() = Some(ChunkId::new_test(1337));
let pq_chunk = db
.persist_partition("cpu", partition_key, true)
.await
.unwrap()
.unwrap();
// All chunks should have different ids
assert_ne!(mb_chunk.id(), rb_chunk.id());
assert_ne!(mb_chunk.id(), pq_chunk.id());
let pq_chunk_id = pq_chunk.id();
// we should have chunks in both the read buffer only
assert!(mutable_chunk_ids(&db, partition_key).is_empty());
assert_eq!(read_buffer_chunk_ids(&db, partition_key), vec![pq_chunk_id]);
assert_eq!(
parquet_file_chunk_ids(&db, partition_key),
vec![pq_chunk_id]
);
let registry = test_db.metric_registry.as_ref();
// Read buffer + Parquet chunk size
let object_store_bytes = 1231;
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 1700);
catalog_chunk_size_bytes_metric_eq(registry, "object_store", object_store_bytes);
// Unload RB chunk but keep it in OS
let pq_chunk = db
.unload_read_buffer("cpu", partition_key, pq_chunk_id)
.unwrap();
// still should be the same chunk!
assert_eq!(pq_chunk_id, pq_chunk.id());
// we should only have chunk in os
assert!(mutable_chunk_ids(&db, partition_key).is_empty());
assert!(read_buffer_chunk_ids(&db, partition_key).is_empty());
assert_eq!(
parquet_file_chunk_ids(&db, partition_key),
vec![pq_chunk_id]
);
// Parquet chunk size only
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 0);
catalog_chunk_size_bytes_metric_eq(registry, "object_store", object_store_bytes);
// Verify data written to the parquet file in object store
//
// First, there must be one path of object store in the catalog
let path = pq_chunk.object_store_path().unwrap();
// Check that the path must exist in the object store
let path_list = parquet_files(&db.iox_object_store).await.unwrap();
println!("path_list: {:#?}", path_list);
assert_eq!(path_list.len(), 1);
assert_eq!(&path_list[0], path);
// Now read data from that path
let parquet_data =
load_parquet_from_store_for_path(&path_list[0], Arc::clone(&db.iox_object_store))
.await
.unwrap();
let parquet_metadata = IoxParquetMetaData::from_file_bytes(parquet_data.clone()).unwrap();
// Read metadata at file level
let schema = parquet_metadata.decode().unwrap().read_schema().unwrap();
// Read data
let record_batches =
read_data_from_parquet_data(Arc::clone(&schema.as_arrow()), parquet_data);
let expected = vec![
"+-----+--------------------------------+",
"| bar | time |",
"+-----+--------------------------------+",
"| 1 | 1970-01-01T00:00:00.000000010Z |",
"| 2 | 1970-01-01T00:00:00.000000020Z |",
"+-----+--------------------------------+",
];
assert_batches_eq!(expected, &record_batches);
}
#[tokio::test]
async fn write_updates_last_write_at() {
    // A partition records when it was created and when it last received data.
    let (db, time) = make_db_time().await;
    let created = time.inc(Duration::from_secs(23));
    let partition_key = "1970-01-01T00";

    // Checks the partition's created/last-write timestamps.
    let assert_times = |expected_last| {
        let partition = db.catalog.partition("cpu", partition_key).unwrap();
        let partition = partition.read();
        assert_eq!(partition.created_at(), created);
        assert_eq!(partition.last_write_at(), expected_last);
    };

    // First write sets both timestamps to the same instant
    write_lp(&db, "cpu bar=1 10").await;
    assert_times(created);

    // A later write advances only the last-write timestamp
    let updated = time.inc(Duration::from_secs(1));
    write_lp(&db, "cpu bar=1 20").await;
    assert_times(updated);
}
#[tokio::test]
async fn failed_write_doesnt_update_last_write_at() {
    // A rejected write must leave partition and chunk timestamps untouched.
    let (db, time) = make_db_time().await;
    let t0 = time.inc(Duration::from_secs(2));
    let partition_key = "1970-01-01T00";

    write_lp(&db, "cpu bar=1 10").await;

    // Verifies that partition created/last-write times and the open chunk's
    // last-write time all still read `t0`.
    let assert_times_unchanged = || {
        let partition = db.catalog.partition("cpu", partition_key).unwrap();
        let partition = partition.read();
        assert_eq!(partition.created_at(), t0);
        assert_eq!(partition.last_write_at(), t0);

        let chunk = partition.open_chunk().unwrap();
        let chunk = chunk.read();
        assert_eq!(chunk.time_of_last_write(), t0);
    };
    assert_times_unchanged();

    // Advance the clock, then attempt a write whose field type conflicts
    // with the existing schema ("bar" was a float, now a bool)
    time.inc(Duration::from_secs(1));
    let entry = lp_to_entry("cpu bar=true 10");
    assert!(db.store_entry(entry).await.is_err());

    // The failed write must not have advanced any timestamp
    assert_times_unchanged();
}
#[tokio::test]
async fn write_updates_persistence_windows() {
    // Writes should update the persistence windows when there
    // is a write buffer configured.
    let write_buffer_state =
        MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap());
    let time_provider = Arc::new(time::MockProvider::new(Time::from_timestamp_nanos(0)));
    let write_buffer = Arc::new(
        MockBufferForWriting::new(write_buffer_state.clone(), None, time_provider).unwrap(),
    );
    let db = TestDb::builder()
        .write_buffer_producer(write_buffer)
        .build()
        .await
        .db;

    let partition_key = "1970-01-01T00";
    // Three writes are assigned sequence numbers 0, 1 and 2 respectively
    for &lp in &["cpu bar=1 10", "cpu bar=1 20", "cpu bar=1 30"] {
        write_lp(&db, lp).await;
    }

    // The persistence windows should now track sequences 0..=2 as unpersisted
    // for sequencer 0
    let partition = db.catalog.partition("cpu", partition_key).unwrap();
    let partition = partition.write();
    let min_unpersisted = partition
        .persistence_windows()
        .unwrap()
        .minimum_unpersisted_sequence()
        .unwrap();
    assert_eq!(min_unpersisted.get(&0).unwrap(), &MinMaxSequence::new(0, 2));
}
#[tokio::test]
async fn write_with_no_write_buffer_updates_sequence() {
    // Even without a write buffer, writes populate the persistence windows.
    let db = Arc::new(make_db().await.db);
    let partition_key = "1970-01-01T00";

    write_lp(&db, "cpu bar=1 10").await;
    write_lp(&db, "cpu bar=1 20").await;

    let partition = db.catalog.partition("cpu", partition_key).unwrap();
    let partition = partition.write();

    // Both rows must have landed in the partition's "cpu" table
    let table_summary = partition.summary().unwrap().table;
    assert_eq!(&table_summary.name, "cpu");
    assert_eq!(table_summary.total_count(), 2);

    // The open persistence window must span the two written row timestamps
    let windows = partition.persistence_windows().unwrap();
    assert_eq!(
        windows
            .minimum_unpersisted_timestamp()
            .unwrap()
            .timestamp_nanos(),
        10
    );
    assert_eq!(
        windows
            .maximum_unpersisted_timestamp()
            .unwrap()
            .timestamp_nanos(),
        20
    );
}
#[tokio::test]
async fn test_chunk_timestamps() {
    // A chunk remembers the wall-clock times of its first and last writes.
    let (db, time) = make_db_time().await;

    // Two writes into the same chunk at distinct wall-clock times
    let time_first_write = time.inc(Duration::from_secs(95));
    write_lp(&db, "cpu bar=1 10").await;
    let time_last_write = time.inc(Duration::from_secs(2));
    write_lp(&db, "cpu bar=1 20").await;

    // Roll the open chunk over to closed and keep its id
    let partition_key = "1970-01-01T00";
    let chunk_id = db
        .rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap()
        .id();

    // Look the chunk up again through the catalog
    let partition = db.catalog.partition("cpu", partition_key).unwrap();
    let partition = partition.read();
    let (chunk, _order) = partition.chunk(chunk_id).unwrap();
    let chunk = chunk.read();

    // Creation and rollover did not disturb the recorded write times
    assert_eq!(chunk.time_of_first_write(), time_first_write);
    assert_eq!(chunk.time_of_last_write(), time_last_write);
}
#[tokio::test]
async fn chunk_id_listing() {
    // Test that chunk id listing is hooked up
    let db = Arc::new(make_db().await.db);
    let partition_key = "1970-01-01T00";

    write_lp(&db, "cpu bar=1 10").await;
    write_lp(&db, "cpu bar=1 20").await;

    // Both writes land in a single open MUB chunk; nothing in the read buffer yet
    assert_eq!(mutable_chunk_ids(&db, partition_key).len(), 1);
    assert_eq!(
        read_buffer_chunk_ids(&db, partition_key),
        vec![] as Vec<ChunkId>
    );

    db.rollover_partition("cpu", partition_key)
        .await
        .unwrap()
        .unwrap();

    // add a new chunk in mutable buffer, and move chunk1 (but
    // not chunk 0) to read buffer
    write_lp(&db, "cpu bar=1 30").await;
    db.compact_open_chunk("cpu", partition_key).await.unwrap();
    write_lp(&db, "cpu bar=1 40").await;

    // Two MUB chunks (closed chunk 0 + new open one) and one RB chunk
    assert_eq!(mutable_chunk_ids(&db, partition_key).len(), 2);
    assert_eq!(read_buffer_chunk_ids(&db, partition_key).len(), 1);
}
#[tokio::test]
async fn partition_chunk_summaries() {
    // Test that chunk id listing is hooked up
    let db = Arc::new(make_db().await.db);
    write_lp(&db, "cpu bar=1 1").await;
    db.rollover_partition("cpu", "1970-01-01T00").await.unwrap();
    // write into a separate partition
    // NOTE(review): `baz2` below looks like it was meant to be `baz=2` — confirm
    // this line protocol is intentionally shaped this way
    write_lp(&db, "cpu bar=1,baz2,frob=3 400000000000000").await;
    print!("Partitions: {:?}", db.partition_keys().unwrap());
    // Only the second partition ("1970-01-05T15") is inspected below
    let chunk_summaries = db.partition_chunk_summaries("1970-01-05T15");
    let expected = vec![ChunkSummary {
        partition_key: Arc::from("1970-01-05T15"),
        table_name: Arc::from("cpu"),
        id: ChunkId::new_test(0),
        storage: ChunkStorage::OpenMutableBuffer,
        lifecycle_action: None,
        memory_bytes: 1006, // memory_size
        object_store_bytes: 0, // os_size
        row_count: 1,
        time_of_last_access: None,
        time_of_first_write: Time::from_timestamp_nanos(1),
        time_of_last_write: Time::from_timestamp_nanos(1),
        order: ChunkOrder::new(5).unwrap(),
    }];
    // The catalog's mutable-buffer memory metric must agree with the sum of
    // `memory_bytes` over all chunk summaries
    let size: usize = db
        .chunk_summaries()
        .unwrap()
        .into_iter()
        .map(|x| x.memory_bytes)
        .sum();
    assert_eq!(db.catalog.metrics().memory().mutable_buffer(), size);
    // Ids and timestamps vary between runs, so compare everything else
    for (expected_summary, actual_summary) in expected.iter().zip(chunk_summaries.iter()) {
        assert!(
            expected_summary.equal_without_timestamps_and_ids(actual_summary),
            "expected:\n{:#?}\n\nactual:{:#?}\n\n",
            expected_summary,
            actual_summary
        );
    }
}
#[tokio::test]
async fn partition_chunk_summaries_timestamp() {
    // A chunk summary reports the wall-clock times of its first/last writes.
    let (db, time) = make_db_time().await;

    // Two writes into the same chunk at distinct wall-clock times
    let first = time.inc(Duration::from_secs(2));
    write_lp(&db, "cpu bar=1 1").await;
    let second = time.inc(Duration::from_secs(2));
    write_lp(&db, "cpu bar=2 2").await;

    let mut summaries = db.chunk_summaries().unwrap();
    summaries.sort_by_key(|s| s.id);

    // The (single) chunk's summary reflects both write instants
    assert_eq!(summaries[0].time_of_first_write, first);
    assert_eq!(summaries[0].time_of_last_write, second);
}
/// Assert that a chunk saw exactly one write instant: its first and last
/// write times are both `expected` (and therefore equal to each other).
fn assert_first_last_times_eq(chunk_summary: &ChunkSummary, expected: Time) {
    assert_eq!(chunk_summary.time_of_first_write, expected);
    assert_eq!(chunk_summary.time_of_last_write, expected);
}
/// Assert that `before` was fully written strictly before `after` received
/// its first write.
fn assert_chunks_times_ordered(before: &ChunkSummary, after: &ChunkSummary) {
    assert!(before.time_of_last_write < after.time_of_first_write);
}
fn assert_chunks_times_eq(a: &ChunkSummary, b: &ChunkSummary) {
assert_chunks_first_times_eq(a, b);
assert_chunks_last_times_eq(a, b);
}
/// Assert that the two chunk summaries agree on their first-write time.
fn assert_chunks_first_times_eq(a: &ChunkSummary, b: &ChunkSummary) {
    assert_eq!(a.time_of_first_write, b.time_of_first_write);
}
/// Assert that the two chunk summaries agree on their last-write time.
fn assert_chunks_last_times_eq(a: &ChunkSummary, b: &ChunkSummary) {
    assert_eq!(a.time_of_last_write, b.time_of_last_write);
}
#[tokio::test]
async fn chunk_summaries() {
    // Walks chunks through the full lifecycle (open MUB -> closed MUB -> RB ->
    // RB+OS) and after each transition checks that `Db::chunk_summaries`
    // reports the right storage stage and that first/last write times are
    // carried over (or merged when chunks are compacted together).
    // Test that chunk id listing is hooked up
    let (db, time) = make_db_time().await;
    // get three chunks: one open, one closed in mb and one close in rb
    // In open chunk, will end up in rb/os
    let t1_write = Time::from_timestamp(11, 22);
    time.set(t1_write);
    write_lp(&db, "cpu bar=1 1").await;
    // Move open chunk to closed
    db.rollover_partition("cpu", "1970-01-01T00").await.unwrap();
    // New open chunk in mb
    // This point will end up in rb/os
    let t2_write = time.inc(Duration::from_secs(1));
    write_lp(&db, "cpu bar=1,baz=2 2").await;
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    assert_eq!(chunk_summaries.len(), 2);
    // Each chunk has one write, so both chunks should have first write == last write
    let closed_mb_t3 = chunk_summaries[0].clone();
    assert_eq!(closed_mb_t3.storage, ChunkStorage::ClosedMutableBuffer);
    assert_first_last_times_eq(&closed_mb_t3, t1_write);
    let open_mb_t3 = chunk_summaries[1].clone();
    assert_eq!(open_mb_t3.storage, ChunkStorage::OpenMutableBuffer);
    assert_first_last_times_eq(&open_mb_t3, t2_write);
    assert_chunks_times_ordered(&closed_mb_t3, &open_mb_t3);
    // This point makes a new open mb chunk and will end up in the closed mb chunk
    // (it belongs to a different partition: timestamp 400000000000000 ns)
    time.inc(Duration::from_secs(1));
    write_lp(&db, "cpu bar=1,baz=2,frob=3 400000000000000").await;
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    assert_eq!(chunk_summaries.len(), 3);
    // The closed chunk's times should be the same
    let closed_mb_t4 = chunk_summaries[0].clone();
    assert_eq!(closed_mb_t4.storage, ChunkStorage::ClosedMutableBuffer);
    assert_chunks_times_eq(&closed_mb_t4, &closed_mb_t3);
    // The first open chunk's times should be the same
    let open_mb_t4 = chunk_summaries[1].clone();
    assert_eq!(open_mb_t4.storage, ChunkStorage::OpenMutableBuffer);
    assert_chunks_times_eq(&open_mb_t4, &open_mb_t3);
    // The second open chunk's times should be later than the first open chunk's times
    let other_open_mb_t4 = chunk_summaries[2].clone();
    assert_eq!(other_open_mb_t4.storage, ChunkStorage::OpenMutableBuffer);
    assert_chunks_times_ordered(&open_mb_t4, &other_open_mb_t4);
    // Move closed mb chunk to rb
    db.compact_chunks("cpu", "1970-01-01T00", |chunk| {
        chunk.storage().1 == ChunkStorage::ClosedMutableBuffer
    })
    .await
    .unwrap();
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    assert_eq!(chunk_summaries.len(), 3);
    // The rb chunk's times should be the same as they were when this was the closed mb chunk
    let rb_t5 = chunk_summaries[0].clone();
    assert_eq!(rb_t5.storage, ChunkStorage::ReadBuffer);
    assert_chunks_times_eq(&rb_t5, &closed_mb_t4);
    // The first open chunk's times should be the same
    let open_mb_t5 = chunk_summaries[1].clone();
    assert_eq!(open_mb_t5.storage, ChunkStorage::OpenMutableBuffer);
    assert_chunks_times_eq(&open_mb_t5, &open_mb_t4);
    // The second open chunk's times should be the same
    let other_open_mb_t5 = chunk_summaries[2].clone();
    assert_eq!(other_open_mb_t5.storage, ChunkStorage::OpenMutableBuffer);
    assert_chunks_times_eq(&other_open_mb_t5, &other_open_mb_t4);
    // Persist rb to parquet os
    time.inc(Duration::from_secs(1));
    // Pin the persisted chunk id so the summary below is deterministic
    *db.persisted_chunk_id_override.lock() = Some(ChunkId::new_test(1337));
    db.persist_partition("cpu", "1970-01-01T00", true)
        .await
        .unwrap()
        .unwrap();
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    // Persisting compacts chunks, so now there's only 2
    assert_eq!(chunk_summaries.len(), 2);
    // The rb chunk's times should be the first write of the rb chunk and the last write
    // of the first open chunk that got compacted together
    let rb_t6 = chunk_summaries[0].clone();
    assert_eq!(rb_t6.storage, ChunkStorage::ReadBufferAndObjectStore);
    assert_chunks_first_times_eq(&rb_t6, &rb_t5);
    assert_chunks_last_times_eq(&rb_t6, &open_mb_t5);
    // The first open chunk had all its points moved into the persisted chunk.
    // The remaining open chunk is the other open chunk that did not contain any points
    // for the first partition
    let open_mb_t6 = chunk_summaries[1].clone();
    assert_eq!(open_mb_t6.storage, ChunkStorage::OpenMutableBuffer);
    assert_chunks_times_eq(&open_mb_t6, &other_open_mb_t5);
    // Move open chunk to closed
    db.rollover_partition("cpu", "1970-01-05T15").await.unwrap();
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    assert_eq!(chunk_summaries.len(), 2);
    // The rb chunk's times should still be the same
    let rb_t7 = chunk_summaries[0].clone();
    assert_eq!(rb_t7.storage, ChunkStorage::ReadBufferAndObjectStore);
    assert_chunks_times_eq(&rb_t7, &rb_t6);
    // The open chunk should now be closed but the times should be the same
    let closed_mb_t7 = chunk_summaries[1].clone();
    assert_eq!(closed_mb_t7.storage, ChunkStorage::ClosedMutableBuffer);
    assert_chunks_times_eq(&closed_mb_t7, &open_mb_t6);
    // New open chunk in mb
    // This point will stay in this open mb chunk
    let t5_write = time.inc(Duration::from_secs(1));
    write_lp(&db, "cpu bar=1,baz=3,blargh=3 400000000000000").await;
    // Check first/last write times on the chunks at this point
    let mut chunk_summaries = db.chunk_summaries().expect("expected summary to return");
    chunk_summaries.sort_unstable();
    assert_eq!(chunk_summaries.len(), 3);
    // The rb chunk's times should still be the same
    let rb_t8 = chunk_summaries[0].clone();
    assert_eq!(rb_t8.storage, ChunkStorage::ReadBufferAndObjectStore);
    assert_chunks_times_eq(&rb_t8, &rb_t7);
    // The closed chunk's times should still be the same
    let closed_mb_t8 = chunk_summaries[1].clone();
    assert_eq!(closed_mb_t8.storage, ChunkStorage::ClosedMutableBuffer);
    assert_chunks_times_eq(&closed_mb_t8, &closed_mb_t7);
    // The open chunk had one write, so its times should be between t7 and t8 and first/last
    // times should be the same
    let open_mb_t8 = chunk_summaries[2].clone();
    assert_eq!(open_mb_t8.storage, ChunkStorage::OpenMutableBuffer);
    assert_first_last_times_eq(&open_mb_t8, t5_write);
    // Final full-summary comparison; ids/orders/timestamps come from the actual
    // summaries since they vary, everything else is pinned
    let lifecycle_action = None;
    let expected = vec![
        ChunkSummary {
            partition_key: Arc::from("1970-01-01T00"),
            table_name: Arc::from("cpu"),
            order: chunk_summaries[0].order,
            id: chunk_summaries[0].id,
            storage: ChunkStorage::ReadBufferAndObjectStore,
            lifecycle_action,
            memory_bytes: 4079, // size of RB and OS chunks
            object_store_bytes: 1557, // size of parquet file
            row_count: 2,
            time_of_last_access: None,
            time_of_first_write: Time::from_timestamp_nanos(1),
            time_of_last_write: Time::from_timestamp_nanos(1),
        },
        ChunkSummary {
            partition_key: Arc::from("1970-01-05T15"),
            table_name: Arc::from("cpu"),
            order: chunk_summaries[1].order,
            id: chunk_summaries[1].id,
            storage: ChunkStorage::ClosedMutableBuffer,
            lifecycle_action,
            memory_bytes: 2486,
            object_store_bytes: 0, // no OS chunks
            row_count: 1,
            time_of_last_access: None,
            time_of_first_write: Time::from_timestamp_nanos(1),
            time_of_last_write: Time::from_timestamp_nanos(1),
        },
        ChunkSummary {
            partition_key: Arc::from("1970-01-05T15"),
            table_name: Arc::from("cpu"),
            order: chunk_summaries[2].order,
            id: chunk_summaries[2].id,
            storage: ChunkStorage::OpenMutableBuffer,
            lifecycle_action,
            memory_bytes: 1303,
            object_store_bytes: 0, // no OS chunks
            row_count: 1,
            time_of_last_access: None,
            time_of_first_write: Time::from_timestamp_nanos(1),
            time_of_last_write: Time::from_timestamp_nanos(1),
        },
    ];
    for (expected_summary, actual_summary) in expected.iter().zip(chunk_summaries.iter()) {
        assert!(
            expected_summary.equal_without_timestamps_and_ids(actual_summary),
            "\n\nexpected item:\n{:#?}\n\nactual item:\n{:#?}\n\n\
            all expected:\n{:#?}\n\nall actual:\n{:#?}",
            expected_summary,
            actual_summary,
            expected,
            chunk_summaries
        );
    }
    // Catalog memory accounting must match the per-stage sizes asserted above
    assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 1303);
    assert_eq!(db.catalog.metrics().memory().read_buffer(), 2550);
    assert_eq!(db.catalog.metrics().memory().object_store(), 1529);
}
#[tokio::test]
async fn partition_summaries() {
    // Writes into two partitions across two tables, moves one partition
    // through compaction and persistence, and then checks that
    // `Db::partition_summary` reports correct per-column statistics for each
    // (table, partition) pair regardless of the chunks' storage stage.
    // Test that chunk id listing is hooked up
    let db = make_db().await.db;
    write_lp(&db, "cpu bar=1 1").await;
    db.rollover_partition("cpu", "1970-01-01T00")
        .await
        .unwrap()
        .unwrap();
    write_lp(&db, "cpu bar=2,baz=3.0 2").await;
    write_lp(&db, "mem foo=1 1").await;
    // load a chunk to the read buffer
    db.compact_partition("cpu", "1970-01-01T00").await.unwrap();
    // write the read buffer chunk to object store
    db.persist_partition("cpu", "1970-01-01T00", true)
        .await
        .unwrap();
    // write into a separate partition (timestamps ~400000000000000 ns fall
    // into the "1970-01-05T15" partition)
    write_lp(&db, "cpu bar=1 400000000000000").await;
    write_lp(&db, "mem frob=3 400000000000001").await;
    print!("Partitions: {:?}", db.partition_keys().unwrap());
    let partition_summaries = vec![
        db.partition_summary("cpu", "1970-01-01T00").unwrap(),
        db.partition_summary("mem", "1970-01-01T00").unwrap(),
        db.partition_summary("cpu", "1970-01-05T15").unwrap(),
        db.partition_summary("mem", "1970-01-05T15").unwrap(),
    ];
    // Expected stats: StatValues::new(min, max, total_count, null_count)
    let expected = vec![
        PartitionSummary {
            key: "1970-01-01T00".into(),
            table: TableSummary {
                name: "cpu".into(),
                columns: vec![
                    ColumnSummary {
                        name: "bar".into(),
                        influxdb_type: Some(InfluxDbType::Field),
                        stats: Statistics::F64(StatValues::new(Some(1.0), Some(2.0), 2, 0)),
                    },
                    ColumnSummary {
                        name: "baz".into(),
                        influxdb_type: Some(InfluxDbType::Field),
                        // "baz" is absent from the first row, hence 1 null
                        stats: Statistics::F64(StatValues::new(Some(3.0), Some(3.0), 2, 1)),
                    },
                    ColumnSummary {
                        name: "time".into(),
                        influxdb_type: Some(InfluxDbType::Timestamp),
                        stats: Statistics::I64(StatValues::new(Some(1), Some(2), 2, 0)),
                    },
                ],
            },
        },
        PartitionSummary {
            key: "1970-01-01T00".into(),
            table: TableSummary {
                name: "mem".into(),
                columns: vec![
                    ColumnSummary {
                        name: "foo".into(),
                        influxdb_type: Some(InfluxDbType::Field),
                        stats: Statistics::F64(StatValues::new(Some(1.0), Some(1.0), 1, 0)),
                    },
                    ColumnSummary {
                        name: "time".into(),
                        influxdb_type: Some(InfluxDbType::Timestamp),
                        stats: Statistics::I64(StatValues::new(Some(1), Some(1), 1, 0)),
                    },
                ],
            },
        },
        PartitionSummary {
            key: "1970-01-05T15".into(),
            table: TableSummary {
                name: "cpu".into(),
                columns: vec![
                    ColumnSummary {
                        name: "bar".into(),
                        influxdb_type: Some(InfluxDbType::Field),
                        stats: Statistics::F64(StatValues::new(Some(1.0), Some(1.0), 1, 0)),
                    },
                    ColumnSummary {
                        name: "time".into(),
                        influxdb_type: Some(InfluxDbType::Timestamp),
                        stats: Statistics::I64(StatValues::new(
                            Some(400000000000000),
                            Some(400000000000000),
                            1,
                            0,
                        )),
                    },
                ],
            },
        },
        PartitionSummary {
            key: "1970-01-05T15".into(),
            table: TableSummary {
                name: "mem".into(),
                columns: vec![
                    ColumnSummary {
                        name: "frob".into(),
                        influxdb_type: Some(InfluxDbType::Field),
                        stats: Statistics::F64(StatValues::new(Some(3.0), Some(3.0), 1, 0)),
                    },
                    ColumnSummary {
                        name: "time".into(),
                        influxdb_type: Some(InfluxDbType::Timestamp),
                        stats: Statistics::I64(StatValues::new(
                            Some(400000000000001),
                            Some(400000000000001),
                            1,
                            0,
                        )),
                    },
                ],
            },
        },
    ];
    assert_eq!(
        expected, partition_summaries,
        "expected:\n{:#?}\n\nactual:{:#?}\n\n",
        expected, partition_summaries
    );
}
/// Sorted ids of every chunk in the given partition that still lives in the
/// mutable buffer (open or closed).
fn mutable_chunk_ids(db: &Db, partition_key: &str) -> Vec<ChunkId> {
    let mut ids: Vec<ChunkId> = db
        .partition_chunk_summaries(partition_key)
        .into_iter()
        .filter(|summary| {
            matches!(
                summary.storage,
                ChunkStorage::OpenMutableBuffer | ChunkStorage::ClosedMutableBuffer
            )
        })
        .map(|summary| summary.id)
        .collect();
    ids.sort_unstable();
    ids
}
/// Sorted ids of every chunk in the given partition that has a read buffer
/// representation (with or without an object store copy).
fn read_buffer_chunk_ids(db: &Db, partition_key: &str) -> Vec<ChunkId> {
    let mut ids: Vec<ChunkId> = db
        .partition_chunk_summaries(partition_key)
        .into_iter()
        .filter(|summary| {
            matches!(
                summary.storage,
                ChunkStorage::ReadBuffer | ChunkStorage::ReadBufferAndObjectStore
            )
        })
        .map(|summary| summary.id)
        .collect();
    ids.sort_unstable();
    ids
}
/// Sorted ids of every chunk in the given partition that is backed by a
/// parquet file in object store (with or without a read buffer copy).
fn parquet_file_chunk_ids(db: &Db, partition_key: &str) -> Vec<ChunkId> {
    let mut ids: Vec<ChunkId> = db
        .partition_chunk_summaries(partition_key)
        .into_iter()
        .filter(|summary| {
            matches!(
                summary.storage,
                ChunkStorage::ReadBufferAndObjectStore | ChunkStorage::ObjectStoreOnly
            )
        })
        .map(|summary| summary.id)
        .collect();
    ids.sort_unstable();
    ids
}
#[tokio::test]
async fn write_chunk_to_object_store_in_background() {
    // Test that data can be written to object store using a background task
    let db = make_db().await.db;
    // create MB partition
    write_lp(db.as_ref(), "cpu bar=1 10").await;
    write_lp(db.as_ref(), "cpu bar=2 20").await;
    // MB => RB
    let partition_key = "1970-01-01T00";
    let table_name = "cpu";
    let mb_chunk = db
        .rollover_partition(table_name, partition_key)
        .await
        .unwrap()
        .unwrap();
    let rb_chunk = db
        .compact_partition(table_name, partition_key)
        .await
        .unwrap()
        .unwrap();
    // compaction produces a new chunk with a fresh id
    assert_ne!(mb_chunk.id(), rb_chunk.id());
    // RB => OS
    db.persist_partition(table_name, partition_key, true)
        .await
        .unwrap();
    // after persisting, the single remaining chunk is present in both the
    // read buffer and object store; nothing is left in the mutable buffer
    assert!(mutable_chunk_ids(&db, partition_key).is_empty());
    assert_eq!(read_buffer_chunk_ids(&db, partition_key).len(), 1);
    assert_eq!(parquet_file_chunk_ids(&db, partition_key).len(), 1);
}
#[tokio::test]
async fn write_hard_limit() {
let db = TestDb::builder()
.lifecycle_rules(LifecycleRules {
buffer_size_hard: Some(NonZeroUsize::new(10).unwrap()),
..Default::default()
})
.build()
.await
.db;
// inserting first line does not trigger hard buffer limit
write_lp(db.as_ref(), "cpu bar=1 10").await;
// but second line will
assert!(matches!(
try_write_lp(db.as_ref(), "cpu bar=2 20").await,
Err(super::Error::HardLimitReached {})
));
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn lock_tracker_metrics() {
    // Verifies that catalog lock acquisitions (partition + chunk, shared +
    // exclusive) are counted in the `catalog_lock` metric, and that blocked
    // acquisitions accumulate wait time in `catalog_lock_wait`.
    // Needs a multi-threaded runtime because a spawned task must block on a
    // lock held by the test thread.
    let object_store = Arc::new(ObjectStore::new_in_memory());
    // Create a DB given a server id, an object store and a db name
    let server_id = ServerId::try_from(10).unwrap();
    let db_name = "lock_tracker";
    let test_db = TestDb::builder()
        .server_id(server_id)
        .object_store(Arc::clone(&object_store))
        .db_name(db_name)
        // "disable" clean-up by setting it to a very long time to avoid interference with this test
        .worker_cleanup_avg_sleep(Duration::from_secs(1_000))
        .build()
        .await;
    let db = Arc::new(test_db.db);
    write_lp(db.as_ref(), "cpu bar=1 10").await;
    // Baseline: the write took one exclusive partition lock, no shared ones
    let mut reporter = metric::RawReporter::default();
    test_db.metric_registry.report(&mut reporter);
    let exclusive = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "partition"),
            ("table", "cpu"),
            ("access", "exclusive"),
        ])
        .unwrap();
    assert_eq!(exclusive, &Observation::U64Counter(1));
    let shared = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "partition"),
            ("table", "cpu"),
            ("access", "shared"),
        ])
        .unwrap();
    assert_eq!(shared, &Observation::U64Counter(0));
    let chunks = db.catalog.chunks();
    assert_eq!(chunks.len(), 1);
    // Hold an exclusive chunk lock on this thread while a background task
    // tries to take a shared lock on the same chunk, forcing it to wait
    let (sender, receiver) = tokio::sync::oneshot::channel();
    let chunk_a = Arc::clone(&chunks[0]);
    let chunk_b = Arc::clone(&chunks[0]);
    let chunk_b = chunk_b.write();
    let task = tokio::spawn(async move {
        sender.send(()).unwrap();
        let _ = chunk_a.read();
    });
    // Wait for background task to reach lock
    let _ = receiver.await.unwrap();
    // Hold lock for 100 milliseconds blocking background task
    std::thread::sleep(std::time::Duration::from_millis(100));
    std::mem::drop(chunk_b);
    task.await.unwrap();
    // Re-report and check the counters reflect the lock traffic above
    let mut reporter = metric::RawReporter::default();
    test_db.metric_registry.report(&mut reporter);
    let exclusive = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "partition"),
            ("table", "cpu"),
            ("access", "exclusive"),
        ])
        .unwrap();
    assert_eq!(exclusive, &Observation::U64Counter(1));
    let shared = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "partition"),
            ("table", "cpu"),
            ("access", "shared"),
        ])
        .unwrap();
    assert_eq!(shared, &Observation::U64Counter(1));
    let exclusive_chunk = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "chunk"),
            ("table", "cpu"),
            ("access", "exclusive"),
        ])
        .unwrap();
    assert_eq!(exclusive_chunk, &Observation::U64Counter(2));
    let shared_chunk = reporter
        .metric("catalog_lock")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "chunk"),
            ("table", "cpu"),
            ("access", "shared"),
        ])
        .unwrap();
    assert_eq!(shared_chunk, &Observation::U64Counter(1));
    // The blocked shared acquisition must have waited most of the 100ms hold;
    // the 70ms bound leaves slack for scheduling jitter
    let shared_chunk_wait = reporter
        .metric("catalog_lock_wait")
        .unwrap()
        .observation(&[
            ("db_name", "lock_tracker"),
            ("lock", "chunk"),
            ("table", "cpu"),
            ("access", "shared"),
        ])
        .unwrap();
    assert!(
        matches!(shared_chunk_wait, Observation::DurationCounter(d) if d > &Duration::from_millis(70))
    )
}
#[tokio::test]
async fn write_to_preserved_catalog() {
    // Test that parquet data is committed to preserved catalog

    // ==================== setup ====================
    let object_store = Arc::new(ObjectStore::new_in_memory());
    let server_id = ServerId::try_from(1).unwrap();
    let db_name = "preserved_catalog_test";

    // ==================== do: create DB ====================
    // Create a DB given a server id, an object store and a db name
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .build()
        .await;
    let db = test_db.db;

    // ==================== check: empty catalog created ====================
    // at this point, an empty preserved catalog exists
    let config = db.preserved_catalog.config();
    let maybe_preserved_catalog = load_ok(config.clone()).await;
    assert!(maybe_preserved_catalog.is_some());

    // ==================== do: write data to parquet ====================
    // create four chunks within the same table (to better test "new chunk ID" and "new table"
    // during transaction replay, as well as dropping a chunk — the last one is dropped below)
    let mut chunks = vec![];
    for _ in 0..4 {
        chunks.push(create_parquet_chunk(&db).await);
    }

    // ==================== do: drop last chunk ====================
    let (table_name, partition_key, chunk_id) = chunks.pop().unwrap();
    db.drop_chunk(&table_name, &partition_key, chunk_id)
        .await
        .unwrap();

    // ==================== check: catalog state ====================
    // the preserved catalog should now track exactly the parquet files of the remaining
    // (non-dropped) chunks
    let mut paths_expected = vec![];
    for (table_name, partition_key, chunk_id) in &chunks {
        let (chunk, _order) = db.chunk(table_name, partition_key, *chunk_id).unwrap();
        let chunk = chunk.read();
        if let ChunkStage::Persisted { parquet, .. } = chunk.stage() {
            paths_expected.push(parquet.path().clone());
        } else {
            panic!("Wrong chunk state.");
        }
    }
    paths_expected.sort();
    let (_preserved_catalog, catalog) = load_ok(config).await.unwrap();
    let paths_actual = {
        let mut tmp: Vec<_> = catalog.files().map(|info| info.path.clone()).collect();
        tmp.sort();
        tmp
    };
    assert_eq!(paths_actual, paths_expected);

    // ==================== do: remember table schema ====================
    // capture schemas so we can check they survive a DB reload
    let mut table_schemas: HashMap<String, Arc<Schema>> = Default::default();
    for (table_name, _partition_key, _chunk_id) in &chunks {
        let schema = db.table_schema(table_name).unwrap();
        table_schemas.insert(table_name.clone(), schema);
    }

    // ==================== do: re-load DB ====================
    // Re-create database with same store, serverID, and DB name
    drop(db);
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== check: DB state ====================
    // Re-created DB should have an "object store only"-chunk (persisted, but nothing
    // loaded into the read buffer)
    assert_eq!(chunks.len(), db.chunks(&Default::default()).len());
    for (table_name, partition_key, chunk_id) in &chunks {
        let (chunk, _order) = db.chunk(table_name, partition_key, *chunk_id).unwrap();
        let chunk = chunk.read();
        assert!(matches!(
            chunk.stage(),
            ChunkStage::Persisted {
                read_buffer: None,
                ..
            }
        ));
    }
    // table schemas must be identical after the reload
    for (table_name, schema) in &table_schemas {
        let schema2 = db.table_schema(table_name).unwrap();
        assert_eq!(schema2.deref(), schema.deref());
    }

    // ==================== check: DB still writable ====================
    write_lp(db.as_ref(), "cpu bar=1 10").await;
}
#[tokio::test]
async fn object_store_cleanup() {
    // Test that stale parquet files are removed from object store

    // ==================== setup ====================
    let object_store = Arc::new(ObjectStore::new_in_memory());

    // ==================== do: create DB ====================
    // Create a DB given a server id, an object store and a db name
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .object_store(Arc::clone(&object_store))
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== do: write data to parquet ====================
    // create the following chunks:
    //   0: ReadBuffer + Parquet
    //   1: only Parquet
    //   2: dropped (not in current catalog but parquet file still present for time travel)
    // all three parquet files are referenced by the catalog and must survive cleanup
    let mut paths_keep = vec![];
    for i in 0..3i8 {
        let (table_name, partition_key, chunk_id) = create_parquet_chunk(&db).await;
        let (chunk, _order) = db.chunk(&table_name, &partition_key, chunk_id).unwrap();
        let chunk = chunk.read();
        if let ChunkStage::Persisted { parquet, .. } = chunk.stage() {
            paths_keep.push(parquet.path().clone());
        } else {
            panic!("Wrong chunk state.");
        }
        // drop lock
        drop(chunk);
        if i == 1 {
            db.unload_read_buffer(&table_name, &partition_key, chunk_id)
                .unwrap();
        }
        if i == 2 {
            db.drop_chunk(&table_name, &partition_key, chunk_id)
                .await
                .unwrap();
        }
    }

    // ==================== do: create garbage ====================
    // an empty parquet file that no catalog transaction ever referenced; the cleanup
    // job should delete it
    let path_delete = ParquetFilePath::new(&ChunkAddr {
        table_name: "cpu".into(),
        partition_key: "123".into(),
        chunk_id: ChunkId::new_test(3),
        db_name: "not used".into(),
    });
    create_empty_file(&db.iox_object_store, &path_delete).await;

    // ==================== check: all files are there ====================
    let all_files = parquet_files(&db.iox_object_store).await.unwrap();
    for path in &paths_keep {
        assert!(all_files.contains(path));
    }

    // ==================== do: start background task loop ====================
    let shutdown: CancellationToken = Default::default();
    let shutdown_captured = shutdown.clone();
    let db_captured = Arc::clone(&db);
    let join_handle =
        tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });

    // ==================== check: after a while the dropped file should be gone ====================
    // poll every 100ms; fail the test if cleanup hasn't happened within 10 seconds
    let t_0 = Instant::now();
    loop {
        let all_files = parquet_files(&db.iox_object_store).await.unwrap();
        if !all_files.contains(&path_delete) {
            break;
        }
        assert!(t_0.elapsed() < Duration::from_secs(10));
        tokio::time::sleep(Duration::from_millis(100)).await;
    }

    // ==================== do: stop background task loop ====================
    shutdown.cancel();
    join_handle.await.unwrap();

    // ==================== check: some files are there ====================
    // the garbage file is gone, but all catalog-referenced files survived
    let all_files = parquet_files(&db.iox_object_store).await.unwrap();
    assert!(!all_files.contains(&path_delete));
    for path in &paths_keep {
        assert!(all_files.contains(path));
    }
}
#[tokio::test]
async fn checkpointing() {
    // Test that the preserved catalog creates checkpoints

    // ==================== setup ====================
    let object_store = Arc::new(ObjectStore::new_in_memory());
    let server_id = ServerId::try_from(1).unwrap();
    let db_name = "preserved_catalog_test";

    // ==================== do: create DB ====================
    // Create a DB given a server id, an object store and a db name;
    // a checkpoint is written after every 2 catalog transactions
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .lifecycle_rules(LifecycleRules {
            catalog_transactions_until_checkpoint: NonZeroU64::try_from(2).unwrap(),
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== do: write data to parquet ====================
    // create two chunks within the same table (to better test "new chunk ID" and "new table" during transaction
    // replay)
    let mut chunks = vec![];
    for _ in 0..2 {
        chunks.push(create_parquet_chunk(&db).await);
    }

    // ==================== do: remove .txn files ====================
    // delete all plain transaction files; if a checkpoint was created, the catalog
    // must still be fully reconstructable from it alone
    let files = db
        .iox_object_store
        .catalog_transaction_files()
        .await
        .unwrap()
        .try_concat()
        .await
        .unwrap();
    let mut deleted_one = false;
    for file in files {
        if !file.is_checkpoint() {
            db.iox_object_store
                .delete_catalog_transaction_file(&file)
                .await
                .unwrap();
            deleted_one = true;
        }
    }
    // sanity check: there was at least one non-checkpoint file to delete
    assert!(deleted_one);
    drop(db);

    // ==================== do: re-load DB ====================
    // Re-create database with same store, serverID, and DB name
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== check: DB state ====================
    // Re-created DB should have an "object store only"-chunk, proving the reload
    // succeeded from checkpoints only
    for (table_name, partition_key, chunk_id) in &chunks {
        let (chunk, _order) = db.chunk(table_name, partition_key, *chunk_id).unwrap();
        let chunk = chunk.read();
        assert!(matches!(
            chunk.stage(),
            ChunkStage::Persisted {
                read_buffer: None,
                ..
            }
        ));
    }

    // ==================== check: DB still writable ====================
    write_lp(db.as_ref(), "cpu bar=1 10").await;
}
#[tokio::test]
async fn transaction_pruning() {
    // Test that the background worker prunes transactions

    // ==================== setup ====================
    let object_store = Arc::new(ObjectStore::new_in_memory());
    let server_id = ServerId::try_from(1).unwrap();
    let db_name = "transaction_pruning_test";

    // ==================== do: create DB ====================
    // Create a DB given a server id, an object store and a db name;
    // checkpoint after every transaction and prune nearly immediately (1ms age)
    // so the worker has prunable history right away
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .lifecycle_rules(LifecycleRules {
            catalog_transactions_until_checkpoint: NonZeroU64::try_from(1).unwrap(),
            catalog_transaction_prune_age: Duration::from_millis(1),
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== do: write data to parquet ====================
    create_parquet_chunk(&db).await;

    // ==================== do: start background task loop ====================
    let shutdown: CancellationToken = Default::default();
    let shutdown_captured = shutdown.clone();
    let db_captured = Arc::clone(&db);
    let join_handle =
        tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });

    // ==================== check: after a while transaction revision 0 should be pruned ====================
    // poll every 100ms; fail the test if pruning hasn't happened within 10 seconds
    let t_0 = Instant::now();
    loop {
        let all_revisions = db
            .iox_object_store()
            .catalog_transaction_files()
            .await
            .unwrap()
            .map_ok(|files| {
                files
                    .into_iter()
                    .map(|f| f.revision_counter)
                    .collect::<Vec<u64>>()
            })
            .try_concat()
            .await
            .unwrap();
        if !all_revisions.contains(&0) {
            break;
        }
        assert!(t_0.elapsed() < Duration::from_secs(10));
        tokio::time::sleep(Duration::from_millis(100)).await;
    }

    // ==================== do: stop background task loop ====================
    shutdown.cancel();
    join_handle.await.unwrap();
}
#[tokio::test]
async fn delete_predicate_preservation() {
    // Test that delete predicates are stored within the preserved catalog
    maybe_start_logging();

    // ==================== setup ====================
    let object_store = Arc::new(ObjectStore::new_in_memory());
    let server_id = ServerId::try_from(1).unwrap();
    let db_name = "delete_predicate_preservation_test";

    // ==================== do: create DB ====================
    // Create a DB given a server id, an object store and a db name;
    // partitioned by the "part" tag so each test case below gets its own partition
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .lifecycle_rules(LifecycleRules {
            catalog_transactions_until_checkpoint: NonZeroU64::try_from(1).unwrap(),
            // do not prune transactions files because this tests relies on them
            catalog_transaction_prune_age: Duration::from_secs(1_000),
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            ..Default::default()
        })
        .partition_template(PartitionTemplate {
            parts: vec![TemplatePart::Column("part".to_string())],
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== do: create chunks ====================
    // one partition per chunk lifecycle stage we want to cover
    let table_name = "cpu";

    // 1: preserved
    let partition_key = "part_a";
    write_lp(&db, "cpu,part=a row=10,selector=0i 10").await;
    write_lp(&db, "cpu,part=a row=11,selector=1i 11").await;
    db.persist_partition(table_name, partition_key, true)
        .await
        .unwrap();

    // 2: RUB
    let partition_key = "part_b";
    write_lp(&db, "cpu,part=b row=20,selector=0i 20").await;
    write_lp(&db, "cpu,part=b row=21,selector=1i 21").await;
    db.compact_partition(table_name, partition_key)
        .await
        .unwrap();

    // 3: MUB
    let _partition_key = "part_c";
    write_lp(&db, "cpu,part=c row=30,selector=0i 30").await;
    write_lp(&db, "cpu,part=c row=31,selector=1i 31").await;

    // 4: preserved and unloaded
    let partition_key = "part_d";
    write_lp(&db, "cpu,part=d row=40,selector=0i 40").await;
    write_lp(&db, "cpu,part=d row=41,selector=1i 41").await;
    let chunk_id = db
        .persist_partition(table_name, partition_key, true)
        .await
        .unwrap()
        .unwrap()
        .id();
    db.unload_read_buffer(table_name, partition_key, chunk_id)
        .unwrap();

    // ==================== do: delete ====================
    // delete all rows with selector=1 across the whole table
    let pred = Arc::new(DeletePredicate {
        range: TimestampRange {
            start: 0,
            end: 1_000,
        },
        exprs: vec![DeleteExpr::new(
            "selector".to_string(),
            predicate::delete_expr::Op::Eq,
            predicate::delete_expr::Scalar::I64(1),
        )],
    });
    db.delete("cpu", Arc::clone(&pred)).await.unwrap();

    // ==================== do: preserve another partition ====================
    // "part_b" is persisted AFTER the delete, so its predicate is materialized
    let partition_key = "part_b";
    db.persist_partition(table_name, partition_key, true)
        .await
        .unwrap();

    // ==================== do: use background worker for a short while ====================
    // wait until the worker has both preserved the delete predicate and compacted "part_c"
    let iters_start = db.worker_iterations_delete_predicate_preservation();
    let shutdown: CancellationToken = Default::default();
    let shutdown_captured = shutdown.clone();
    let db_captured = Arc::clone(&db);
    let join_handle =
        tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });

    let t_0 = Instant::now();
    loop {
        let did_delete_predicate_preservation =
            db.worker_iterations_delete_predicate_preservation() > iters_start;
        let did_compaction = db.chunk_summaries().unwrap().into_iter().any(|summary| {
            (summary.partition_key.as_ref() == "part_c")
                && (summary.storage == ChunkStorage::ReadBuffer)
        });
        if did_delete_predicate_preservation && did_compaction {
            break;
        }
        assert!(t_0.elapsed() < Duration::from_secs(10));
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
    shutdown.cancel();
    join_handle.await.unwrap();

    // ==================== check: delete predicates ====================
    let closure_check_delete_predicates = |db: &Db| {
        for chunk in db.catalog.chunks() {
            let chunk = chunk.read();
            if chunk.addr().partition_key.as_ref() == "part_b" {
                // Strictly speaking not required because the chunk was persisted AFTER the delete predicate was
                // registered so we can get away with materializing it during persistence.
                continue;
            }
            if chunk.addr().partition_key.as_ref() == "part_c" {
                // This partition was compacted, so the delete predicates were materialized.
                continue;
            }
            let predicates = chunk.delete_predicates();
            assert_eq!(predicates.len(), 1);
            assert_eq!(predicates[0].as_ref(), pred.as_ref());
        }
    };
    closure_check_delete_predicates(&db);

    // ==================== check: query ====================
    // only selector=0 rows survive the delete
    let expected = vec![
        "+------+-----+----------+--------------------------------+",
        "| part | row | selector | time |",
        "+------+-----+----------+--------------------------------+",
        "| a | 10 | 0 | 1970-01-01T00:00:00.000000010Z |",
        "| b | 20 | 0 | 1970-01-01T00:00:00.000000020Z |",
        "| c | 30 | 0 | 1970-01-01T00:00:00.000000030Z |",
        "| d | 40 | 0 | 1970-01-01T00:00:00.000000040Z |",
        "+------+-----+----------+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu order by time").await;
    assert_batches_sorted_eq!(&expected, &batches);

    // ==================== do: re-load DB ====================
    // Re-create database with same store, serverID, and DB name
    drop(db);
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== check: delete predicates ====================
    closure_check_delete_predicates(&db);

    // ==================== check: query ====================
    // NOTE: partition "c" is gone here because it was not written to object store
    let expected = vec![
        "+------+-----+----------+--------------------------------+",
        "| part | row | selector | time |",
        "+------+-----+----------+--------------------------------+",
        "| a | 10 | 0 | 1970-01-01T00:00:00.000000010Z |",
        "| b | 20 | 0 | 1970-01-01T00:00:00.000000020Z |",
        "| d | 40 | 0 | 1970-01-01T00:00:00.000000040Z |",
        "+------+-----+----------+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu order by time").await;
    assert_batches_sorted_eq!(&expected, &batches);

    // ==================== do: remove checkpoint files ====================
    // force the next reload to replay plain transaction files instead of checkpoints
    let files = db
        .iox_object_store
        .catalog_transaction_files()
        .await
        .unwrap()
        .try_concat()
        .await
        .unwrap();
    let mut deleted_one = false;
    for file in files {
        if file.is_checkpoint() {
            db.iox_object_store
                .delete_catalog_transaction_file(&file)
                .await
                .unwrap();
            deleted_one = true;
        }
    }
    assert!(deleted_one);

    // ==================== do: re-load DB ====================
    // Re-create database with same store, serverID, and DB name
    drop(db);
    let test_db = TestDb::builder()
        .object_store(Arc::clone(&object_store))
        .server_id(server_id)
        .db_name(db_name)
        .build()
        .await;
    let db = Arc::new(test_db.db);

    // ==================== check: delete predicates ====================
    closure_check_delete_predicates(&db);

    // ==================== check: query ====================
    // NOTE: partition "c" is gone here because it was not written to object store.
    // BUGFIX: this binding was previously `let _expected`, leaving it unused and making
    // the assertion below silently reuse the stale `expected` from the earlier check;
    // it now shadows `expected` so the assertion tests the intended value.
    let expected = vec![
        "+------+-----+----------+--------------------------------+",
        "| part | row | selector | time |",
        "+------+-----+----------+--------------------------------+",
        "| a | 10 | 0 | 1970-01-01T00:00:00.000000010Z |",
        "| b | 20 | 0 | 1970-01-01T00:00:00.000000020Z |",
        "| d | 40 | 0 | 1970-01-01T00:00:00.000000040Z |",
        "+------+-----+----------+--------------------------------+",
    ];
    let batches = run_query(Arc::clone(&db), "select * from cpu order by time").await;
    assert_batches_sorted_eq!(&expected, &batches);
}
#[tokio::test]
async fn table_wide_schema_enforcement() {
    // need a table with a partition template that uses a tag column, so that we can easily write to different partitions
    let test_db = TestDb::builder()
        .partition_template(PartitionTemplate {
            parts: vec![TemplatePart::Column("tag_partition_by".to_string())],
        })
        .build()
        .await;
    let db = test_db.db;

    // the very first write establishes the table schema
    try_write_lp(&db, "my_table,tag_partition_by=a field_integer=1 10")
        .await
        .unwrap();

    // later writes may extend the schema with additional columns
    try_write_lp(&db, "my_table,tag_partition_by=a field_string=\"foo\" 10")
        .await
        .unwrap();
    try_write_lp(&db, "my_table,tag_partition_by=b field_float=1.1 10")
        .await
        .unwrap();

    // verify that both partitions were created
    let mut keys = db.partition_keys().unwrap();
    keys.sort();
    assert_eq!(
        keys,
        vec![
            "tag_partition_by_a".to_string(),
            "tag_partition_by_b".to_string(),
        ]
    );

    // writes conflicting with the established column types are rejected, no matter
    // whether they target an existing partition (a, b) or a brand-new one (c)
    for &lp in &[
        "my_table,tag_partition_by=a field_integer=\"foo\" 10",
        "my_table,tag_partition_by=b field_integer=\"foo\" 10",
        "my_table,tag_partition_by=c field_integer=\"foo\" 10",
    ] {
        let err = try_write_lp(&db, lp).await.unwrap_err();
        assert_store_sequenced_entry_failures!(
            err,
            [super::Error::TableBatchSchemaMergeError { .. }]
        );
    }

    // drop every chunk from every partition
    for partition_key in db.partition_keys().unwrap() {
        let ids: Vec<_> = {
            let partition = db.partition("my_table", &partition_key).unwrap();
            let partition = partition.read();
            partition
                .chunks()
                .into_iter()
                .map(|chunk| chunk.read().id())
                .collect()
        };
        for id in ids {
            db.drop_chunk("my_table", &partition_key, id)
                .await
                .unwrap();
        }
    }

    // the table-wide schema survives even with all chunks gone
    let err = try_write_lp(&db, "my_table,tag_partition_by=a field_integer=\"foo\" 10")
        .await
        .unwrap_err();
    assert_store_sequenced_entry_failures!(
        err,
        [super::Error::TableBatchSchemaMergeError { .. }]
    );
}
#[tokio::test]
async fn drop_unpersisted_chunk_on_persisted_db() {
    // We don't support dropping unpersisted chunks from a persisted DB because we would forget the write buffer
    // progress (partition checkpoints are only created when new parquet files are stored).
    // See https://github.com/influxdata/influxdb_iox/issues/2291
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            persist: true,
            ..Default::default()
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    write_lp(db.as_ref(), "cpu bar=1 10").await;

    // exactly one (unpersisted) chunk exists
    let partition_key = "1970-01-01T00";
    let summaries = db.partition_chunk_summaries(partition_key);
    assert_eq!(summaries.len(), 1);

    // attempting to drop it must fail with the dedicated lifecycle error
    let err = db
        .drop_chunk("cpu", partition_key, summaries[0].id)
        .await
        .unwrap_err();
    assert!(matches!(
        err,
        Error::LifecycleError {
            source: super::lifecycle::Error::CannotDropUnpersistedChunk { .. }
        }
    ));
}
#[tokio::test]
async fn drop_unpersisted_partition_on_persisted_db() {
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            mub_row_threshold: NonZeroUsize::try_from(1).unwrap(),
            persist: true,
            ..Default::default()
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    write_lp(db.as_ref(), "cpu bar=1 10").await;
    write_lp(db.as_ref(), "cpu bar=2 20").await;

    let partition_key = "1970-01-01T00";

    // the row threshold of 1 forced each write into its own chunk
    assert_eq!(db.partition_chunk_summaries(partition_key).len(), 2);

    // We don't support dropping unpersisted chunks from a persisted DB because we would forget the write buffer
    // progress (partition checkpoints are only created when new parquet files are stored).
    // See https://github.com/influxdata/influxdb_iox/issues/2291
    let err = db.drop_partition("cpu", partition_key).await.unwrap_err();
    assert!(matches!(
        err,
        Error::LifecycleError {
            source: super::lifecycle::Error::CannotDropUnpersistedChunk { .. }
        }
    ));

    // after persisting the partition, dropping it succeeds
    db.persist_partition("cpu", partition_key, true)
        .await
        .unwrap();
    db.drop_partition("cpu", partition_key).await.unwrap();

    // nothing remains in the partition
    assert_eq!(db.partition_chunk_summaries(partition_key), vec![]);
}
#[tokio::test]
async fn query_after_drop_partition_on_persisted_db() {
    let test_db = TestDb::builder()
        .lifecycle_rules(LifecycleRules {
            late_arrive_window_seconds: NonZeroU32::try_from(1).unwrap(),
            mub_row_threshold: NonZeroUsize::try_from(1).unwrap(),
            persist: true,
            ..Default::default()
        })
        .build()
        .await;
    let db = Arc::new(test_db.db);

    write_lp(db.as_ref(), "cpu bar=1 10").await;
    write_lp(db.as_ref(), "cpu bar=2 20").await;

    let partition_key = "1970-01-01T00";
    db.persist_partition("cpu", partition_key, true)
        .await
        .unwrap();

    // the same system-table query is run before and after the drop
    let query = "select count(*) from system.columns";

    // before the drop, the persisted columns show up
    let batches = run_query(Arc::clone(&db), query).await;
    assert_batches_sorted_eq!(
        &[
            "+-----------------+",
            "| COUNT(UInt8(1)) |",
            "+-----------------+",
            "| 2 |",
            "+-----------------+",
        ],
        &batches
    );

    // Drop the partition (avoid data)
    db.drop_partition("cpu", partition_key).await.unwrap();

    // after the drop the query must still succeed (no error), just with zero rows
    let batches = run_query(Arc::clone(&db), query).await;
    assert_batches_sorted_eq!(
        &[
            "+-----------------+",
            "| COUNT(UInt8(1)) |",
            "+-----------------+",
            "| 0 |",
            "+-----------------+",
        ],
        &batches
    );
}
/// Writes a single row, compacts it into the read buffer, and persists the
/// resulting chunk to object store (while keeping it in the read buffer).
///
/// Returns `(table_name, partition_key, chunk_id)`; the returned chunk ID is
/// the one assigned during persistence (persistence changes the chunk ID).
async fn create_parquet_chunk(db: &Arc<Db>) -> (String, String, ChunkId) {
    write_lp(db, "cpu bar=1 10").await;
    let table_name = "cpu";
    let partition_key = "1970-01-01T00";

    // move the open MB chunk to a RB chunk and drop it from MB
    db.compact_open_chunk(table_name, partition_key)
        .await
        .unwrap();

    // write the RB chunk to object store but keep it in RB
    let persisted = db
        .persist_partition(table_name, partition_key, true)
        .await
        .unwrap()
        .unwrap();

    // chunk ID changed during persistence
    (
        table_name.to_string(),
        partition_key.to_string(),
        persisted.id(),
    )
}
/// Puts a zero-byte parquet file at `path`, simulating a stale/garbage file in
/// object storage that is not referenced by the preserved catalog.
async fn create_empty_file(iox_object_store: &IoxObjectStore, path: &ParquetFilePath) {
    iox_object_store
        .put_parquet_file(path, Bytes::new())
        .await
        .unwrap();
}
}