Merge pull request #5908 from influxdata/dom/ingester-integration-tests

test: ingester integration tests
pull/24376/head
kodiakhq[bot] 2022-10-19 12:43:23 +00:00 committed by GitHub
commit 66035ada48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 857 additions and 445 deletions

View File

@ -435,192 +435,20 @@ impl<T> Drop for IngestHandlerImpl<T> {
mod tests {
use std::{num::NonZeroU32, ops::DerefMut};
use data_types::{Namespace, NamespaceSchema, QueryPool, Sequence, SequenceNumber};
use data_types::{Namespace, NamespaceSchema, Sequence, SequenceNumber};
use dml::{DmlMeta, DmlWrite};
use iox_catalog::{mem::MemCatalog, validate_or_insert_schema};
use iox_time::Time;
use metric::{Attributes, Metric, U64Counter, U64Gauge};
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
use test_helpers::maybe_start_logging;
use write_buffer::mock::{MockBufferForReading, MockBufferSharedState};
use super::*;
use crate::data::{partition::SnapshotBatch, table::TableName};
/// End-to-end check: writes pushed into the mock write buffer are consumed by
/// the ingester, land in its in-memory buffer, and are reflected in the
/// ingester's metrics (op count, bytes read, sequence numbers, ingest ts).
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
let ingester = TestIngester::new().await;
// Build a namespace schema matching the test namespace so the writes below
// can be schema-validated against the catalog.
let schema = NamespaceSchema::new(
ingester.namespace.id,
ingester.topic.id,
ingester.query_pool.id,
100,
);
let mut txn = ingester.catalog.start_transaction().await.unwrap();
let ingest_ts1 = Time::from_timestamp_millis(42);
let ingest_ts2 = Time::from_timestamp_millis(1337);
// First write: sequence number 0, 50 bytes reported to the write buffer.
let w1 = DmlWrite::new(
"foo",
lines_to_batches("mem foo=1 10", 0).unwrap(),
Some("1970-01-01".into()),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(0)),
ingest_ts1,
None,
50,
),
);
// Validate/insert the write's schema into the catalog before enqueuing it.
let schema = validate_or_insert_schema(w1.tables(), &schema, txn.deref_mut())
.await
.unwrap()
.unwrap();
ingester.write_buffer_state.push_write(w1);
// Second write: deliberately non-contiguous sequence number (7), 150 bytes.
let w2 = DmlWrite::new(
"foo",
lines_to_batches("cpu bar=2 20\ncpu bar=3 30", 0).unwrap(),
Some("1970-01-01".into()),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(7)),
ingest_ts2,
None,
150,
),
);
let _schema = validate_or_insert_schema(w2.tables(), &schema, txn.deref_mut())
.await
.unwrap()
.unwrap();
ingester.write_buffer_state.push_write(w2);
// Third write: sequence number 9, to a new table "a"; this is the highest
// offset, used below to confirm the shard state advanced.
let w3 = DmlWrite::new(
"foo",
lines_to_batches("a b=2 200", 0).unwrap(),
Some("1970-01-01".into()),
DmlMeta::sequenced(
Sequence::new(ShardIndex::new(0), SequenceNumber::new(9)),
ingest_ts2,
None,
150,
),
);
let _schema = validate_or_insert_schema(w3.tables(), &schema, txn.deref_mut())
.await
.unwrap()
.unwrap();
ingester.write_buffer_state.push_write(w3);
txn.commit().await.unwrap();
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there from both writes.
tokio::time::timeout(Duration::from_secs(2), async {
let ns_name = ingester.namespace.name.into();
let table_name = TableName::from("a");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
if b.data.num_rows() > 0 {
has_measurement = true;
}
}
}
}
}
// and ensure that the shard state was actually updated
let shard = ingester
.catalog
.repositories()
.await
.shards()
.create_or_get(&ingester.topic, ingester.shard_index)
.await
.unwrap();
// Wait until both the data is visible AND the catalog's shard marker has
// advanced to the last write's sequence number (9).
if has_measurement
&& shard.min_unpersisted_sequence_number == SequenceNumber::new(9)
{
break;
}
tokio::time::sleep(Duration::from_millis(200)).await;
}
})
.await
.expect("timeout");
// All three ops should have been applied successfully.
let observation = ingester
.metrics
.get_instrument::<Metric<DurationHistogram>>("ingester_op_apply_duration")
.unwrap()
.get_observer(&Attributes::from(&[
("kafka_topic", "whatevs"),
("kafka_partition", "0"),
("result", "success"),
]))
.unwrap()
.fetch();
let hits = observation.buckets.iter().map(|b| b.count).sum::<u64>();
assert_eq!(hits, 3);
// Bytes read = 50 + 150 + 150 as reported in the DmlMeta of the writes.
let observation = ingester
.metrics
.get_instrument::<Metric<U64Counter>>("ingester_write_buffer_read_bytes")
.unwrap()
.get_observer(&Attributes::from(&[
("kafka_topic", "whatevs"),
("kafka_partition", "0"),
]))
.unwrap()
.fetch();
assert_eq!(observation, 350);
// Highest consumed sequence number is the last write's (9).
let observation = ingester
.metrics
.get_instrument::<Metric<U64Gauge>>("ingester_write_buffer_last_sequence_number")
.unwrap()
.get_observer(&Attributes::from(&[
("kafka_topic", "whatevs"),
("kafka_partition", "0"),
]))
.unwrap()
.fetch();
assert_eq!(observation, 9);
// Fully caught up: no lag behind the write buffer.
let observation = ingester
.metrics
.get_instrument::<Metric<U64Gauge>>("ingester_write_buffer_sequence_number_lag")
.unwrap()
.get_observer(&Attributes::from(&[
("kafka_topic", "whatevs"),
("kafka_partition", "0"),
]))
.unwrap()
.fetch();
assert_eq!(observation, 0);
// Last ingest timestamp matches the producer timestamp of the final write.
let observation = ingester
.metrics
.get_instrument::<Metric<U64Gauge>>("ingester_write_buffer_last_ingest_ts")
.unwrap()
.get_observer(&Attributes::from(&[
("kafka_topic", "whatevs"),
("kafka_partition", "0"),
]))
.unwrap()
.fetch();
assert_eq!(observation, ingest_ts2.timestamp_nanos() as u64);
}
#[tokio::test]
async fn test_shutdown() {
let ingester = TestIngester::new().await.ingester;
let (ingester, _, _) = ingester_test_setup(vec![], 0, true).await;
// does not exit w/o shutdown
tokio::select! {
@ -638,7 +466,7 @@ mod tests {
#[tokio::test]
#[should_panic(expected = "Background worker 'bad_task' exited early!")]
async fn test_join_task_early_shutdown() {
let mut ingester = TestIngester::new().await.ingester;
let (mut ingester, _, _) = ingester_test_setup(vec![], 0, true).await;
let shutdown_task = tokio::spawn(async {
// It does nothing! and stops.
@ -655,7 +483,7 @@ mod tests {
#[tokio::test]
#[should_panic(expected = "JoinError::Panic")]
async fn test_join_task_panic() {
let mut ingester = TestIngester::new().await.ingester;
let (mut ingester, _, _) = ingester_test_setup(vec![], 0, true).await;
let shutdown_task = tokio::spawn(async {
panic!("bananas");
@ -754,88 +582,6 @@ mod tests {
(ingester, shard, namespace)
}
/// Poll the ingester's in-memory buffer until the "cpu" table of `namespace`
/// (partition "1970-01-01") contains a snapshot with exactly one row, running
/// `custom_batch_verification` against the first snapshot batch on every poll.
///
/// # Panics
///
/// Panics if the data does not appear within 1 second, or if
/// `custom_batch_verification` panics.
async fn verify_ingester_buffer_has_data(
ingester: IngestHandlerImpl,
shard: Shard,
namespace: Namespace,
custom_batch_verification: impl Fn(&SnapshotBatch) + Send,
) {
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there
tokio::time::timeout(Duration::from_secs(1), async move {
let ns_name = namespace.name.into();
let table_name = TableName::from("cpu");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.data.shard(shard.id) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
// Run the caller's checks on every observed first batch,
// not only the final one.
custom_batch_verification(b);
if b.data.num_rows() == 1 {
has_measurement = true;
}
}
}
}
}
if has_measurement {
break;
}
// Not visible yet — back off briefly before polling again.
tokio::time::sleep(Duration::from_millis(200)).await;
}
})
.await
.expect("timeout");
}
/// On startup the ingester must seek the write buffer to the catalog's
/// `min_unpersisted` offset (2 here), not back to the beginning — so the
/// write at sequence number 1 must never show up in the buffer.
#[tokio::test]
async fn seeks_on_initialization() {
let ts_first = Time::from_timestamp_millis(42);
let ts_second = Time::from_timestamp_millis(1337);

// Two sequenced writes to the same table at offsets 1 and 2.
let make_write = |lp: &str, seq: i64, ts| {
    DmlWrite::new(
        "foo",
        lines_to_batches(lp, 0).unwrap(),
        Some("1970-01-01".into()),
        DmlMeta::sequenced(
            Sequence::new(ShardIndex::new(0), SequenceNumber::new(seq)),
            ts,
            None,
            150,
        ),
    )
};
let write_operations = vec![
    make_write("cpu bar=2 20", 1, ts_first),
    make_write("cpu bar=2 30", 2, ts_second),
];

// min_unpersisted = 2, skip-to-oldest disabled.
let (ingester, shard, namespace) = ingester_test_setup(write_operations, 2, false).await;

verify_ingester_buffer_has_data(ingester, shard, namespace, |first_batch| {
    if first_batch.min_sequence_number == SequenceNumber::new(1) {
        panic!(
            "initialization did a seek to the beginning rather than the min_unpersisted"
        );
    }
})
.await;
}
#[tokio::test]
#[should_panic(expected = "JoinError::Panic")]
async fn sequence_number_no_longer_exists() {
@ -912,41 +658,9 @@ mod tests {
.unwrap();
}
/// When the catalog offset points past the data available in the write
/// buffer and `skip_to_oldest_available` is enabled, the ingester must
/// recover by seeking to the oldest readable offset and ingest the data.
#[tokio::test]
async fn skip_to_oldest_available() {
maybe_start_logging();

// A single write sitting at sequence number 10.
let op = DmlWrite::new(
    "foo",
    lines_to_batches("cpu bar=2 20", 0).unwrap(),
    Some("1970-01-01".into()),
    DmlMeta::sequenced(
        Sequence::new(ShardIndex::new(0), SequenceNumber::new(10)),
        Time::from_timestamp_millis(42),
        None,
        150,
    ),
);

// Set the min unpersisted to something bigger than the write's sequence number to
// cause an UnknownSequenceNumber error. Skip to oldest available = true, so ingester
// should find data
let (ingester, shard, namespace) = ingester_test_setup(vec![op], 1, true).await;

verify_ingester_buffer_has_data(ingester, shard, namespace, |first_batch| {
    assert_eq!(
        first_batch.min_sequence_number,
        SequenceNumber::new(10),
        "re-initialization didn't seek to the beginning",
    );
})
.await;
}
#[tokio::test]
async fn limits_concurrent_queries() {
let mut ingester = TestIngester::new().await;
let (mut ingester, _, _) = ingester_test_setup(vec![], 0, true).await;
let request = IngesterQueryRequest {
namespace: "foo".to_string(),
table: "cpu".to_string(),
@ -954,93 +668,14 @@ mod tests {
predicate: None,
};
let res = ingester.ingester.query(request.clone()).await.unwrap_err();
let res = ingester.query(request.clone()).await.unwrap_err();
assert!(matches!(
res,
crate::querier_handler::Error::NamespaceNotFound { .. }
));
ingester.ingester.request_sem = Semaphore::new(0);
let res = ingester.ingester.query(request).await.unwrap_err();
ingester.request_sem = Semaphore::new(0);
let res = ingester.query(request).await.unwrap_err();
assert!(matches!(res, crate::querier_handler::Error::RequestLimit));
}
/// Bundles an [`IngestHandlerImpl`] together with the catalog entities and
/// mock write buffer it was constructed against, so tests can both drive the
/// ingester and inspect/extend its backing state.
struct TestIngester {
// Shared in-memory catalog backing the ingester.
catalog: Arc<dyn Catalog>,
shard: Shard,
namespace: Namespace,
topic: TopicMetadata,
shard_index: ShardIndex,
query_pool: QueryPool,
// Metric registry passed to the ingester; tests assert against it.
metrics: Arc<metric::Registry>,
// Handle to the mock write buffer; tests push writes through this.
write_buffer_state: MockBufferSharedState,
ingester: IngestHandlerImpl,
}
impl TestIngester {
/// Construct a fully wired test ingester: in-memory catalog with a
/// "whatevs" topic/query pool, a "foo" namespace, one shard at index 0,
/// a mock write buffer, an in-memory object store, and a running
/// [`IngestHandlerImpl`] on top of them.
async fn new() -> Self {
let metrics: Arc<metric::Registry> = Default::default();
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
// Seed the catalog with the entities the ingester expects to exist.
let mut txn = catalog.start_transaction().await.unwrap();
let topic = txn.topics().create_or_get("whatevs").await.unwrap();
let query_pool = txn.query_pools().create_or_get("whatevs").await.unwrap();
let shard_index = ShardIndex::new(0);
let namespace = txn
.namespaces()
.create("foo", "inf", topic.id, query_pool.id)
.await
.unwrap();
let shard = txn
.shards()
.create_or_get(&topic, shard_index)
.await
.unwrap();
txn.commit().await.unwrap();
let mut shard_states = BTreeMap::new();
shard_states.insert(shard_index, shard);
// Mock write buffer with a single shard; the shared state is kept so
// tests can push writes after construction.
let write_buffer_state =
MockBufferSharedState::empty_with_n_shards(NonZeroU32::try_from(1).unwrap());
let reading: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap());
let object_store = Arc::new(InMemory::new());
// NOTE(review): argument meanings assumed to follow LifecycleConfig::new's
// parameter order (pause_ingest_size, persist_memory_threshold, ...).
let lifecycle_config = LifecycleConfig::new(
1000000,
1000,
1000,
Duration::from_secs(10),
Duration::from_secs(10),
10000000,
);
let ingester = IngestHandlerImpl::new(
lifecycle_config,
topic.clone(),
shard_states,
Arc::clone(&catalog),
object_store,
reading,
Arc::new(Executor::new(1)),
Arc::clone(&metrics),
false,
1,
)
.await
.unwrap();
Self {
catalog,
shard,
namespace,
topic,
shard_index,
query_pool,
metrics,
write_buffer_state,
ingester,
}
}
}
}

View File

@ -201,7 +201,7 @@ pub struct LifecycleConfig {
impl LifecycleConfig {
/// Initialize a new LifecycleConfig. panics if the passed `pause_ingest_size` is less than the
/// `persist_memory_threshold`.
pub fn new(
pub const fn new(
pause_ingest_size: usize,
persist_memory_threshold: usize,
partition_size_threshold: usize,

View File

@ -2,15 +2,15 @@
use std::{pin::Pin, sync::Arc};
use arrow::{error::ArrowError, record_batch::RecordBatch};
use arrow::{array::new_null_array, error::ArrowError, record_batch::RecordBatch};
use arrow_util::optimize::{optimize_record_batch, optimize_schema};
use data_types::{PartitionId, SequenceNumber};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
use futures::{Stream, StreamExt};
use futures::{Stream, StreamExt, TryStreamExt};
use generated_types::ingester::IngesterQueryRequest;
use observability_deps::tracing::debug;
use schema::selection::Selection;
use schema::{merge::SchemaMerger, selection::Selection};
use snafu::{ensure, Snafu};
use crate::{
@ -168,10 +168,63 @@ impl IngesterQueryResponse {
})
.boxed()
}
/// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es.
///
/// If the response contains multiple snapshots, this will merge the schemas into a single one and create
/// NULL-columns for snapshots that miss columns.
///
/// # Panic
/// Panics if there are no batches returned at all. Also panics if the snapshot-scoped schemas do not line up with
/// the snapshot-scoped record batches.
pub async fn into_record_batches(self) -> Vec<RecordBatch> {
// Schema announced by the most recent StartSnapshot message; every
// RecordBatch message must match it exactly.
let mut snapshot_schema = None;
let mut schema_merger = SchemaMerger::new();
let mut batches = vec![];
let mut stream = self.flatten();
while let Some(msg) = stream.try_next().await.unwrap() {
match msg {
// Partition boundaries carry no data to accumulate.
FlatIngesterQueryResponse::StartPartition { .. } => (),
FlatIngesterQueryResponse::RecordBatch { batch } => {
// A batch before any StartSnapshot is a protocol violation.
let last_schema = snapshot_schema.as_ref().unwrap();
assert_eq!(&batch.schema(), last_schema);
batches.push(batch);
}
FlatIngesterQueryResponse::StartSnapshot { schema } => {
snapshot_schema = Some(Arc::clone(&schema));
// Fold each snapshot's schema into the union schema.
schema_merger = schema_merger
.merge(&schema::Schema::try_from(schema).unwrap())
.unwrap();
}
}
}
assert!(!batches.is_empty());
// equalize schemas: project every batch onto the merged schema, filling
// columns a batch lacks with all-NULL arrays of the right length.
let common_schema = schema_merger.build().as_arrow();
batches
.into_iter()
.map(|batch| {
let batch_schema = batch.schema();
let columns = common_schema
.fields()
.iter()
.map(|field| match batch_schema.index_of(field.name()) {
Ok(idx) => Arc::clone(batch.column(idx)),
Err(_) => new_null_array(field.data_type(), batch.num_rows()),
})
.collect();
RecordBatch::try_new(Arc::clone(&common_schema), columns).unwrap()
})
.collect()
}
}
/// Flattened version of [`IngesterQueryResponse`].
pub(crate) type FlatIngesterQueryResponseStream =
pub type FlatIngesterQueryResponseStream =
Pin<Box<dyn Stream<Item = Result<FlatIngesterQueryResponse, ArrowError>> + Send>>;
/// Element within the flat wire protocol.
@ -347,17 +400,15 @@ fn prepare_data_to_querier_for_partition(
mod tests {
use std::task::{Context, Poll};
use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch};
use arrow::{datatypes::SchemaRef, record_batch::RecordBatch};
use arrow_util::assert_batches_sorted_eq;
use assert_matches::assert_matches;
use datafusion::{
physical_plan::RecordBatchStream,
prelude::{col, lit},
};
use futures::TryStreamExt;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use predicate::Predicate;
use schema::merge::SchemaMerger;
use super::*;
use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE};
@ -490,8 +541,11 @@ mod tests {
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
let result = prepare_data_to_querier(scenario, &request)
.await
.unwrap()
.into_record_batches()
.await;
assert_batches_sorted_eq!(&expected, &result);
}
@ -523,8 +577,11 @@ mod tests {
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
let result = prepare_data_to_querier(scenario, &request)
.await
.unwrap()
.into_record_batches()
.await;
assert_batches_sorted_eq!(&expected, &result);
}
@ -565,8 +622,11 @@ mod tests {
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
let result = prepare_data_to_querier(scenario, &request)
.await
.unwrap()
.into_record_batches()
.await;
assert_batches_sorted_eq!(&expected, &result);
}
@ -640,60 +700,4 @@ mod tests {
/// Convert a line-protocol string into an Arrow [`RecordBatch`] covering all
/// columns of the resulting mutable batch.
fn lp_to_batch(lp: &str) -> RecordBatch {
    let (_table_name, mutable_batch) = lp_to_mutable_batch(lp);
    mutable_batch.to_arrow(Selection::All).unwrap()
}
/// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es.
///
/// If the response contains multiple snapshots, this will merge the schemas into a single one and create
/// NULL-columns for snapshots that miss columns. This makes it easier to use the resulting batches with
/// [`assert_batches_sorted_eq`].
///
/// # Panic
/// Panics if there are no batches returned at all. Also panics if the snapshot-scoped schemas do not line up with
/// the snapshot-scoped record batches.
async fn ingester_response_to_record_batches(
response: IngesterQueryResponse,
) -> Vec<RecordBatch> {
// Schema announced by the most recent StartSnapshot; each RecordBatch
// message must carry exactly this schema.
let mut snapshot_schema = None;
let mut schema_merger = SchemaMerger::new();
let mut batches = vec![];
let mut stream = response.flatten();
while let Some(msg) = stream.try_next().await.unwrap() {
match msg {
// Partition markers carry no row data.
FlatIngesterQueryResponse::StartPartition { .. } => (),
FlatIngesterQueryResponse::RecordBatch { batch } => {
// A batch before any StartSnapshot would be a protocol violation.
let last_schema = snapshot_schema.as_ref().unwrap();
assert_eq!(&batch.schema(), last_schema);
batches.push(batch);
}
FlatIngesterQueryResponse::StartSnapshot { schema } => {
snapshot_schema = Some(Arc::clone(&schema));
// Accumulate the union of all snapshot schemas.
schema_merger = schema_merger
.merge(&schema::Schema::try_from(schema).unwrap())
.unwrap();
}
}
}
assert!(!batches.is_empty());
// equalize schemas: re-project every batch onto the merged schema,
// substituting all-NULL arrays for columns a batch does not have.
let common_schema = schema_merger.build().as_arrow();
batches
.into_iter()
.map(|batch| {
let batch_schema = batch.schema();
let columns = common_schema
.fields()
.iter()
.map(|field| match batch_schema.index_of(field.name()) {
Ok(idx) => Arc::clone(batch.column(idx)),
Err(_) => new_null_array(field.data_type(), batch.num_rows()),
})
.collect();
RecordBatch::try_new(Arc::clone(&common_schema), columns).unwrap()
})
.collect()
}
}

View File

@ -279,7 +279,8 @@ where
shard_index=%self.shard_index,
shard_id=%self.shard_id,
potential_data_loss=true,
"reset stream"
"unable to read from desired sequence number offset \
- reset stream to oldest available data"
);
self.shard_reset_count.inc(1);
sequence_number_before_reset = Some(self.current_sequence_number);
@ -293,7 +294,8 @@ where
shard_index=%self.shard_index,
shard_id=%self.shard_id,
potential_data_loss=true,
"unable to read from desired sequence number offset"
"unable to read from desired sequence number offset \
- aborting ingest due to configuration"
);
self.shard_unknown_sequence_number_count.inc(1);
None

View File

@ -0,0 +1,350 @@
use std::{collections::HashMap, num::NonZeroU32, sync::Arc, time::Duration};
use data_types::{
Namespace, NamespaceSchema, PartitionKey, QueryPoolId, Sequence, SequenceNumber, ShardId,
ShardIndex, TopicId,
};
use dml::{DmlMeta, DmlWrite};
use generated_types::ingester::IngesterQueryRequest;
use ingester::{
handler::{IngestHandler, IngestHandlerImpl},
lifecycle::LifecycleConfig,
querier_handler::IngesterQueryResponse,
};
use iox_catalog::{interface::Catalog, mem::MemCatalog, validate_or_insert_schema};
use iox_query::exec::Executor;
use iox_time::TimeProvider;
use metric::{Attributes, Metric, MetricObserver};
use mutable_batch_lp::lines_to_batches;
use object_store::DynObjectStore;
use observability_deps::tracing::*;
use test_helpers::{maybe_start_logging, timeout::FutureTimeout};
use write_buffer::{
core::WriteBufferReading,
mock::{MockBufferForReading, MockBufferSharedState},
};
use write_summary::ShardProgress;
/// The byte size of 1 MiB.
const ONE_MIB: usize = 1024 * 1024;
/// The shard index used for the [`TestContext`].
pub const TEST_SHARD_INDEX: ShardIndex = ShardIndex::new(0);
/// The topic name used for tests.
pub const TEST_TOPIC_NAME: &str = "banana-topics";
/// The lifecycle configuration used for tests.
///
/// NOTE(review): argument order assumed to match `LifecycleConfig::new`
/// (pause_ingest_size, persist_memory_threshold, partition_size_threshold,
/// then two durations and a final numeric limit) — confirm against its
/// definition before changing values.
pub const TEST_LIFECYCLE_CONFIG: LifecycleConfig = LifecycleConfig::new(
ONE_MIB,
ONE_MIB / 10,
ONE_MIB / 10,
Duration::from_secs(10),
Duration::from_secs(10),
1_000,
);
/// Shared test harness: a running [`IngestHandlerImpl`] plus the in-memory
/// catalog, object store, mock write buffer and metric registry it was built
/// against, so integration tests can write, query, restart and assert.
pub struct TestContext {
ingester: IngestHandlerImpl,
// Catalog data initialised at construction time for later reuse.
query_id: QueryPoolId,
topic_id: TopicId,
shard_id: ShardId,
// A map of namespaces to schemas, also serving as the set of known
// namespaces.
namespaces: HashMap<String, NamespaceSchema>,
catalog: Arc<dyn Catalog>,
object_store: Arc<DynObjectStore>,
// Handle to the mock write buffer; writes are pushed through this.
write_buffer_state: MockBufferSharedState,
// Metric registry shared with the ingester; kept across restarts.
metrics: Arc<metric::Registry>,
}
impl TestContext {
/// Construct a new [`TestContext`] with a fresh in-memory catalog (topic,
/// query pool and shard pre-created), an empty mock write buffer, an
/// in-memory object store, and an initialised ingester on top.
pub async fn new() -> Self {
maybe_start_logging();
let metrics: Arc<metric::Registry> = Default::default();
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
// Initialise a topic, query pool and shard.
//
// Note that tests should set up their own namespace via
// ensure_namespace()
let mut txn = catalog.start_transaction().await.unwrap();
let topic = txn.topics().create_or_get(TEST_TOPIC_NAME).await.unwrap();
let query_id = txn
.query_pools()
.create_or_get("banana-query-pool")
.await
.unwrap()
.id;
let shard = txn
.shards()
.create_or_get(&topic, TEST_SHARD_INDEX)
.await
.unwrap();
txn.commit().await.unwrap();
// Mock in-memory write buffer.
let write_buffer_state =
MockBufferSharedState::empty_with_n_shards(NonZeroU32::try_from(1).unwrap());
let write_buffer_read: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap());
// Mock object store that persists in memory.
let object_store: Arc<DynObjectStore> = Arc::new(object_store::memory::InMemory::new());
let ingester = IngestHandlerImpl::new(
TEST_LIFECYCLE_CONFIG,
topic.clone(),
[(TEST_SHARD_INDEX, shard)].into_iter().collect(),
Arc::clone(&catalog),
Arc::clone(&object_store),
write_buffer_read,
Arc::new(Executor::new(1)),
Arc::clone(&metrics),
true,
1,
)
.await
.unwrap();
Self {
ingester,
query_id,
topic_id: topic.id,
shard_id: shard.id,
catalog,
object_store,
write_buffer_state,
metrics,
namespaces: Default::default(),
}
}
/// Restart the Ingester, driving initialisation again.
///
/// NOTE: metric contents are not reset.
pub async fn restart(&mut self) {
info!("restarting test context ingester");
// Re-attach to the SAME shared write buffer state so previously enqueued
// writes survive the restart.
let write_buffer_read: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(self.write_buffer_state.clone(), None).unwrap());
// Re-read topic and shard from the catalog so the new ingester observes
// any state updated by the test (e.g. persisted markers).
let topic = self
.catalog
.repositories()
.await
.topics()
.create_or_get(TEST_TOPIC_NAME)
.await
.unwrap();
let shard = self
.catalog
.repositories()
.await
.shards()
.create_or_get(&topic, TEST_SHARD_INDEX)
.await
.unwrap();
self.ingester = IngestHandlerImpl::new(
TEST_LIFECYCLE_CONFIG,
topic,
[(TEST_SHARD_INDEX, shard)].into_iter().collect(),
Arc::clone(&self.catalog),
Arc::clone(&self.object_store),
write_buffer_read,
Arc::new(Executor::new(1)),
Arc::clone(&self.metrics),
true,
1,
)
.await
.unwrap();
}
/// Create a namespace in the catalog for the ingester to discover.
///
/// # Panics
///
/// Must not be called twice with the same `name`.
#[track_caller]
pub async fn ensure_namespace(&mut self, name: &str) -> Namespace {
let ns = self
.catalog
.repositories()
.await
.namespaces()
.create(
name,
iox_catalog::INFINITE_RETENTION_POLICY,
self.topic_id,
self.query_id,
)
.await
.expect("failed to create test namespace");
// Track the namespace locally; insert() returning Some means the name
// was already known, which is a test bug.
assert!(
self.namespaces
.insert(
name.to_owned(),
NamespaceSchema::new(
ns.id,
self.topic_id,
self.query_id,
iox_catalog::DEFAULT_MAX_COLUMNS_PER_TABLE,
),
)
.is_none(),
"namespace must not be duplicated"
);
debug!(?ns, "test namespace created");
ns
}
/// Enqueue the specified `op` into the write buffer for the ingester to
/// consume.
///
/// This call takes care of validating the schema of `op` and populating the
/// catalog with any new schema elements.
///
/// # Panics
///
/// This method panics if the namespace for `op` does not exist, or the
/// schema is invalid or conflicts with the existing namespace schema.
#[track_caller]
pub async fn enqueue_write(&mut self, op: DmlWrite) -> SequenceNumber {
let schema = self
.namespaces
.get_mut(op.namespace())
.expect("namespace does not exist");
// Pull the sequence number out of the op to return it back to the user
// for simplicity.
let offset = op
.meta()
.sequence()
.expect("write must be sequenced")
.sequence_number;
// Perform schema validation, populating the catalog.
let mut repo = self.catalog.repositories().await;
if let Some(new) = validate_or_insert_schema(op.tables(), schema, repo.as_mut())
.await
.expect("failed schema validation for enqueuing write")
{
// Retain the updated schema.
debug!(?schema, "updated test context schema");
*schema = new;
}
// Push the write into the write buffer.
self.write_buffer_state.push_write(op);
debug!(?offset, "enqueued write in write buffer");
offset
}
/// A helper wrapper over [`Self::enqueue_write()`] for line-protocol.
#[track_caller]
pub async fn write_lp(
&mut self,
namespace: &str,
lp: &str,
partition_key: PartitionKey,
sequence_number: i64,
) -> SequenceNumber {
self.enqueue_write(DmlWrite::new(
namespace,
lines_to_batches(lp, 0).unwrap(),
Some(partition_key),
DmlMeta::sequenced(
Sequence::new(TEST_SHARD_INDEX, SequenceNumber::new(sequence_number)),
iox_time::SystemProvider::new().now(),
None,
50,
),
))
.await
}
/// Utilise the progress API to query for the current state of the test
/// shard.
pub async fn progress(&self) -> ShardProgress {
self.ingester
.progresses(vec![TEST_SHARD_INDEX])
.await
.get(&TEST_SHARD_INDEX)
.unwrap()
.clone()
}
/// Wait for the specified `offset` to be readable according to the external
/// progress API.
///
/// # Panics
///
/// This method panics if `offset` is not readable within 10 seconds.
pub async fn wait_for_readable(&self, offset: SequenceNumber) {
async {
loop {
let is_readable = self.progress().await.readable(offset);
if is_readable {
debug!(?offset, "offset reported as readable");
return;
}
trace!(?offset, "offset reported as not yet readable");
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
.with_timeout_panic(Duration::from_secs(10))
.await;
}
/// Submit a query to the ingester's public query interface.
pub async fn query(
&self,
req: IngesterQueryRequest,
) -> Result<IngesterQueryResponse, ingester::querier_handler::Error> {
self.ingester.query(req).await
}
/// Retrieve the specified metric value.
///
/// # Panics
///
/// Panics if the metric `name` does not exist, or has no observer for
/// `attrs`.
pub fn get_metric<T, A>(&self, name: &'static str, attrs: A) -> T::Recorder
where
T: MetricObserver,
A: Into<Attributes>,
{
let attrs = attrs.into();
self.metrics
.get_instrument::<Metric<T>>(name)
.unwrap_or_else(|| panic!("failed to find metric {}", name))
.get_observer(&attrs)
.unwrap_or_else(|| {
panic!(
"failed to find metric {} with attributes {:?}",
name, &attrs
)
})
.recorder()
}
/// Return a reference to the catalog.
pub fn catalog(&self) -> &dyn Catalog {
self.catalog.as_ref()
}
/// Return the [`ShardId`] of the test shard.
pub fn shard_id(&self) -> ShardId {
self.shard_id
}
}

421
ingester/tests/write.rs Normal file
View File

@ -0,0 +1,421 @@
mod common;
use arrow_util::assert_batches_sorted_eq;
use assert_matches::assert_matches;
pub use common::*;
use data_types::{Partition, PartitionKey, SequenceNumber};
use generated_types::ingester::IngesterQueryRequest;
use iox_time::{SystemProvider, TimeProvider};
use metric::{DurationHistogram, U64Counter, U64Gauge};
// Write data to an ingester through the write buffer interface, utilise the
// progress API to wait for it to become readable, and finally query the data
// and validate the contents.
/// Write three sequenced ops through the write buffer, wait for the last to
/// become readable, query the "bananas" table, and assert both the returned
/// rows and the ingester's write-buffer metrics.
#[tokio::test]
async fn test_write_query() {
let mut ctx = TestContext::new().await;
ctx.ensure_namespace("test_namespace").await;
// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
"test_namespace",
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;
// A subsequent write with a non-contiguous sequence number to a different table.
ctx.write_lp(
"test_namespace",
"cpu bar=2 20\ncpu bar=3 30",
partition_key.clone(),
7,
)
.await;
// And a third write that appends more data to the table in the initial
// write.
let offset = ctx
.write_lp(
"test_namespace",
"bananas count=42 200",
partition_key.clone(),
42,
)
.await;
// Block until the final offset (42) is reported readable.
ctx.wait_for_readable(offset).await;
// Perform a query to validate the actual data buffered.
let data = ctx
.query(IngesterQueryRequest {
namespace: "test_namespace".to_string(),
table: "bananas".to_string(),
columns: vec![],
predicate: None,
})
.await
.expect("query should succeed")
.into_record_batches()
.await;
let expected = vec![
"+-------+-----------+--------------------------------+",
"| count | greatness | time |",
"+-------+-----------+--------------------------------+",
"| | unbounded | 1970-01-01T00:00:00.000000010Z |",
"| 42 | | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
// Assert various ingest metrics.
// Three ops were enqueued above, all should have applied successfully.
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_op_apply_duration",
&[
("kafka_topic", TEST_TOPIC_NAME),
("kafka_partition", "0"),
("result", "success"),
],
)
.fetch();
assert_eq!(hist.sample_count(), 3);
// Each write_lp() reports 50 bytes, so 3 writes = 150.
let metric = ctx
.get_metric::<U64Counter, _>(
"ingester_write_buffer_read_bytes",
&[("kafka_topic", TEST_TOPIC_NAME), ("kafka_partition", "0")],
)
.fetch();
assert_eq!(metric, 150);
// Last consumed sequence number is the third write's (42).
let metric = ctx
.get_metric::<U64Gauge, _>(
"ingester_write_buffer_last_sequence_number",
&[("kafka_topic", TEST_TOPIC_NAME), ("kafka_partition", "0")],
)
.fetch();
assert_eq!(metric, 42);
// Fully caught up with the write buffer.
let metric = ctx
.get_metric::<U64Gauge, _>(
"ingester_write_buffer_sequence_number_lag",
&[("kafka_topic", TEST_TOPIC_NAME), ("kafka_partition", "0")],
)
.fetch();
assert_eq!(metric, 0);
// Writes are timestamped with the wall clock at enqueue time, so the
// recorded ingest ts must pre-date "now".
let metric = ctx
.get_metric::<U64Gauge, _>(
"ingester_write_buffer_last_ingest_ts",
&[("kafka_topic", TEST_TOPIC_NAME), ("kafka_partition", "0")],
)
.fetch();
let now = SystemProvider::new().now();
assert!(metric < now.timestamp_nanos() as _);
}
// Ensure an ingester correctly seeks to the offset stored in the catalog at
// startup, skipping any empty offsets.
/// After moving the catalog's min-unpersisted marker between two buffered
/// writes and restarting, the ingester must resume from the marker: the
/// second write is replayed, the first is not.
#[tokio::test]
async fn test_seek_on_init() {
let mut ctx = TestContext::new().await;
// Place some writes into the write buffer.
let partition_key = PartitionKey::from("1970-01-01");
ctx.ensure_namespace("test_namespace").await;
ctx.write_lp(
"test_namespace",
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;
// A subsequent write with a non-contiguous sequence number to a different
// table.
//
// Resuming will be configured against an offset in the middle of the two
// ranges.
let w2 = ctx
.write_lp(
"test_namespace",
"bananas greatness=\"amazing\",platanos=42 20",
partition_key.clone(),
7,
)
.await;
// Wait for the writes to be processed.
ctx.wait_for_readable(w2).await;
// Assert the data in memory.
// Before the restart both writes are visible.
let data = ctx
.query(IngesterQueryRequest {
namespace: "test_namespace".to_string(),
table: "bananas".to_string(),
columns: vec![],
predicate: None,
})
.await
.expect("query should succeed")
.into_record_batches()
.await;
let expected = vec![
"+-----------+----------+--------------------------------+",
"| greatness | platanos | time |",
"+-----------+----------+--------------------------------+",
"| amazing | 42 | 1970-01-01T00:00:00.000000020Z |",
"| unbounded | | 1970-01-01T00:00:00.000000010Z |",
"+-----------+----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
// Update the catalog state, causing the next boot of the ingester to seek
// past the first write, but before the second write.
ctx.catalog()
.repositories()
.await
.shards()
.update_min_unpersisted_sequence_number(ctx.shard_id(), SequenceNumber::new(3))
.await
.expect("failed to update persisted marker");
// Restart the ingester.
ctx.restart().await;
// Wait for the second write to become readable again.
ctx.wait_for_readable(w2).await;
// Assert the data in memory now contains only w2.
let data = ctx
.query(IngesterQueryRequest {
namespace: "test_namespace".to_string(),
table: "bananas".to_string(),
columns: vec![],
predicate: None,
})
.await
.expect("query should succeed")
.into_record_batches()
.await;
// Only the sequence-number-7 write survives the seek.
let expected = vec![
"+-----------+----------+--------------------------------+",
"| greatness | platanos | time |",
"+-----------+----------+--------------------------------+",
"| amazing | 42 | 1970-01-01T00:00:00.000000020Z |",
"+-----------+----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
}
// Ensure an ingester respects the per-partition persist watermark, skipping
// already applied ops.
#[tokio::test]
async fn test_skip_previously_applied_partition_ops() {
    let mut ctx = TestContext::new().await;

    // Create the namespace and push two writes for the same partition into
    // the write buffer, at sequence numbers 5 and 10 respectively.
    let ns = ctx.ensure_namespace("test_namespace").await;
    let partition_key = PartitionKey::from("1970-01-01");
    ctx.write_lp(
        "test_namespace",
        "bananas greatness=\"unbounded\" 10",
        partition_key.clone(),
        5,
    )
    .await;
    let second_write = ctx
        .write_lp(
            "test_namespace",
            "bananas greatness=\"amazing\",platanos=42 20",
            partition_key.clone(),
            10,
        )
        .await;

    // Block until both ops have been applied by the ingester.
    ctx.wait_for_readable(second_write).await;

    // Both rows must now be queryable.
    let both_rows = vec![
        "+-----------+----------+--------------------------------+",
        "| greatness | platanos | time                           |",
        "+-----------+----------+--------------------------------+",
        "| amazing   | 42       | 1970-01-01T00:00:00.000000020Z |",
        "| unbounded |          | 1970-01-01T00:00:00.000000010Z |",
        "+-----------+----------+--------------------------------+",
    ];
    let batches = ctx
        .query(IngesterQueryRequest {
            namespace: "test_namespace".to_string(),
            table: "bananas".to_string(),
            columns: vec![],
            predicate: None,
        })
        .await
        .expect("query should succeed")
        .into_record_batches()
        .await;
    assert_batches_sorted_eq!(&both_rows, &batches);

    // Look up the (single) partition the writes above landed in.
    let partitions = ctx
        .catalog()
        .repositories()
        .await
        .partitions()
        .list_by_namespace(ns.id)
        .await
        .unwrap();
    assert_matches!(&*partitions, &[Partition { .. }]);

    // Record a per-partition persist marker between the two writes - after
    // sequence number 5, but before 10.
    ctx.catalog()
        .repositories()
        .await
        .partitions()
        .update_persisted_sequence_number(partitions[0].id, SequenceNumber::new(6))
        .await
        .expect("failed to update persisted marker");

    // Restart the ingester, which shall seek to the shard offset of 0, and
    // begin replaying ops.
    ctx.restart().await;

    // Block until the second write becomes readable again.
    ctx.wait_for_readable(second_write).await;

    // Partition replay must have skipped the first (already persisted) write,
    // leaving only the second visible.
    let only_second = vec![
        "+-----------+----------+--------------------------------+",
        "| greatness | platanos | time                           |",
        "+-----------+----------+--------------------------------+",
        "| amazing   | 42       | 1970-01-01T00:00:00.000000020Z |",
        "+-----------+----------+--------------------------------+",
    ];
    let batches = ctx
        .query(IngesterQueryRequest {
            namespace: "test_namespace".to_string(),
            table: "bananas".to_string(),
            columns: vec![],
            predicate: None,
        })
        .await
        .expect("query should succeed")
        .into_record_batches()
        .await;
    assert_batches_sorted_eq!(&only_second, &batches);
}
// Ensure a seek beyond the actual data available (i.e. into the future) causes
// a panic to bring about a human response.
#[tokio::test]
#[should_panic = "attempted to seek to offset 42, but current high watermark for partition 0 is 0"]
async fn test_seek_beyond_available_data() {
    let mut ctx = TestContext::new().await;

    // Seed the write buffer with a single op (at offset 0) so it is not
    // empty.
    ctx.ensure_namespace("test_namespace").await;
    let partition_key = PartitionKey::from("1970-01-01");
    ctx.write_lp(
        "test_namespace",
        "bananas greatness=\"unbounded\" 10",
        partition_key,
        0,
    )
    .await;

    // Move the shard resume marker far past any offset that exists in the
    // write buffer, so the next boot attempts an invalid seek.
    ctx.catalog()
        .repositories()
        .await
        .shards()
        .update_min_unpersisted_sequence_number(ctx.shard_id(), SequenceNumber::new(42))
        .await
        .expect("failed to update persisted marker");

    // Restarting the ingester must panic while attempting the seek.
    ctx.restart().await;
}
// Ensure an ingester configured to resume from offset 1 correctly seeks to the
// oldest available data when that offset no longer exists.
#[tokio::test]
async fn test_seek_dropped_offset() {
    let mut ctx = TestContext::new().await;

    ctx.ensure_namespace("test_namespace").await;

    // Enqueue a single write at offset 42 - the only data in the buffer.
    let write = ctx
        .write_lp(
            "test_namespace",
            "bananas greatness=\"unbounded\" 10",
            PartitionKey::from("1970-01-01"),
            42,
        )
        .await;

    // Point the resume marker at offset 1, an offset that is no longer
    // available in the write buffer.
    ctx.catalog()
        .repositories()
        .await
        .shards()
        .update_min_unpersisted_sequence_number(ctx.shard_id(), SequenceNumber::new(1))
        .await
        .expect("failed to update persisted marker");

    // Booting must recover by seeking to the oldest retained offset.
    ctx.restart().await;

    // Block until the (sole) op has been applied.
    ctx.wait_for_readable(write).await;

    // The single write must be visible in memory.
    let batches = ctx
        .query(IngesterQueryRequest {
            namespace: "test_namespace".to_string(),
            table: "bananas".to_string(),
            columns: vec![],
            predicate: None,
        })
        .await
        .expect("query should succeed")
        .into_record_batches()
        .await;
    let want = vec![
        "+-----------+--------------------------------+",
        "| greatness | time                           |",
        "+-----------+--------------------------------+",
        "| unbounded | 1970-01-01T00:00:00.000000010Z |",
        "+-----------+--------------------------------+",
    ];
    assert_batches_sorted_eq!(&want, &batches);

    // Resetting to the oldest offset is a potential data loss event, and must
    // increment the alerting metric.
    let reset_count = ctx
        .get_metric::<U64Counter, _>(
            "shard_reset_count",
            &[
                ("kafka_topic", TEST_TOPIC_NAME),
                ("kafka_partition", "0"),
                ("potential_data_loss", "true"),
            ],
        )
        .fetch();
    assert!(reset_count > 0);
}