Merge branch 'main' into idpe-17789/provide-job-on-commit

pull/24376/head
wiedld 2023-07-31 08:20:45 -07:00 committed by GitHub
commit cc70a2c38b
85 changed files with 3291 additions and 1229 deletions

Cargo.lock generated

@ -699,7 +699,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05"
dependencies = [
"memchr",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"serde",
]
@ -763,6 +763,7 @@ dependencies = [
"pdatastructs",
"proptest",
"rand",
"test_helpers",
"tokio",
"tokio-util",
"trace",
@ -2662,6 +2663,7 @@ dependencies = [
"flatbuffers",
"futures",
"generated_types",
"gossip",
"hashbrown 0.14.0",
"influxdb_iox_client",
"ingester_query_grpc",
@ -3107,6 +3109,7 @@ dependencies = [
"authz",
"clap_blocks",
"data_types",
"gossip",
"hashbrown 0.14.0",
"hyper",
"iox_catalog",
@ -4575,7 +4578,7 @@ checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"regex-syntax 0.7.4",
]
@ -4590,9 +4593,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.3.3"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
checksum = "b7b6d6190b7594385f61bd3911cd1be99dfddcfc365a4160cc2ab5bff4aed294"
dependencies = [
"aho-corasick",
"memchr",
@ -4693,6 +4696,7 @@ dependencies = [
"flate2",
"futures",
"generated_types",
"gossip",
"hashbrown 0.14.0",
"hyper",
"influxdb-line-protocol",
@ -4906,18 +4910,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.177"
version = "1.0.179"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63ba2516aa6bf82e0b19ca8b50019d52df58455d3cf9bdaf6315225fdd0c560a"
checksum = "0a5bf42b8d227d4abf38a1ddb08602e229108a517cd4e5bb28f9c7eaafdce5c0"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.177"
version = "1.0.179"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "401797fe7833d72109fedec6bfcbe67c0eed9b99772f26eb8afd261f0abc6fd3"
checksum = "741e124f5485c7e60c03b043f79f320bff3527f4bbf12cf3831750dc46a0ec2c"
dependencies = [
"proc-macro2",
"quote",
@ -6265,6 +6269,7 @@ dependencies = [
"pin-project",
"sysinfo",
"tempfile",
"test_helpers",
"tokio",
"tokio-util",
"trace",
@ -6879,7 +6884,7 @@ dependencies = [
"rand",
"rand_core",
"regex",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"regex-syntax 0.7.4",
"reqwest",
"ring",


@ -23,6 +23,7 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
criterion = { version = "0.5", default-features = false, features = ["rayon"]}
proptest = { version = "1", default_features = false, features = ["std"] }
test_helpers = { path = "../test_helpers" }
[lib]
# Allow --save-baseline to work


@ -5,6 +5,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
use iox_time::{MockProvider, Time};
use parking_lot::Mutex;
use rand::rngs::mock::StepRng;
use test_helpers::maybe_start_logging;
use tokio::{runtime::Handle, sync::Notify};
use crate::{
@ -116,6 +117,7 @@ async fn test_refresh_does_not_update_lru_time() {
time_provider,
loader,
notify_idle,
pool,
..
} = TestStateLruAndRefresh::new();
@ -135,12 +137,14 @@ async fn test_refresh_does_not_update_lru_time() {
let barrier = loader.block_next(1, String::from("foo"));
backend.set(1, String::from("a"));
pool.wait_converged().await;
// trigger refresh
time_provider.inc(Duration::from_secs(1));
time_provider.inc(Duration::from_secs(1));
backend.set(2, String::from("b"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
@ -150,6 +154,7 @@ async fn test_refresh_does_not_update_lru_time() {
// add a third item to the cache, forcing LRU to evict one of the items
backend.set(3, String::from("c"));
pool.wait_converged().await;
// Should evict `1` even though it was refreshed after `2` was added
assert_eq!(backend.get(&1), None);
@ -192,6 +197,8 @@ async fn test_if_refresh_to_slow_then_expire() {
#[tokio::test]
async fn test_refresh_can_trigger_lru_eviction() {
maybe_start_logging();
let TestStateLRUAndRefresh {
mut backend,
refresh_duration_provider,
@ -224,13 +231,16 @@ async fn test_refresh_can_trigger_lru_eviction() {
backend.set(1, String::from("a"));
backend.set(2, String::from("c"));
backend.set(3, String::from("d"));
assert_eq!(backend.get(&1), Some(String::from("a")));
pool.wait_converged().await;
assert_eq!(backend.get(&2), Some(String::from("c")));
assert_eq!(backend.get(&3), Some(String::from("d")));
time_provider.inc(Duration::from_millis(1));
assert_eq!(backend.get(&1), Some(String::from("a")));
// refresh
time_provider.inc(Duration::from_secs(1));
time_provider.inc(Duration::from_secs(10));
notify_idle.notified_with_timeout().await;
pool.wait_converged().await;
// needed to evict 2->"c"
assert_eq!(backend.get(&1), Some(String::from("b")));
@ -285,6 +295,7 @@ async fn test_remove_if_check_does_not_extend_lifetime() {
size_estimator,
time_provider,
remove_if_handle,
pool,
..
} = TestStateLruAndRemoveIf::new().await;
@ -293,15 +304,18 @@ async fn test_remove_if_check_does_not_extend_lifetime() {
size_estimator.mock_size(3, String::from("c"), TestSize(4));
backend.set(1, String::from("a"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
backend.set(2, String::from("b"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
// Checking remove_if should not count as a "use" of 1
// for the "least recently used" calculation
remove_if_handle.remove_if(&1, |_| false);
backend.set(3, String::from("c"));
pool.wait_converged().await;
// adding "c" totals 12 size, but backend has room for only 10
// so "least recently used" (in this case 1, not 2) should be removed
@ -397,6 +411,7 @@ impl TestStateLRUAndRefresh {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -442,6 +457,7 @@ impl TestStateTtlAndLRU {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -465,6 +481,7 @@ struct TestStateLruAndRemoveIf {
time_provider: Arc<MockProvider>,
size_estimator: Arc<TestSizeEstimator>,
remove_if_handle: RemoveIfHandle<u8, String>,
pool: Arc<ResourcePool<TestSize>>,
}
impl TestStateLruAndRemoveIf {
@ -479,6 +496,7 @@ impl TestStateLruAndRemoveIf {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -495,6 +513,7 @@ impl TestStateLruAndRemoveIf {
time_provider,
size_estimator,
remove_if_handle,
pool,
}
}
}
@ -507,6 +526,7 @@ struct TestStateLruAndRefresh {
time_provider: Arc<MockProvider>,
loader: Arc<TestLoader<u8, (), String>>,
notify_idle: Arc<Notify>,
pool: Arc<ResourcePool<TestSize>>,
}
impl TestStateLruAndRefresh {
@ -537,6 +557,7 @@ impl TestStateLruAndRefresh {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -551,6 +572,7 @@ impl TestStateLruAndRefresh {
time_provider,
loader,
notify_idle,
pool,
}
}
}

File diff suppressed because it is too large


@ -393,7 +393,11 @@ where
/// structures while calling this function if you plan to also [subscribe](Subscriber) to
/// changes because this would easily lead to deadlocks.
pub fn execute_requests(&mut self, change_requests: Vec<ChangeRequest<'_, K, V>>) {
let inner = self.inner.upgrade().expect("backend gone");
let Some(inner) = self.inner.upgrade() else {
// backend gone, can happen during shutdowns, try not to panic
return;
};
lock_inner!(mut guard = inner);
perform_changes(&mut guard, change_requests);
}
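
For context, a minimal standalone sketch of the pattern this change adopts (all names below are hypothetical, not from this codebase): upgrading a `Weak` reference with `let ... else` turns a call that arrives after the backend has been dropped, e.g. during shutdown, into a silent no-op instead of a panic.

use std::sync::{Arc, Weak};

struct Backend;

struct Handle {
    inner: Weak<Backend>,
}

impl Handle {
    fn execute(&self) {
        // Backend already dropped (shutdown in progress): do nothing rather than panic.
        let Some(_inner) = self.inner.upgrade() else {
            return;
        };
        // ... apply change requests against `_inner` here ...
    }
}

fn main() {
    let backend = Arc::new(Backend);
    let handle = Handle { inner: Arc::downgrade(&backend) };
    handle.execute(); // backend alive: changes would be applied
    drop(backend);
    handle.execute(); // backend gone: no-op instead of a "backend gone" panic
}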


@ -9,7 +9,15 @@ use std::{
///
/// Can be used to represent in-RAM memory as well as on-disc memory.
pub trait Resource:
Add<Output = Self> + Copy + Debug + Into<u64> + PartialOrd + Send + Sub<Output = Self> + 'static
Add<Output = Self>
+ Copy
+ Debug
+ Into<u64>
+ PartialOrd
+ Send
+ Sync
+ Sub<Output = Self>
+ 'static
{
/// Create resource consumption of zero.
fn zero() -> Self;
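
As context for the added `Sync` bound (a sketch, not code from this change; `RamBytes` is a hypothetical type): requiring `Sync` lets resource values be shared between threads, which lines up with the pool now receiving a tokio runtime `Handle` in the tests above. The compile-time check below confirms a simple byte-counting newtype satisfies every listed supertrait.

use std::fmt::Debug;
use std::ops::{Add, Sub};

// Hypothetical byte-counting resource; `Send + Sync + 'static` hold automatically
// because it only contains a u64.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
struct RamBytes(u64);

impl Add for RamBytes {
    type Output = Self;
    fn add(self, rhs: Self) -> Self {
        Self(self.0 + rhs.0)
    }
}

impl Sub for RamBytes {
    type Output = Self;
    fn sub(self, rhs: Self) -> Self {
        Self(self.0.saturating_sub(rhs.0))
    }
}

impl From<RamBytes> for u64 {
    fn from(v: RamBytes) -> u64 {
        v.0
    }
}

// Compile-time check against the supertrait bounds listed above, including the new `Sync`.
fn assert_resource_bounds<T>()
where
    T: Add<Output = T> + Copy + Debug + Into<u64> + PartialOrd + Send + Sync + Sub<Output = T> + 'static,
{
}

fn main() {
    assert_resource_bounds::<RamBytes>();
}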

clap_blocks/src/gossip.rs Normal file

@ -0,0 +1,49 @@
//! CLI config for cluster gossip communication.
use crate::socket_addr::SocketAddr;
/// Configuration parameters for the cluster gossip communication mechanism.
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct GossipConfig {
/// A comma-delimited set of seed gossip peer addresses.
///
/// Example: "10.0.0.1:4242,10.0.0.2:4242"
///
/// These seeds will be used to discover all other peers that talk to the
/// same seeds. Typically all nodes in the cluster should use the same set
/// of seeds.
#[clap(
long = "gossip-seed-list",
env = "INFLUXDB_IOX_GOSSIP_SEED_LIST",
required = false,
num_args=1..,
value_delimiter = ',',
requires = "gossip_bind_address", // Field name, not flag
)]
pub seed_list: Vec<String>,
/// The UDP socket address IOx will use for gossip communication between
/// peers.
///
/// Example: "0.0.0.0:4242"
///
/// If not provided, the gossip sub-system is disabled.
#[clap(
long = "gossip-bind-address",
env = "INFLUXDB_IOX_GOSSIP_BIND_ADDR",
requires = "seed_list", // Field name, not flag
action
)]
pub gossip_bind_address: Option<SocketAddr>,
}
impl GossipConfig {
/// Initialise the gossip config to be disabled.
pub fn disabled() -> Self {
Self {
seed_list: vec![],
gossip_bind_address: None,
}
}
}
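
For a feel of how these mutually `requires`-linked flags behave, here is a hedged, standalone approximation (it uses `std::net::SocketAddr` in place of the crate's own `SocketAddr` wrapper, and `GossipishConfig` is a made-up name): both flags must be supplied together, and leaving out `--gossip-bind-address` keeps gossip disabled.

use std::net::SocketAddr;

use clap::Parser;

/// Stand-alone approximation of the gossip flags, for illustration only.
#[derive(Debug, Parser)]
struct GossipishConfig {
    /// Comma-delimited seed peers; requires the bind address to also be set.
    #[clap(
        long = "gossip-seed-list",
        required = false,
        num_args = 1..,
        value_delimiter = ',',
        requires = "gossip_bind_address", // Field name, not flag
    )]
    seed_list: Vec<String>,

    /// UDP bind address; if omitted, gossip stays disabled.
    #[clap(long = "gossip-bind-address", requires = "seed_list")]
    gossip_bind_address: Option<SocketAddr>,
}

fn main() {
    // Both flags together: accepted.
    let ok = GossipishConfig::try_parse_from([
        "prog",
        "--gossip-seed-list",
        "10.0.0.1:4242,10.0.0.2:4242",
        "--gossip-bind-address",
        "0.0.0.0:4242",
    ]);
    assert!(ok.is_ok());

    // Seed list without a bind address: rejected by the `requires` constraint.
    let err = GossipishConfig::try_parse_from(["prog", "--gossip-seed-list", "10.0.0.1:4242"]);
    assert!(err.is_err());
}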


@ -2,10 +2,16 @@
use std::path::PathBuf;
use crate::gossip::GossipConfig;
/// CLI config for the ingester using the RPC write path
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct IngesterConfig {
/// Gossip config.
#[clap(flatten)]
pub gossip_config: GossipConfig,
/// Where this ingester instance should store its write-ahead log files. Each ingester instance
/// must have its own directory.
#[clap(long = "wal-directory", env = "INFLUXDB_IOX_WAL_DIRECTORY", action)]


@ -22,6 +22,7 @@ pub mod catalog_dsn;
pub mod compactor;
pub mod compactor_scheduler;
pub mod garbage_collector;
pub mod gossip;
pub mod ingester;
pub mod ingester_address;
pub mod object_store;


@ -1,6 +1,7 @@
//! CLI config for the router using the RPC write path
use crate::{
gossip::GossipConfig,
ingester_address::IngesterAddress,
single_tenant::{
CONFIG_AUTHZ_ENV_NAME, CONFIG_AUTHZ_FLAG, CONFIG_CST_ENV_NAME, CONFIG_CST_FLAG,
@ -15,6 +16,10 @@ use std::{
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct RouterConfig {
/// Gossip config.
#[clap(flatten)]
pub gossip_config: GossipConfig,
/// Addr for connection to authz
#[clap(
long = CONFIG_AUTHZ_FLAG,


@ -171,7 +171,7 @@ fn to_queryable_parquet_chunk(
parquet_file_id = file.file.id.get(),
parquet_file_namespace_id = file.file.namespace_id.get(),
parquet_file_table_id = file.file.table_id.get(),
parquet_file_partition_id = file.file.partition_id.get(),
parquet_file_partition_id = %file.file.partition_id,
parquet_file_object_store_id = uuid.to_string().as_str(),
"built parquet chunk from metadata"
);


@ -70,8 +70,7 @@ impl ParquetFileSink for MockParquetFileSink {
let out = ((row_count > 0) || !self.filter_empty_files).then(|| ParquetFileParams {
namespace_id: partition.namespace_id,
table_id: partition.table.id,
partition_id: partition.partition_id,
partition_hash_id: partition.partition_hash_id.clone(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::from_u128(guard.len() as u128),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
@ -95,7 +94,7 @@ impl ParquetFileSink for MockParquetFileSink {
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use data_types::{NamespaceId, PartitionId, TableId};
use data_types::{NamespaceId, TableId};
use datafusion::{
arrow::{array::new_null_array, datatypes::DataType},
physical_plan::stream::RecordBatchStreamAdapter,
@ -159,7 +158,7 @@ mod tests {
Arc::clone(&schema),
futures::stream::once(async move { Ok(record_batch_captured) }),
));
let partition_hash_id = partition.partition_hash_id.clone();
let partition_id = partition.transition_partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
@ -167,8 +166,7 @@ mod tests {
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id: PartitionId::new(1),
partition_hash_id,
partition_id,
object_store_id: Uuid::from_u128(2),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
@ -223,7 +221,7 @@ mod tests {
Arc::clone(&schema),
futures::stream::empty(),
));
let partition_hash_id = partition.partition_hash_id.clone();
let partition_id = partition.transition_partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
@ -231,8 +229,7 @@ mod tests {
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id: PartitionId::new(1),
partition_hash_id,
partition_id,
object_store_id: Uuid::from_u128(0),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),


@ -1,19 +1,35 @@
use std::{collections::HashMap, fmt::Display};
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId};
use super::PartitionFilesSource;
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId, TransitionPartitionId};
#[derive(Debug)]
pub struct MockPartitionFilesSource {
files: HashMap<PartitionId, Vec<ParquetFile>>,
// This complexity is because we're in the process of moving to partition hash IDs rather than
// partition catalog IDs, and Parquet files might only have the partition hash ID on their
// record, but the compactor deals with partition catalog IDs because we haven't transitioned
// it yet. This should become simpler when the transition is complete.
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>>,
}
impl MockPartitionFilesSource {
#[allow(dead_code)] // not used anywhere
pub fn new(files: HashMap<PartitionId, Vec<ParquetFile>>) -> Self {
Self { files }
#[cfg(test)]
pub fn new(
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
parquet_files: Vec<ParquetFile>,
) -> Self {
let mut file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>> = HashMap::new();
for file in parquet_files {
let files = file_lookup.entry(file.partition_id.clone()).or_default();
files.push(file);
}
Self {
partition_lookup,
file_lookup,
}
}
}
@ -25,46 +41,60 @@ impl Display for MockPartitionFilesSource {
#[async_trait]
impl PartitionFilesSource for MockPartitionFilesSource {
async fn fetch(&self, partition: PartitionId) -> Vec<ParquetFile> {
self.files.get(&partition).cloned().unwrap_or_default()
async fn fetch(&self, partition_id: PartitionId) -> Vec<ParquetFile> {
self.partition_lookup
.get(&partition_id)
.and_then(|partition_hash_id| self.file_lookup.get(partition_hash_id).cloned())
.unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
use iox_tests::ParquetFileBuilder;
use super::*;
use iox_tests::{partition_identifier, ParquetFileBuilder};
#[test]
fn test_display() {
assert_eq!(
MockPartitionFilesSource::new(HashMap::default()).to_string(),
MockPartitionFilesSource::new(Default::default(), Default::default()).to_string(),
"mock",
)
}
#[tokio::test]
async fn test_fetch() {
let f_1_1 = ParquetFileBuilder::new(1).with_partition(1).build();
let f_1_2 = ParquetFileBuilder::new(2).with_partition(1).build();
let f_2_1 = ParquetFileBuilder::new(3).with_partition(2).build();
let partition_id_1 = PartitionId::new(1);
let partition_id_2 = PartitionId::new(2);
let partition_identifier_1 = partition_identifier(1);
let partition_identifier_2 = partition_identifier(2);
let f_1_1 = ParquetFileBuilder::new(1)
.with_partition(partition_identifier_1.clone())
.build();
let f_1_2 = ParquetFileBuilder::new(2)
.with_partition(partition_identifier_1.clone())
.build();
let f_2_1 = ParquetFileBuilder::new(3)
.with_partition(partition_identifier_2.clone())
.build();
let files = HashMap::from([
(PartitionId::new(1), vec![f_1_1.clone(), f_1_2.clone()]),
(PartitionId::new(2), vec![f_2_1.clone()]),
let partition_lookup = HashMap::from([
(partition_id_1, partition_identifier_1.clone()),
(partition_id_2, partition_identifier_2.clone()),
]);
let source = MockPartitionFilesSource::new(files);
let files = vec![f_1_1.clone(), f_1_2.clone(), f_2_1.clone()];
let source = MockPartitionFilesSource::new(partition_lookup, files);
// different partitions
assert_eq!(
source.fetch(PartitionId::new(1)).await,
source.fetch(partition_id_1).await,
vec![f_1_1.clone(), f_1_2.clone()],
);
assert_eq!(source.fetch(PartitionId::new(2)).await, vec![f_2_1],);
assert_eq!(source.fetch(partition_id_2).await, vec![f_2_1],);
// fetching does not drain
assert_eq!(source.fetch(PartitionId::new(1)).await, vec![f_1_1, f_1_2],);
assert_eq!(source.fetch(partition_id_1).await, vec![f_1_1, f_1_2],);
// unknown partition => empty result
assert_eq!(source.fetch(PartitionId::new(3)).await, vec![],);


@ -172,7 +172,11 @@ impl RoundInfoSource for LevelBasedRoundInfo {
_partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<RoundInfo, DynError> {
let start_level = get_start_level(files);
let start_level = get_start_level(
files,
self.max_num_files_per_plan,
self.max_total_file_size_per_plan,
);
if self.too_many_small_files_to_compact(files, start_level) {
return Ok(RoundInfo::ManySmallFiles {
@ -187,23 +191,53 @@ impl RoundInfoSource for LevelBasedRoundInfo {
}
}
fn get_start_level(files: &[ParquetFile]) -> CompactionLevel {
// get_start_level decides what level to start compaction from. Often this is the lowest level
// we have ParquetFiles in, but occasionally we decide to compact L1->L2 when L0s still exist.
//
// If we ignore the invariants about where intra-level overlaps are allowed, this would be a pure math
// problem of optimizing write amplification.
//
// However, allowing intra-level overlaps in L0 but not L1/L2 adds extra challenge to compacting L0s to L1.
// This is especially true when there are large quantities of overlapping L0s and L1s, potentially resulting
// in many split/compact cycles to resolve the overlaps.
//
// Since L1 & L2 only have inter-level overlaps, they can be compacted with just a few splits to align the L1s
// with the L2s. The relative ease of moving data from L1 to L2 provides additional motivation to compact the
// L1s to L2s when a backlog of L0s exists. The easily solvable L1->L2 compaction can give us a clean slate in
// L1, greatly simplifying the remaining L0->L1 compactions.
fn get_start_level(files: &[ParquetFile], max_files: usize, max_bytes: usize) -> CompactionLevel {
// panic if the files are empty
assert!(!files.is_empty());
// Start with initial level
// If there are files in this level, it is the start level
// Otherwise repeat until reaching the final level.
let mut level = CompactionLevel::Initial;
while level != CompactionLevel::Final {
if files.iter().any(|f| f.compaction_level == level) {
return level;
}
let mut l0_cnt: usize = 0;
let mut l0_bytes: usize = 0;
let mut l1_bytes: usize = 0;
level = level.next();
for f in files {
match f.compaction_level {
CompactionLevel::Initial => {
l0_cnt += 1;
l0_bytes += f.file_size_bytes as usize;
}
CompactionLevel::FileNonOverlapped => {
l1_bytes += f.file_size_bytes as usize;
}
_ => {}
}
}
level
if l1_bytes > 3 * max_bytes && (l0_cnt > max_files || l0_bytes > max_bytes) {
// L1 is big enough to pose an overlap challenge compacting from L0, and there is quite a bit more coming from L0.
// The criteria for this early L1->L2 compaction significantly impact write amplification. The above values optimize
// existing test cases, but may be changed as additional test cases are added.
CompactionLevel::FileNonOverlapped
} else if l0_bytes > 0 {
CompactionLevel::Initial
} else if l1_bytes > 0 {
CompactionLevel::FileNonOverlapped
} else {
CompactionLevel::Final
}
}
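
A rough worked example of the new start-level heuristic (a sketch under assumed per-plan limits; `starts_at_l1` is a hypothetical helper that restates only the branch condition above):

// Simplified restatement of the early L1->L2 condition, for illustration only;
// the numbers below are assumed limits, not values taken from the tests.
fn starts_at_l1(l0_cnt: usize, l0_bytes: usize, l1_bytes: usize, max_files: usize, max_bytes: usize) -> bool {
    l1_bytes > 3 * max_bytes && (l0_cnt > max_files || l0_bytes > max_bytes)
}

fn main() {
    const MB: usize = 1024 * 1024;
    // Assume limits of 20 files / 100 MB per plan.
    // 350 MB of L1 plus 30 backlogged L0s: compact L1->L2 first for a clean slate.
    assert!(starts_at_l1(30, 40 * MB, 350 * MB, 20, 100 * MB));
    // Same L1 size but only a handful of small L0s: start from L0 as usual.
    assert!(!starts_at_l1(5, 40 * MB, 350 * MB, 20, 100 * MB));
}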
fn get_num_overlapped_files(


@ -301,7 +301,26 @@ pub fn merge_small_l0_chains(
for chain in &chains {
let this_chain_bytes = chain.iter().map(|f| f.file_size_bytes as usize).sum();
if prior_chain_bytes > 0 && prior_chain_bytes + this_chain_bytes <= max_compact_size {
// matching max_l0_created_at times indicates that the files were deliberately split. We shouldn't merge
// chains with matching max_l0_created_at times, because that would encourage undoing the previous split,
// which at a minimum increases write amplification, and may cause unproductive split/compact loops.
let mut matches = 0;
if prior_chain_bytes > 0 {
for f in chain.iter() {
for f2 in &merged_chains[prior_chain_idx as usize] {
if f.max_l0_created_at == f2.max_l0_created_at {
matches += 1;
break;
}
}
}
}
// Merge it if: there is a prior chain to merge with, merging wouldn't make it too big, and it wouldn't undo a previous split
if prior_chain_bytes > 0
&& prior_chain_bytes + this_chain_bytes <= max_compact_size
&& matches == 0
{
// this chain can be added to the prior chain.
merged_chains[prior_chain_idx as usize].append(&mut chain.clone());
prior_chain_bytes += this_chain_bytes;


@ -68,8 +68,8 @@ async fn test_num_files_over_limit() {
assert_levels(
&files,
vec![
(8, CompactionLevel::FileNonOverlapped),
(9, CompactionLevel::FileNonOverlapped),
(10, CompactionLevel::FileNonOverlapped),
],
);
}


@ -746,97 +746,85 @@ async fn random_backfill_over_l2s() {
- "Committing partition 1:"
- " Soft Deleting 4 files: L0.76, L0.77, L0.79, L0.80"
- " Creating 8 files"
- "**** Simulation run 15, type=compact(ManySmallFiles). 10 Input Files, 200mb total:"
- "L0 "
- "L0.75[42,356] 1.04us 33mb|-----------L0.75-----------| "
- "L0.86[357,357] 1.04us 0b |L0.86| "
- "L0.87[358,670] 1.04us 33mb |-----------L0.87-----------| "
- "L0.84[671,672] 1.04us 109kb |L0.84| "
- "L0.85[673,986] 1.04us 33mb |-----------L0.85-----------| "
- "L0.78[42,356] 1.05us 33mb|-----------L0.78-----------| "
- "L0.90[357,357] 1.05us 0b |L0.90| "
- "L0.91[358,670] 1.05us 33mb |-----------L0.91-----------| "
- "L0.88[671,672] 1.05us 109kb |L0.88| "
- "L0.89[673,986] 1.05us 33mb |-----------L0.89-----------| "
- "**** 1 Output Files (parquet_file_id not yet assigned), 200mb total:"
- "L0, all files 200mb "
- "L0.?[42,986] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 10 files: L0.75, L0.78, L0.84, L0.85, L0.86, L0.87, L0.88, L0.89, L0.90, L0.91"
- " Creating 1 files"
- "**** Simulation run 16, type=split(HighL0OverlapSingleFile)(split_times=[670]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.82[358,672] 1.03us |-----------------------------------------L1.82------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[358,670] 1.03us 99mb|-----------------------------------------L1.?------------------------------------------| "
- "L1.?[671,672] 1.03us 651kb |L1.?|"
- "**** Simulation run 17, type=split(HighL0OverlapSingleFile)(split_times=[356]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.81[42,357] 1.03us |-----------------------------------------L1.81------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[42,356] 1.03us 100mb|-----------------------------------------L1.?------------------------------------------| "
- "L1.?[357,357] 1.03us 325kb |L1.?|"
- "**** Simulation run 18, type=split(HighL0OverlapSingleFile)(split_times=[356, 670]). 1 Input Files, 200mb total:"
- "L0, all files 200mb "
- "L0.92[42,986] 1.05us |-----------------------------------------L0.92------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 200mb total:"
- "L0 "
- "L0.?[42,356] 1.05us 67mb |-----------L0.?------------| "
- "L0.?[357,670] 1.05us 66mb |-----------L0.?------------| "
- "L0.?[671,986] 1.05us 67mb |------------L0.?------------| "
- "Committing partition 1:"
- " Soft Deleting 3 files: L1.81, L1.82, L0.92"
- " Creating 7 files"
- "**** Simulation run 19, type=split(ReduceOverlap)(split_times=[672]). 1 Input Files, 67mb total:"
- "**** Simulation run 15, type=compact(ManySmallFiles). 2 Input Files, 67mb total:"
- "L0, all files 33mb "
- "L0.75[42,356] 1.04us |-----------------------------------------L0.75------------------------------------------|"
- "L0.78[42,356] 1.05us |-----------------------------------------L0.78------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0, all files 67mb "
- "L0.99[671,986] 1.05us |-----------------------------------------L0.99------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0 "
- "L0.?[671,672] 1.05us 218kb|L0.?| "
- "L0.?[673,986] 1.05us 67mb|-----------------------------------------L0.?------------------------------------------| "
- "**** Simulation run 20, type=split(ReduceOverlap)(split_times=[357]). 1 Input Files, 66mb total:"
- "L0, all files 66mb "
- "L0.98[357,670] 1.05us |-----------------------------------------L0.98------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L0 "
- "L0.?[357,357] 1.05us 0b |L0.?| "
- "L0.?[358,670] 1.05us 66mb|-----------------------------------------L0.?------------------------------------------| "
- "L0.?[42,356] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.98, L0.99"
- " Creating 4 files"
- "**** Simulation run 21, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[232]). 4 Input Files, 167mb total:"
- " Soft Deleting 2 files: L0.75, L0.78"
- " Creating 1 files"
- "**** Simulation run 16, type=compact(ManySmallFiles). 2 Input Files, 66mb total:"
- "L0, all files 33mb "
- "L0.87[358,670] 1.04us |-----------------------------------------L0.87------------------------------------------|"
- "L0.91[358,670] 1.05us |-----------------------------------------L0.91------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L0, all files 66mb "
- "L0.?[358,670] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.87, L0.91"
- " Creating 1 files"
- "**** Simulation run 17, type=compact(ManySmallFiles). 2 Input Files, 218kb total:"
- "L0, all files 109kb "
- "L0.84[671,672] 1.04us |-----------------------------------------L0.84------------------------------------------|"
- "L0.88[671,672] 1.05us |-----------------------------------------L0.88------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 218kb total:"
- "L0, all files 218kb "
- "L0.?[671,672] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.84, L0.88"
- " Creating 1 files"
- "**** Simulation run 18, type=compact(ManySmallFiles). 2 Input Files, 67mb total:"
- "L0, all files 33mb "
- "L0.85[673,986] 1.04us |-----------------------------------------L0.85------------------------------------------|"
- "L0.89[673,986] 1.05us |-----------------------------------------L0.89------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0, all files 67mb "
- "L0.?[673,986] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.85, L0.89"
- " Creating 1 files"
- "**** Simulation run 19, type=compact(ManySmallFiles). 2 Input Files, 0b total:"
- "L0, all files 0b "
- "L0.86[357,357] 1.04us |-----------------------------------------L0.86------------------------------------------|"
- "L0.90[357,357] 1.05us |-----------------------------------------L0.90------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 0b total:"
- "L0, all files 0b "
- "L0.?[357,357] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.86, L0.90"
- " Creating 1 files"
- "**** Simulation run 20, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[232]). 3 Input Files, 167mb total:"
- "L0 "
- "L0.97[42,356] 1.05us 67mb|-----------------------------------------L0.97-----------------------------------------| "
- "L0.102[357,357] 1.05us 0b |L0.102|"
- "L0.92[42,356] 1.05us 67mb|-----------------------------------------L0.92-----------------------------------------| "
- "L0.96[357,357] 1.05us 0b |L0.96|"
- "L1 "
- "L1.95[42,356] 1.03us 100mb|-----------------------------------------L1.95-----------------------------------------| "
- "L1.96[357,357] 1.03us 325kb |L1.96|"
- "L1.81[42,357] 1.03us 100mb|-----------------------------------------L1.81------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 167mb total:"
- "L1 "
- "L1.?[42,232] 1.05us 101mb|------------------------L1.?------------------------| "
- "L1.?[233,357] 1.05us 66mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L1.95, L1.96, L0.97, L0.102"
- " Soft Deleting 3 files: L1.81, L0.92, L0.96"
- " Creating 2 files"
- "**** Simulation run 22, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[547]). 4 Input Files, 166mb total:"
- "**** Simulation run 21, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[547]). 3 Input Files, 166mb total:"
- "L0 "
- "L0.103[358,670] 1.05us 66mb|----------------------------------------L0.103-----------------------------------------| "
- "L0.100[671,672] 1.05us 218kb |L0.100|"
- "L0.93[358,670] 1.05us 66mb|-----------------------------------------L0.93-----------------------------------------| "
- "L0.94[671,672] 1.05us 218kb |L0.94|"
- "L1 "
- "L1.93[358,670] 1.03us 99mb|-----------------------------------------L1.93-----------------------------------------| "
- "L1.94[671,672] 1.03us 651kb |L1.94|"
- "L1.82[358,672] 1.03us 100mb|-----------------------------------------L1.82------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 166mb total:"
- "L1 "
- "L1.?[358,547] 1.05us 100mb|------------------------L1.?------------------------| "
- "L1.?[548,672] 1.05us 66mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L1.93, L1.94, L0.100, L0.103"
- " Soft Deleting 3 files: L1.82, L0.93, L0.94"
- " Creating 2 files"
- "**** Simulation run 23, type=split(CompactAndSplitOutput(TotalSizeLessThanMaxCompactSize))(split_times=[861]). 2 Input Files, 167mb total:"
- "**** Simulation run 22, type=split(CompactAndSplitOutput(TotalSizeLessThanMaxCompactSize))(split_times=[861]). 2 Input Files, 167mb total:"
- "L0 "
- "L0.101[673,986] 1.05us 67mb|-----------------------------------------L0.101-----------------------------------------|"
- "L0.95[673,986] 1.05us 67mb|-----------------------------------------L0.95------------------------------------------|"
- "L1 "
- "L1.83[673,986] 1.03us 100mb|-----------------------------------------L1.83------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 167mb total:"
@ -844,60 +832,60 @@ async fn random_backfill_over_l2s() {
- "L1.?[673,861] 1.05us 100mb|------------------------L1.?------------------------| "
- "L1.?[862,986] 1.05us 67mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 2 files: L1.83, L0.101"
- " Soft Deleting 2 files: L1.83, L0.95"
- " Creating 2 files"
- "**** Simulation run 24, type=split(ReduceOverlap)(split_times=[399, 499]). 1 Input Files, 100mb total:"
- "**** Simulation run 23, type=split(ReduceOverlap)(split_times=[399, 499]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.106[358,547] 1.05us |-----------------------------------------L1.106-----------------------------------------|"
- "L1.99[358,547] 1.05us |-----------------------------------------L1.99------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[358,399] 1.05us 22mb|------L1.?-------| "
- "L1.?[400,499] 1.05us 52mb |--------------------L1.?---------------------| "
- "L1.?[500,547] 1.05us 26mb |--------L1.?--------| "
- "**** Simulation run 25, type=split(ReduceOverlap)(split_times=[299]). 1 Input Files, 66mb total:"
- "**** Simulation run 24, type=split(ReduceOverlap)(split_times=[299]). 1 Input Files, 66mb total:"
- "L1, all files 66mb "
- "L1.105[233,357] 1.05us |-----------------------------------------L1.105-----------------------------------------|"
- "L1.98[233,357] 1.05us |-----------------------------------------L1.98------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L1 "
- "L1.?[233,299] 1.05us 35mb|--------------------L1.?---------------------| "
- "L1.?[300,357] 1.05us 31mb |-----------------L1.?------------------| "
- "**** Simulation run 26, type=split(ReduceOverlap)(split_times=[99, 199]). 1 Input Files, 101mb total:"
- "**** Simulation run 25, type=split(ReduceOverlap)(split_times=[99, 199]). 1 Input Files, 101mb total:"
- "L1, all files 101mb "
- "L1.104[42,232] 1.05us |-----------------------------------------L1.104-----------------------------------------|"
- "L1.97[42,232] 1.05us |-----------------------------------------L1.97------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 101mb total:"
- "L1 "
- "L1.?[42,99] 1.05us 30mb |----------L1.?-----------| "
- "L1.?[100,199] 1.05us 52mb |--------------------L1.?--------------------| "
- "L1.?[200,232] 1.05us 18mb |----L1.?-----| "
- "**** Simulation run 27, type=split(ReduceOverlap)(split_times=[599]). 1 Input Files, 66mb total:"
- "**** Simulation run 26, type=split(ReduceOverlap)(split_times=[599]). 1 Input Files, 66mb total:"
- "L1, all files 66mb "
- "L1.107[548,672] 1.05us |-----------------------------------------L1.107-----------------------------------------|"
- "L1.100[548,672] 1.05us |-----------------------------------------L1.100-----------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L1 "
- "L1.?[548,599] 1.05us 27mb|---------------L1.?----------------| "
- "L1.?[600,672] 1.05us 39mb |-----------------------L1.?-----------------------| "
- "**** Simulation run 28, type=split(ReduceOverlap)(split_times=[899]). 1 Input Files, 67mb total:"
- "**** Simulation run 27, type=split(ReduceOverlap)(split_times=[899]). 1 Input Files, 67mb total:"
- "L1, all files 67mb "
- "L1.109[862,986] 1.05us |-----------------------------------------L1.109-----------------------------------------|"
- "L1.102[862,986] 1.05us |-----------------------------------------L1.102-----------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L1 "
- "L1.?[862,899] 1.05us 20mb|----------L1.?----------| "
- "L1.?[900,986] 1.05us 47mb |----------------------------L1.?----------------------------| "
- "**** Simulation run 29, type=split(ReduceOverlap)(split_times=[699, 799]). 1 Input Files, 100mb total:"
- "**** Simulation run 28, type=split(ReduceOverlap)(split_times=[699, 799]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.108[673,861] 1.05us |-----------------------------------------L1.108-----------------------------------------|"
- "L1.101[673,861] 1.05us |-----------------------------------------L1.101-----------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[673,699] 1.05us 14mb|---L1.?---| "
- "L1.?[700,799] 1.05us 53mb |--------------------L1.?---------------------| "
- "L1.?[800,861] 1.05us 34mb |-----------L1.?------------| "
- "Committing partition 1:"
- " Soft Deleting 6 files: L1.104, L1.105, L1.106, L1.107, L1.108, L1.109"
- " Soft Deleting 6 files: L1.97, L1.98, L1.99, L1.100, L1.101, L1.102"
- " Creating 15 files"
- "**** Simulation run 30, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[71, 142]). 4 Input Files, 283mb total:"
- "**** Simulation run 29, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[71, 142]). 4 Input Files, 283mb total:"
- "L1 "
- "L1.115[42,99] 1.05us 30mb |--------L1.115---------| "
- "L1.116[100,199] 1.05us 52mb |------------------L1.116------------------| "
- "L1.108[42,99] 1.05us 30mb |--------L1.108---------| "
- "L1.109[100,199] 1.05us 52mb |------------------L1.109------------------| "
- "L2 "
- "L2.1[0,99] 99ns 100mb |-------------------L2.1-------------------| "
- "L2.2[100,199] 199ns 100mb |-------------------L2.2-------------------| "
@ -907,13 +895,13 @@ async fn random_backfill_over_l2s() {
- "L2.?[72,142] 1.05us 99mb |------------L2.?-------------| "
- "L2.?[143,199] 1.05us 82mb |---------L2.?----------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.1, L2.2, L1.115, L1.116"
- " Soft Deleting 4 files: L2.1, L2.2, L1.108, L1.109"
- " Creating 3 files"
- "**** Simulation run 31, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[271, 342]). 5 Input Files, 284mb total:"
- "**** Simulation run 30, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[271, 342]). 5 Input Files, 284mb total:"
- "L1 "
- "L1.117[200,232] 1.05us 18mb|---L1.117---| "
- "L1.113[233,299] 1.05us 35mb |----------L1.113-----------| "
- "L1.114[300,357] 1.05us 31mb |--------L1.114---------| "
- "L1.110[200,232] 1.05us 18mb|---L1.110---| "
- "L1.106[233,299] 1.05us 35mb |----------L1.106-----------| "
- "L1.107[300,357] 1.05us 31mb |--------L1.107---------| "
- "L2 "
- "L2.3[200,299] 299ns 100mb|-------------------L2.3-------------------| "
- "L2.4[300,399] 399ns 100mb |-------------------L2.4-------------------| "
@ -923,14 +911,14 @@ async fn random_backfill_over_l2s() {
- "L2.?[272,342] 1.05us 100mb |------------L2.?-------------| "
- "L2.?[343,399] 1.05us 83mb |---------L2.?----------| "
- "Committing partition 1:"
- " Soft Deleting 5 files: L2.3, L2.4, L1.113, L1.114, L1.117"
- " Soft Deleting 5 files: L2.3, L2.4, L1.106, L1.107, L1.110"
- " Creating 3 files"
- "**** Simulation run 32, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[404, 465]). 4 Input Files, 257mb total:"
- "**** Simulation run 31, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[404, 465]). 4 Input Files, 257mb total:"
- "L1 "
- "L1.110[358,399] 1.05us 22mb |-------L1.110--------| "
- "L1.111[400,499] 1.05us 52mb |------------------------L1.111-------------------------| "
- "L1.103[358,399] 1.05us 22mb |-------L1.103--------| "
- "L1.104[400,499] 1.05us 52mb |------------------------L1.104-------------------------| "
- "L2 "
- "L2.130[343,399] 1.05us 83mb|------------L2.130------------| "
- "L2.123[343,399] 1.05us 83mb|------------L2.123------------| "
- "L2.5[400,499] 499ns 100mb |-------------------------L2.5--------------------------| "
- "**** 3 Output Files (parquet_file_id not yet assigned), 257mb total:"
- "L2 "
@ -938,13 +926,13 @@ async fn random_backfill_over_l2s() {
- "L2.?[405,465] 1.05us 99mb |--------------L2.?--------------| "
- "L2.?[466,499] 1.05us 58mb |------L2.?-------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.5, L1.110, L1.111, L2.130"
- " Soft Deleting 4 files: L2.5, L1.103, L1.104, L2.123"
- " Creating 3 files"
- "**** Simulation run 33, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[569, 638]). 5 Input Files, 292mb total:"
- "**** Simulation run 32, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[569, 638]). 5 Input Files, 292mb total:"
- "L1 "
- "L1.112[500,547] 1.05us 26mb|------L1.112-------| "
- "L1.118[548,599] 1.05us 27mb |-------L1.118--------| "
- "L1.119[600,672] 1.05us 39mb |------------L1.119------------| "
- "L1.105[500,547] 1.05us 26mb|------L1.105-------| "
- "L1.111[548,599] 1.05us 27mb |-------L1.111--------| "
- "L1.112[600,672] 1.05us 39mb |------------L1.112------------| "
- "L2 "
- "L2.6[500,599] 599ns 100mb|-------------------L2.6-------------------| "
- "L2.7[600,699] 699ns 100mb |-------------------L2.7-------------------| "
@ -954,14 +942,14 @@ async fn random_backfill_over_l2s() {
- "L2.?[570,638] 1.05us 100mb |------------L2.?------------| "
- "L2.?[639,699] 1.05us 91mb |----------L2.?-----------| "
- "Committing partition 1:"
- " Soft Deleting 5 files: L2.6, L2.7, L1.112, L1.118, L1.119"
- " Soft Deleting 5 files: L2.6, L2.7, L1.105, L1.111, L1.112"
- " Creating 3 files"
- "**** Simulation run 34, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[702, 765]). 4 Input Files, 258mb total:"
- "**** Simulation run 33, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[702, 765]). 4 Input Files, 258mb total:"
- "L1 "
- "L1.122[673,699] 1.05us 14mb |---L1.122---| "
- "L1.123[700,799] 1.05us 53mb |-----------------------L1.123------------------------| "
- "L1.115[673,699] 1.05us 14mb |---L1.115---| "
- "L1.116[700,799] 1.05us 53mb |-----------------------L1.116------------------------| "
- "L2 "
- "L2.136[639,699] 1.05us 91mb|------------L2.136-------------| "
- "L2.129[639,699] 1.05us 91mb|------------L2.129-------------| "
- "L2.8[700,799] 799ns 100mb |------------------------L2.8-------------------------| "
- "**** 3 Output Files (parquet_file_id not yet assigned), 258mb total:"
- "L2 "
@ -969,12 +957,12 @@ async fn random_backfill_over_l2s() {
- "L2.?[703,765] 1.05us 100mb |--------------L2.?--------------| "
- "L2.?[766,799] 1.05us 56mb |------L2.?------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.8, L1.122, L1.123, L2.136"
- " Soft Deleting 4 files: L2.8, L1.115, L1.116, L2.129"
- " Creating 3 files"
- "**** Simulation run 35, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[865]). 3 Input Files, 154mb total:"
- "**** Simulation run 34, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[865]). 3 Input Files, 154mb total:"
- "L1 "
- "L1.124[800,861] 1.05us 34mb|-----------------------L1.124------------------------| "
- "L1.120[862,899] 1.05us 20mb |------------L1.120-------------| "
- "L1.117[800,861] 1.05us 34mb|-----------------------L1.117------------------------| "
- "L1.113[862,899] 1.05us 20mb |------------L1.113-------------| "
- "L2 "
- "L2.9[800,899] 899ns 100mb|-----------------------------------------L2.9------------------------------------------| "
- "**** 2 Output Files (parquet_file_id not yet assigned), 154mb total:"
@ -982,28 +970,28 @@ async fn random_backfill_over_l2s() {
- "L2.?[800,865] 1.05us 101mb|--------------------------L2.?---------------------------| "
- "L2.?[866,899] 1.05us 53mb |-----------L2.?------------| "
- "Committing partition 1:"
- " Soft Deleting 3 files: L2.9, L1.120, L1.124"
- " Soft Deleting 3 files: L2.9, L1.113, L1.117"
- " Creating 2 files"
- "**** Final Output Files (4.58gb written)"
- "**** Final Output Files (4.06gb written)"
- "L1 "
- "L1.121[900,986] 1.05us 47mb |L1.121| "
- "L1.114[900,986] 1.05us 47mb |L1.114| "
- "L2 "
- "L2.10[900,999] 999ns 100mb |L2.10-| "
- "L2.125[0,71] 1.05us 101mb|L2.125| "
- "L2.126[72,142] 1.05us 99mb |L2.126| "
- "L2.127[143,199] 1.05us 82mb |L2.127| "
- "L2.128[200,271] 1.05us 101mb |L2.128| "
- "L2.129[272,342] 1.05us 100mb |L2.129| "
- "L2.131[343,404] 1.05us 100mb |L2.131| "
- "L2.132[405,465] 1.05us 99mb |L2.132| "
- "L2.133[466,499] 1.05us 58mb |L2.133| "
- "L2.134[500,569] 1.05us 101mb |L2.134| "
- "L2.135[570,638] 1.05us 100mb |L2.135| "
- "L2.137[639,702] 1.05us 101mb |L2.137| "
- "L2.138[703,765] 1.05us 100mb |L2.138| "
- "L2.139[766,799] 1.05us 56mb |L2.139| "
- "L2.140[800,865] 1.05us 101mb |L2.140| "
- "L2.141[866,899] 1.05us 53mb |L2.141| "
- "L2.118[0,71] 1.05us 101mb|L2.118| "
- "L2.119[72,142] 1.05us 99mb |L2.119| "
- "L2.120[143,199] 1.05us 82mb |L2.120| "
- "L2.121[200,271] 1.05us 101mb |L2.121| "
- "L2.122[272,342] 1.05us 100mb |L2.122| "
- "L2.124[343,404] 1.05us 100mb |L2.124| "
- "L2.125[405,465] 1.05us 99mb |L2.125| "
- "L2.126[466,499] 1.05us 58mb |L2.126| "
- "L2.127[500,569] 1.05us 101mb |L2.127| "
- "L2.128[570,638] 1.05us 100mb |L2.128| "
- "L2.130[639,702] 1.05us 101mb |L2.130| "
- "L2.131[703,765] 1.05us 100mb |L2.131| "
- "L2.132[766,799] 1.05us 56mb |L2.132| "
- "L2.133[800,865] 1.05us 101mb |L2.133| "
- "L2.134[866,899] 1.05us 53mb |L2.134| "
"###
);
}
@ -3020,63 +3008,66 @@ async fn actual_case_from_catalog_1() {
- "WARNING: file L0.161[327,333] 336ns 183mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L0.162[330,338] 340ns 231mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L0.163[331,338] 341ns 232mb exceeds soft limit 100mb by more than 50%"
- "**** Final Output Files (17.64gb written)"
- "**** Final Output Files (15.47gb written)"
- "L2 "
- "L2.578[134,149] 342ns 202mb |L2.578| "
- "L2.579[150,165] 342ns 218mb |L2.579| "
- "L2.580[166,176] 342ns 186mb |L2.580| "
- "L2.581[177,182] 342ns 150mb |L2.581| "
- "L2.582[183,197] 342ns 267mb |L2.582| "
- "L2.583[198,207] 342ns 157mb |L2.583| "
- "L2.584[208,220] 342ns 147mb |L2.584| "
- "L2.585[221,232] 342ns 270mb |L2.585| "
- "L2.588[233,253] 342ns 286mb |L2.588| "
- "L2.589[254,270] 342ns 289mb |L2.589| "
- "L2.590[271,281] 342ns 225mb |L2.590| "
- "L2.591[282,296] 342ns 234mb |L2.591| "
- "L2.592[297,302] 342ns 232mb |L2.592| "
- "L2.593[303,308] 342ns 244mb |L2.593| "
- "L2.594[309,314] 342ns 282mb |L2.594|"
- "L2.595[315,317] 342ns 214mb |L2.595|"
- "L2.596[318,320] 342ns 222mb |L2.596|"
- "L2.597[321,323] 342ns 146mb |L2.597|"
- "L2.598[324,326] 342ns 254mb |L2.598|"
- "L2.599[327,329] 342ns 197mb |L2.599|"
- "L2.600[330,332] 342ns 228mb |L2.600|"
- "L2.601[333,335] 342ns 199mb |L2.601|"
- "L2.602[336,338] 342ns 280mb |L2.602|"
- "L2.850[1,26] 342ns 101mb |L2.850| "
- "L2.853[69,85] 342ns 104mb |L2.853| "
- "L2.854[86,98] 342ns 107mb |L2.854| "
- "L2.861[27,48] 342ns 103mb |L2.861| "
- "L2.862[49,68] 342ns 98mb |L2.862| "
- "L2.863[99,108] 342ns 102mb |L2.863| "
- "L2.864[109,117] 342ns 91mb |L2.864| "
- "L2.865[118,124] 342ns 91mb |L2.865| "
- "L2.866[125,130] 342ns 107mb |L2.866| "
- "L2.867[131,133] 342ns 64mb |L2.867| "
- "L2.868[339,339] 342ns 25mb |L2.868|"
- "WARNING: file L2.578[134,149] 342ns 202mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.579[150,165] 342ns 218mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.580[166,176] 342ns 186mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.581[177,182] 342ns 150mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.582[183,197] 342ns 267mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.583[198,207] 342ns 157mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.585[221,232] 342ns 270mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.588[233,253] 342ns 286mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.589[254,270] 342ns 289mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.590[271,281] 342ns 225mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.591[282,296] 342ns 234mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.592[297,302] 342ns 232mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.593[303,308] 342ns 244mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.594[309,314] 342ns 282mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.595[315,317] 342ns 214mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.596[318,320] 342ns 222mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.598[324,326] 342ns 254mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.599[327,329] 342ns 197mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.600[330,332] 342ns 228mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.601[333,335] 342ns 199mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.602[336,338] 342ns 280mb exceeds soft limit 100mb by more than 50%"
- "L2.594[150,165] 342ns 218mb |L2.594| "
- "L2.595[166,171] 342ns 118mb |L2.595| "
- "L2.598[183,197] 342ns 267mb |L2.598| "
- "L2.599[198,207] 342ns 157mb |L2.599| "
- "L2.600[208,220] 342ns 147mb |L2.600| "
- "L2.601[221,232] 342ns 270mb |L2.601| "
- "L2.602[233,244] 342ns 147mb |L2.602| "
- "L2.603[245,253] 342ns 139mb |L2.603| "
- "L2.604[271,276] 342ns 117mb |L2.604| "
- "L2.605[277,281] 342ns 109mb |L2.605| "
- "L2.612[254,261] 342ns 105mb |L2.612| "
- "L2.613[262,270] 342ns 184mb |L2.613| "
- "L2.616[309,311] 342ns 101mb |L2.616|"
- "L2.617[312,314] 342ns 181mb |L2.617|"
- "L2.618[315,317] 342ns 214mb |L2.618|"
- "L2.619[318,320] 342ns 222mb |L2.619|"
- "L2.620[321,323] 342ns 146mb |L2.620|"
- "L2.621[324,326] 342ns 254mb |L2.621|"
- "L2.622[327,329] 342ns 197mb |L2.622|"
- "L2.623[330,332] 342ns 228mb |L2.623|"
- "L2.624[333,335] 342ns 199mb |L2.624|"
- "L2.625[336,337] 342ns 156mb |L2.625|"
- "L2.626[338,338] 342ns 124mb |L2.626|"
- "L2.628[1,36] 342ns 103mb |L2.628-| "
- "L2.629[37,71] 342ns 103mb |L2.629-| "
- "L2.630[72,83] 342ns 103mb |L2.630| "
- "L2.638[172,177] 342ns 109mb |L2.638| "
- "L2.639[178,182] 342ns 109mb |L2.639| "
- "L2.640[282,288] 342ns 100mb |L2.640| "
- "L2.643[300,303] 342ns 110mb |L2.643| "
- "L2.646[84,94] 342ns 107mb |L2.646| "
- "L2.647[95,104] 342ns 97mb |L2.647| "
- "L2.648[105,111] 342ns 86mb |L2.648| "
- "L2.649[112,119] 342ns 114mb |L2.649| "
- "L2.650[120,126] 342ns 98mb |L2.650| "
- "L2.651[127,130] 342ns 82mb |L2.651| "
- "L2.652[131,138] 342ns 108mb |L2.652| "
- "L2.653[139,145] 342ns 93mb |L2.653| "
- "L2.654[146,149] 342ns 77mb |L2.654| "
- "L2.655[289,293] 342ns 110mb |L2.655| "
- "L2.656[294,297] 342ns 82mb |L2.656| "
- "L2.657[298,299] 342ns 82mb |L2.657| "
- "L2.658[304,306] 342ns 113mb |L2.658| "
- "L2.659[307,308] 342ns 113mb |L2.659| "
- "L2.660[339,339] 342ns 25mb |L2.660|"
- "WARNING: file L2.594[150,165] 342ns 218mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.598[183,197] 342ns 267mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.599[198,207] 342ns 157mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.601[221,232] 342ns 270mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.613[262,270] 342ns 184mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.617[312,314] 342ns 181mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.618[315,317] 342ns 214mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.619[318,320] 342ns 222mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.621[324,326] 342ns 254mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.622[327,329] 342ns 197mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.623[330,332] 342ns 228mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.624[333,335] 342ns 199mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.625[336,337] 342ns 156mb exceeds soft limit 100mb by more than 50%"
"###
);
}


@ -4670,17 +4670,17 @@ async fn l0s_almost_needing_vertical_split() {
- "L0.998[24,100] 1.02us |-----------------------------------------L0.998-----------------------------------------|"
- "L0.999[24,100] 1.02us |-----------------------------------------L0.999-----------------------------------------|"
- "L0.1000[24,100] 1.02us |----------------------------------------L0.1000-----------------------------------------|"
- "**** Final Output Files (6.5gb written)"
- "**** Final Output Files (5.23gb written)"
- "L2 "
- "L2.3141[24,37] 1.02us 108mb|---L2.3141---| "
- "L2.3150[38,49] 1.02us 102mb |--L2.3150--| "
- "L2.3151[50,60] 1.02us 93mb |-L2.3151-| "
- "L2.3152[61,63] 1.02us 37mb |L2.3152| "
- "L2.3153[64,73] 1.02us 101mb |L2.3153-| "
- "L2.3154[74,82] 1.02us 90mb |L2.3154| "
- "L2.3155[83,90] 1.02us 101mb |L2.3155| "
- "L2.3156[91,98] 1.02us 93mb |L2.3156| "
- "L2.3157[99,100] 1.02us 26mb |L2.3157|"
- "L2.3086[24,35] 1.02us 102mb|--L2.3086--| "
- "L2.3095[36,47] 1.02us 105mb |--L2.3095--| "
- "L2.3096[48,58] 1.02us 95mb |-L2.3096-| "
- "L2.3097[59,65] 1.02us 76mb |L2.3097| "
- "L2.3098[66,76] 1.02us 106mb |-L2.3098-| "
- "L2.3099[77,86] 1.02us 96mb |L2.3099-| "
- "L2.3100[87,90] 1.02us 53mb |L2.3100| "
- "L2.3101[91,98] 1.02us 90mb |L2.3101| "
- "L2.3102[99,100] 1.02us 26mb |L2.3102|"
"###
);
}

File diff suppressed because it is too large


@ -78,14 +78,12 @@ where
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use test_helpers::tracing::TracingCapture;
use super::*;
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use iox_tests::ParquetFileBuilder;
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
use std::sync::Arc;
use test_helpers::tracing::TracingCapture;
#[test]
fn test_display() {
@ -111,14 +109,21 @@ mod tests {
.with_row_count(105)
.build();
let created_1 = ParquetFileBuilder::new(1000).with_partition(1).build();
let created_2 = ParquetFileBuilder::new(1001).with_partition(1).build();
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let created_1 = ParquetFileBuilder::new(1000)
.with_partition(transition_partition_id_1.clone())
.build();
let created_2 = ParquetFileBuilder::new(1001)
.with_partition(transition_partition_id_1)
.build();
let capture = TracingCapture::new();
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone()],
&[],
&[created_1.clone().into(), created_2.clone().into()],
@ -130,9 +135,11 @@ mod tests {
Ok(res) if res == vec![ParquetFileId::new(1000), ParquetFileId::new(1001)]
);
let partition_id_2 = PartitionId::new(2);
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_2.clone(), existing_3.clone()],
&[existing_1.clone()],
&[],
@ -151,14 +158,14 @@ level = INFO; message = committed parquet file change; target_level = Final; par
inner.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1.clone()],
upgrade: vec![],
created: vec![created_1, created_2],
target_level: CompactionLevel::Final,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_2, existing_3],
upgrade: vec![existing_1],
created: vec![],


@ -303,15 +303,12 @@ where
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use metric::{assert_histogram, Attributes};
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use iox_tests::ParquetFileBuilder;
use super::*;
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
use metric::{assert_histogram, Attributes};
use std::sync::Arc;
#[test]
fn test_display() {
@ -326,6 +323,9 @@ mod tests {
let inner = Arc::new(MockCommit::new());
let commit = MetricsCommitWrapper::new(Arc::clone(&inner), &registry);
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let existing_1 = ParquetFileBuilder::new(1)
.with_file_size_bytes(10_001)
.with_row_count(1_001)
@ -350,7 +350,7 @@ mod tests {
let created = ParquetFileBuilder::new(1000)
.with_file_size_bytes(10_016)
.with_row_count(1_016)
.with_partition(1)
.with_partition(transition_partition_id_1)
.with_compaction_level(CompactionLevel::Initial)
.build();
@ -392,7 +392,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone()],
&[existing_2a.clone()],
&[created.clone().into()],
@ -401,9 +401,11 @@ mod tests {
.await;
assert_matches!(ids, Ok(res) if res == vec![ParquetFileId::new(1000)]);
let partition_id_2 = PartitionId::new(2);
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_2b.clone(), existing_3.clone()],
&[existing_4.clone()],
&[],
@ -449,14 +451,14 @@ mod tests {
inner.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1],
upgrade: vec![existing_2a.clone()],
created: vec![created],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_2b, existing_3],
upgrade: vec![existing_4],
created: vec![],

View File

@ -78,10 +78,9 @@ impl Commit for MockCommit {
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use iox_tests::ParquetFileBuilder;
use super::*;
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
#[test]
fn test_display() {
@ -92,6 +91,11 @@ mod tests {
async fn test_commit() {
let commit = MockCommit::new();
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let partition_id_2 = PartitionId::new(2);
let transition_partition_id_2 = partition_identifier(2);
let existing_1 = ParquetFileBuilder::new(1).build();
let existing_2 = ParquetFileBuilder::new(2).build();
let existing_3 = ParquetFileBuilder::new(3).build();
@ -101,14 +105,22 @@ mod tests {
let existing_7 = ParquetFileBuilder::new(7).build();
let existing_8 = ParquetFileBuilder::new(8).build();
let created_1_1 = ParquetFileBuilder::new(1000).with_partition(1).build();
let created_1_2 = ParquetFileBuilder::new(1001).with_partition(1).build();
let created_1_3 = ParquetFileBuilder::new(1003).with_partition(1).build();
let created_2_1 = ParquetFileBuilder::new(1002).with_partition(2).build();
let created_1_1 = ParquetFileBuilder::new(1000)
.with_partition(transition_partition_id_1.clone())
.build();
let created_1_2 = ParquetFileBuilder::new(1001)
.with_partition(transition_partition_id_1.clone())
.build();
let created_1_3 = ParquetFileBuilder::new(1003)
.with_partition(transition_partition_id_1)
.build();
let created_2_1 = ParquetFileBuilder::new(1002)
.with_partition(transition_partition_id_2)
.build();
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone(), existing_2.clone()],
&[existing_3.clone(), existing_4.clone()],
&[created_1_1.clone().into(), created_1_2.clone().into()],
@ -122,7 +134,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_3.clone()],
&[],
&[created_2_1.clone().into()],
@ -136,7 +148,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_5.clone(), existing_6.clone(), existing_7.clone()],
&[],
&[created_1_3.clone().into()],
@ -151,7 +163,7 @@ mod tests {
// simulate full implosion of the file (this may happen w/ delete predicates)
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_8.clone()],
&[],
&[],
@ -167,28 +179,28 @@ mod tests {
commit.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1, existing_2],
upgrade: vec![existing_3.clone(), existing_4.clone()],
created: vec![created_1_1, created_1_2],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_3],
upgrade: vec![],
created: vec![created_2_1],
target_level: CompactionLevel::Final,
},
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_5, existing_6, existing_7,],
upgrade: vec![],
created: vec![created_1_3],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_8],
upgrade: vec![],
created: vec![],

View File

@ -4,7 +4,7 @@ use assert_matches::assert_matches;
use compactor_scheduler::{
create_scheduler, CompactionJob, LocalSchedulerConfig, Scheduler, SchedulerConfig,
};
use data_types::{ColumnType, ParquetFile, ParquetFileParams, PartitionId};
use data_types::{ColumnType, ParquetFile, ParquetFileParams, PartitionId, TransitionPartitionId};
use iox_tests::{ParquetFileBuilder, TestCatalog, TestParquetFileBuilder, TestPartition};
mod end_job;
@ -65,7 +65,7 @@ impl TestLocalScheduler {
pub async fn create_params_for_new_parquet_file(&self) -> ParquetFileParams {
ParquetFileBuilder::new(42)
.with_partition(self.get_partition_id().get())
.with_partition(self.get_transition_partition_id())
.build()
.into()
}
@ -81,4 +81,8 @@ impl TestLocalScheduler {
pub fn get_partition_id(&self) -> PartitionId {
self.test_partition.partition.id
}
pub fn get_transition_partition_id(&self) -> TransitionPartitionId {
self.test_partition.partition.transition_partition_id()
}
}

View File

@ -202,8 +202,7 @@ impl SimulatedFile {
ParquetFileParams {
namespace_id: partition_info.namespace_id,
table_id: partition_info.table.id,
partition_id: partition_info.partition_id,
partition_hash_id: partition_info.partition_hash_id.clone(),
partition_id: partition_info.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time,
max_time,

View File

@ -527,10 +527,9 @@ pub struct ParquetFile {
pub namespace_id: NamespaceId,
/// the table
pub table_id: TableId,
/// the partition
pub partition_id: PartitionId,
/// the partition hash ID, if generated
pub partition_hash_id: Option<PartitionHashId>,
/// the partition identifier
#[sqlx(flatten)]
pub partition_id: TransitionPartitionId,
/// the uuid used in the object store path for this file
pub object_store_id: Uuid,
/// the min timestamp of data in this file
@ -588,7 +587,6 @@ impl ParquetFile {
namespace_id: params.namespace_id,
table_id: params.table_id,
partition_id: params.partition_id,
partition_hash_id: params.partition_hash_id,
object_store_id: params.object_store_id,
min_time: params.min_time,
max_time: params.max_time,
@ -602,21 +600,9 @@ impl ParquetFile {
}
}
/// If this parquet file params will be storing a `PartitionHashId` in the catalog, use that.
/// Otherwise, use the database-assigned `PartitionId`.
pub fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}
/// Estimate the memory consumption of this object and its contents
pub fn size(&self) -> usize {
std::mem::size_of_val(self)
+ self
.partition_hash_id
.as_ref()
.map(|id| id.size() - std::mem::size_of_val(id))
.unwrap_or_default()
+ self.column_set.size()
std::mem::size_of_val(self) + self.partition_id.size() + self.column_set.size()
- std::mem::size_of_val(&self.column_set)
}
@ -638,10 +624,8 @@ pub struct ParquetFileParams {
pub namespace_id: NamespaceId,
/// the table
pub table_id: TableId,
/// the partition
pub partition_id: PartitionId,
/// the partition hash ID, if generated
pub partition_hash_id: Option<PartitionHashId>,
/// the partition identifier
pub partition_id: TransitionPartitionId,
/// the uuid used in the object store path for this file
pub object_store_id: Uuid,
/// the min timestamp of data in this file
@ -662,21 +646,12 @@ pub struct ParquetFileParams {
pub max_l0_created_at: Timestamp,
}
impl ParquetFileParams {
/// If this parquet file params will be storing a `PartitionHashId` in the catalog, use that.
/// Otherwise, use the database-assigned `PartitionId`.
pub fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}
}
impl From<ParquetFile> for ParquetFileParams {
fn from(value: ParquetFile) -> Self {
Self {
namespace_id: value.namespace_id,
table_id: value.table_id,
partition_id: value.partition_id,
partition_hash_id: value.partition_hash_id,
object_store_id: value.object_store_id,
min_time: value.min_time,
max_time: value.max_time,

View File

@ -31,6 +31,34 @@ impl TransitionPartitionId {
}
}
impl<'a, R> sqlx::FromRow<'a, R> for TransitionPartitionId
where
R: sqlx::Row,
&'static str: sqlx::ColumnIndex<R>,
PartitionId: sqlx::decode::Decode<'a, R::Database>,
PartitionId: sqlx::types::Type<R::Database>,
Option<PartitionHashId>: sqlx::decode::Decode<'a, R::Database>,
Option<PartitionHashId>: sqlx::types::Type<R::Database>,
{
fn from_row(row: &'a R) -> sqlx::Result<Self> {
let partition_id: Option<PartitionId> = row.try_get("partition_id")?;
let partition_hash_id: Option<PartitionHashId> = row.try_get("partition_hash_id")?;
let transition_partition_id = match (partition_id, partition_hash_id) {
(_, Some(hash_id)) => TransitionPartitionId::Deterministic(hash_id),
(Some(id), _) => TransitionPartitionId::Deprecated(id),
(None, None) => {
return Err(sqlx::Error::ColumnDecode {
index: "partition_id".into(),
source: "Both partition_id and partition_hash_id were NULL".into(),
})
}
};
Ok(transition_partition_id)
}
}
impl From<(PartitionId, Option<&PartitionHashId>)> for TransitionPartitionId {
fn from((partition_id, partition_hash_id): (PartitionId, Option<&PartitionHashId>)) -> Self {
partition_hash_id

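The sketch below (not part of the diff) illustrates how the `From<(PartitionId, Option<&PartitionHashId>)>` conversion above resolves the two identifier forms, which is also what the new `#[sqlx(flatten)]` decoding of `ParquetFile::partition_id` yields; the `PartitionHashId::new(table_id, &partition_key)` constructor is assumed from its use elsewhere in this change.

use data_types::{PartitionHashId, PartitionId, PartitionKey, TableId, TransitionPartitionId};

fn transition_id_examples() {
    let hash_id = PartitionHashId::new(TableId::new(2), &PartitionKey::from("arbitrary"));

    // With a hash ID present, the deterministic identifier is chosen.
    let id = TransitionPartitionId::from((PartitionId::new(1), Some(&hash_id)));
    assert!(matches!(id, TransitionPartitionId::Deterministic(_)));

    // Without one, the deprecated catalog-assigned ID is used.
    let id = TransitionPartitionId::from((PartitionId::new(1), None::<&PartitionHashId>));
    assert!(matches!(id, TransitionPartitionId::Deprecated(_)));
}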
View File

@ -267,8 +267,7 @@ mod tests {
let parquet_file_params = ParquetFileParams {
namespace_id: namespace.id,
table_id: partition.table_id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),
@ -298,7 +297,7 @@ mod tests {
let location = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();
@ -376,7 +375,7 @@ mod tests {
let location = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();
@ -469,7 +468,7 @@ mod tests {
let loc = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();

View File

@ -52,6 +52,7 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
let proto_files = vec![
authz_path.join("authz.proto"),
catalog_path.join("parquet_file.proto"),
catalog_path.join("partition_identifier.proto"),
catalog_path.join("service.proto"),
compactor_path.join("service.proto"),
delete_path.join("service.proto"),

View File

@ -2,6 +2,8 @@ syntax = "proto3";
package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
import "influxdata/iox/catalog/v1/partition_identifier.proto";
message ParquetFile {
reserved 7;
reserved "min_sequence_number";
@ -11,6 +13,8 @@ message ParquetFile {
reserved "shard_id";
reserved 8;
reserved "max_sequence_number";
reserved 5;
reserved "partition_id";
// the id of the file in the catalog
int64 id = 1;
@ -18,8 +22,9 @@ message ParquetFile {
int64 namespace_id = 3;
// the table id
int64 table_id = 4;
// the partition id
int64 partition_id = 5;
PartitionIdentifier partition_identifier = 19;
// the object store uuid
string object_store_id = 6;
// the min timestamp of data in this file

View File

@ -0,0 +1,12 @@
syntax = "proto3";
package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
message PartitionIdentifier {
// Either the catalog-assigned partition ID or the deterministic identifier created from the
// table ID and partition key.
oneof id {
int64 catalog_id = 1;
bytes hash_id = 2;
}
}
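As a hedged illustration, the prost-generated Rust type for this message carries one of the two oneof variants; the module path and the `Vec<u8>` mapping for `bytes` are assumptions based on how the generated types are used elsewhere in this change, and the hash bytes are placeholders (a real hash ID is a 32-byte value, such as the base64 string asserted in the end-to-end tests below).

use generated_types::influxdata::iox::catalog::v1::{partition_identifier, PartitionIdentifier};

fn partition_identifier_examples() -> (PartitionIdentifier, PartitionIdentifier) {
    // Deprecated catalog-assigned identifier.
    let by_catalog_id = PartitionIdentifier {
        id: Some(partition_identifier::Id::CatalogId(42)),
    };
    // Deterministic identifier derived from the table ID and partition key
    // (placeholder bytes for illustration only).
    let by_hash_id = PartitionIdentifier {
        id: Some(partition_identifier::Id::HashId(vec![0xb8, 0x62, 0xa7])),
    };
    (by_catalog_id, by_hash_id)
}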

View File

@ -3,6 +3,7 @@ package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
import "influxdata/iox/catalog/v1/parquet_file.proto";
import "influxdata/iox/catalog/v1/partition_identifier.proto";
service CatalogService {
// Get the parquet_file catalog records in the given partition
@ -19,8 +20,11 @@ service CatalogService {
}
message GetParquetFilesByPartitionIdRequest {
// the partition id
int64 partition_id = 1;
// Was the catalog-assigned partition ID.
reserved 1;
reserved "partition_id";
PartitionIdentifier partition_identifier = 2;
}
message GetParquetFilesByPartitionIdResponse {
@ -35,15 +39,17 @@ message Partition {
reserved "sequencer_id";
reserved 7;
reserved "shard_id";
reserved 1;
reserved "id";
// the partition id
int64 id = 1;
// the table id the partition is in
int64 table_id = 3;
// the partition key
string key = 4;
// the sort key for data in parquet files in the partition
repeated string array_sort_key = 6;
PartitionIdentifier identifier = 8;
}
message GetPartitionsByTableIdRequest {

View File

@ -1,5 +1,5 @@
use async_trait::async_trait;
use tracing::warn;
use tracing::{debug, warn};
// Re-export the bytes type to ensure upstream users of this crate are
// interacting with the same type.
@ -32,5 +32,7 @@ pub struct NopDispatcher;
#[async_trait::async_trait]
impl Dispatcher for NopDispatcher {
async fn dispatch(&self, _payload: crate::Bytes) {}
async fn dispatch(&self, _payload: crate::Bytes) {
debug!("received no-op message payload");
}
}

View File

@ -1,10 +1,13 @@
use data_types::{PartitionHashId, PartitionId, TransitionPartitionId};
use futures_util::TryStreamExt;
use influxdb_iox_client::{
catalog::{self, generated_types::ParquetFile},
catalog::{
self,
generated_types::{partition_identifier, ParquetFile, PartitionIdentifier},
},
connection::Connection,
store,
};
use observability_deps::tracing::{debug, info};
use std::path::{Path, PathBuf};
use thiserror::Error;
use tokio::{
@ -35,10 +38,6 @@ type Result<T, E = ExportError> = std::result::Result<T, E>;
pub struct RemoteExporter {
catalog_client: catalog::Client,
store_client: store::Client,
/// Optional partition filter. If `Some(partition_id)`, only these
/// files with that `partition_id` are downloaded.
partition_filter: Option<i64>,
}
impl RemoteExporter {
@ -46,19 +45,9 @@ impl RemoteExporter {
Self {
catalog_client: catalog::Client::new(connection.clone()),
store_client: store::Client::new(connection),
partition_filter: None,
}
}
/// Specify that only files and metadata for the specific
/// partition id should be exported.
pub fn with_partition_filter(mut self, partition_id: i64) -> Self {
info!(partition_id, "Filtering by partition");
self.partition_filter = Some(partition_id);
self
}
/// Exports all data and metadata for `table_name` in
/// `namespace` to local files.
///
@ -95,39 +84,14 @@ impl RemoteExporter {
let indexed_parquet_file_metadata = parquet_files.into_iter().enumerate();
for (index, parquet_file) in indexed_parquet_file_metadata {
if self.should_export(parquet_file.partition_id) {
self.export_parquet_file(
&output_directory,
index,
num_parquet_files,
&parquet_file,
)
self.export_parquet_file(&output_directory, index, num_parquet_files, &parquet_file)
.await?;
} else {
debug!(
"skipping file {} of {num_parquet_files} ({} does not match request)",
index + 1,
parquet_file.partition_id
);
}
}
println!("Done.");
Ok(())
}
/// Return true if this partition should be exported
fn should_export(&self, partition_id: i64) -> bool {
self.partition_filter
.map(|partition_filter| {
// if a partition filter was specified, only export
// the file if the partition matches
partition_filter == partition_id
})
// export files if there is no partition
.unwrap_or(true)
}
/// Exports table and partition information for the specified
/// table. Overwrites existing files, if any, to ensure it has the
/// latest catalog information.
@ -158,13 +122,11 @@ impl RemoteExporter {
.await?;
for partition in partitions {
let partition_id = partition.id;
if self.should_export(partition_id) {
let partition_json = serde_json::to_string_pretty(&partition)?;
let filename = format!("partition.{partition_id}.json");
let file_path = output_directory.join(&filename);
write_string_to_file(&partition_json, &file_path).await?;
}
let partition_id = to_partition_id(partition.identifier.as_ref());
let partition_json = serde_json::to_string_pretty(&partition)?;
let filename = format!("partition.{partition_id}.json");
let file_path = output_directory.join(&filename);
write_string_to_file(&partition_json, &file_path).await?;
}
Ok(())
@ -183,9 +145,10 @@ impl RemoteExporter {
parquet_file: &ParquetFile,
) -> Result<()> {
let uuid = &parquet_file.object_store_id;
let partition_id = parquet_file.partition_id;
let file_size_bytes = parquet_file.file_size_bytes as u64;
let partition_id = to_partition_id(parquet_file.partition_identifier.as_ref());
// copy out the metadata as pbjson encoded data always (to
// ensure we have the most up to date version)
{
@ -230,6 +193,21 @@ impl RemoteExporter {
}
}
fn to_partition_id(partition_identifier: Option<&PartitionIdentifier>) -> TransitionPartitionId {
match partition_identifier
.and_then(|pi| pi.id.as_ref())
.expect("Catalog service should send the partition identifier")
{
partition_identifier::Id::HashId(bytes) => TransitionPartitionId::Deterministic(
PartitionHashId::try_from(&bytes[..])
.expect("Catalog service should send valid hash_id bytes"),
),
partition_identifier::Id::CatalogId(id) => {
TransitionPartitionId::Deprecated(PartitionId::new(*id))
}
}
}
/// writes the contents of a string to a file, overwriting the previous contents, if any
async fn write_string_to_file(contents: &str, path: &Path) -> Result<()> {
let mut file = OpenOptions::new()

View File

@ -7,7 +7,7 @@ use data_types::{
NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, PARTITION_BY_DAY_PROTO,
},
ColumnSet, ColumnType, CompactionLevel, Namespace, NamespaceName, NamespaceNameError,
ParquetFileParams, Partition, PartitionHashId, Statistics, Table, TableId, Timestamp,
ParquetFileParams, Partition, Statistics, Table, TableId, Timestamp,
};
use generated_types::influxdata::iox::catalog::v1 as proto;
// ParquetFile as ProtoParquetFile, Partition as ProtoPartition,
@ -567,9 +567,6 @@ impl RemoteImporter {
// need to make columns in the target catalog
let column_set = insert_columns(table.id, decoded_iox_parquet_metadata, repos).await?;
// Create the the partition_hash_id
let partition_hash_id = Some(PartitionHashId::new(table.id, &partition.partition_key));
let params = if let Some(proto_parquet_file) = &parquet_metadata {
let compaction_level = proto_parquet_file
.compaction_level
@ -579,8 +576,7 @@ impl RemoteImporter {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_hash_id,
partition_id: partition.id,
partition_id: partition.transition_partition_id(),
object_store_id,
min_time: Timestamp::new(proto_parquet_file.min_time),
max_time: Timestamp::new(proto_parquet_file.max_time),
@ -599,8 +595,7 @@ impl RemoteImporter {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_hash_id,
partition_id: partition.id,
partition_id: partition.transition_partition_id(),
object_store_id,
min_time,
max_time,

View File

@ -67,7 +67,7 @@ libc = { version = "0.2" }
num_cpus = "1.16.0"
once_cell = { version = "1.18", features = ["parking_lot"] }
rustyline = { version = "12.0", default-features = false, features = ["with-file-history"]}
serde = "1.0.177"
serde = "1.0.179"
serde_json = "1.0.104"
snafu = "0.7"
tempfile = "3.7.0"

View File

@ -55,10 +55,6 @@ struct GetTable {
#[clap(action)]
table: String,
/// If specified, only files from the specified partitions are downloaded
#[clap(action, short, long)]
partition_id: Option<i64>,
/// The output directory to use. If not specified, files will be placed in a directory named
/// after the table in the current working directory.
#[clap(action, short)]
@ -91,13 +87,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
Command::GetTable(GetTable {
namespace,
table,
partition_id,
output_directory,
}) => {
let mut exporter = RemoteExporter::new(connection);
if let Some(partition_id) = partition_id {
exporter = exporter.with_partition_filter(partition_id);
}
Ok(exporter
.export_table(output_directory, namespace, table)
.await?)

View File

@ -7,6 +7,7 @@ use clap_blocks::{
catalog_dsn::CatalogDsnConfig,
compactor::CompactorConfig,
compactor_scheduler::CompactorSchedulerConfig,
gossip::GossipConfig,
ingester::IngesterConfig,
ingester_address::IngesterAddress,
object_store::{make_object_store, ObjectStoreConfig},
@ -476,6 +477,7 @@ impl Config {
persist_queue_depth,
persist_hot_partition_cost,
rpc_write_max_incoming_bytes: 1024 * 1024 * 1024, // 1GiB
gossip_config: GossipConfig::disabled(),
};
let router_config = RouterConfig {
@ -489,6 +491,7 @@ impl Config {
rpc_write_replicas: 1.try_into().unwrap(),
rpc_write_max_outgoing_bytes: ingester_config.rpc_write_max_incoming_bytes,
rpc_write_health_error_window_seconds: Duration::from_secs(5),
gossip_config: GossipConfig::disabled(),
};
// create a CompactorConfig for the all in one server based on
@ -637,6 +640,7 @@ pub async fn command(config: Config) -> Result<()> {
Arc::clone(&catalog),
Arc::clone(&object_store),
&router_config,
&GossipConfig::disabled(),
router_run_config
.tracing_config()
.traces_jaeger_trace_context_header_name

View File

@ -98,6 +98,7 @@ pub async fn command(config: Config) -> Result<()> {
catalog,
object_store,
&config.router_config,
&config.router_config.gossip_config,
config
.run_config
.tracing_config()

View File

@ -157,10 +157,12 @@ async fn sharded_compactor_0_always_compacts_partition_1() {
.assert()
.success()
.stdout(
// Important parts are the expected partition ID
predicate::str::contains(r#""partitionId": "1","#)
// and compaction level
.and(predicate::str::contains(r#""compactionLevel": 1"#)),
// Important parts are the expected partition identifier
predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
)
// and compaction level
.and(predicate::str::contains(r#""compactionLevel": 1"#)),
);
}
.boxed()
@ -240,10 +242,12 @@ async fn sharded_compactor_1_never_compacts_partition_1() {
.assert()
.success()
.stdout(
// Important parts are the expected partition ID
predicate::str::contains(r#""partitionId": "1","#)
// and compaction level is 0 so it's not returned
.and(predicate::str::contains("compactionLevel").not()),
// Important parts are the expected partition identifier
predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
)
// and compaction level is 0 so it's not returned
.and(predicate::str::contains("compactionLevel").not()),
);
}
.boxed()

View File

@ -280,10 +280,9 @@ async fn remote_partition_and_get_from_store_and_pull() {
.arg("1")
.assert()
.success()
.stdout(
predicate::str::contains(r#""id": "1""#)
.and(predicate::str::contains(r#""partitionId": "1","#)),
)
.stdout(predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
))
.get_output()
.stdout
.clone();

View File

@ -29,9 +29,15 @@ impl Client {
&mut self,
partition_id: i64,
) -> Result<Vec<ParquetFile>, Error> {
let partition_identifier = PartitionIdentifier {
id: Some(partition_identifier::Id::CatalogId(partition_id)),
};
let response = self
.inner
.get_parquet_files_by_partition_id(GetParquetFilesByPartitionIdRequest { partition_id })
.get_parquet_files_by_partition_id(GetParquetFilesByPartitionIdRequest {
partition_identifier: Some(partition_identifier),
})
.await?;
Ok(response.into_inner().parquet_files)
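A brief usage sketch for the updated client wrapper follows; the outer method name is assumed to match the inner gRPC call it delegates to, and the connection handle is taken as given (established elsewhere, as in the remote exporter above).

use influxdb_iox_client::{catalog, connection::Connection};

// Assumption: `connection` is an already-established IOx gRPC connection.
async fn list_partition_files(connection: Connection) -> Result<(), Box<dyn std::error::Error>> {
    let mut client = catalog::Client::new(connection);
    // Callers still pass a plain catalog partition ID; the client wraps it in a
    // `PartitionIdentifier` carrying the `CatalogId` variant, as shown above.
    let files = client.get_parquet_files_by_partition_id(1).await?;
    println!("fetched {} parquet file records", files.len());
    Ok(())
}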

View File

@ -48,6 +48,7 @@ trace = { version = "0.1.0", path = "../trace" }
uuid = "1.4.1"
wal = { version = "0.1.0", path = "../wal" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
gossip = { version = "0.1.0", path = "../gossip" }
[dev-dependencies]
assert_matches = "1.5.0"

View File

@ -1,6 +1,6 @@
//! Partition level data buffer structures.
use std::{collections::VecDeque, sync::Arc};
use std::sync::Arc;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, PartitionHashId, PartitionId,
@ -8,11 +8,12 @@ use data_types::{
};
use mutable_batch::MutableBatch;
use observability_deps::tracing::*;
use schema::sort::SortKey;
use schema::{merge::SchemaMerger, sort::SortKey, Schema};
use self::{
buffer::{traits::Queryable, BufferState, DataBuffer, Persisting},
buffer::{traits::Queryable, DataBuffer},
persisting::{BatchIdent, PersistingData},
persisting_list::PersistingList,
};
use super::{namespace::NamespaceName, table::TableMetadata};
use crate::{
@ -21,6 +22,7 @@ use crate::{
mod buffer;
pub(crate) mod persisting;
mod persisting_list;
pub(crate) mod resolver;
/// The load state of the [`SortKey`] for a given partition.
@ -89,7 +91,7 @@ pub struct PartitionData {
///
/// The [`BatchIdent`] is a generational counter that is used to tag each
/// persisting batch with a unique, opaque identifier.
persisting: VecDeque<(BatchIdent, BufferState<Persisting>)>,
persisting: PersistingList,
/// The number of persist operations started over the lifetime of this
/// [`PartitionData`].
@ -123,7 +125,7 @@ impl PartitionData {
table_id,
table,
buffer: DataBuffer::default(),
persisting: VecDeque::with_capacity(1),
persisting: PersistingList::default(),
started_persistence_count: BatchIdent::default(),
completed_persistence_count: 0,
}
@ -169,7 +171,7 @@ impl PartitionData {
/// persisting batches, plus 1 for the "hot" buffer. Reading the row count
/// of each batch is `O(1)`. This method is expected to be fast.
pub(crate) fn rows(&self) -> usize {
self.persisting.iter().map(|(_, v)| v.rows()).sum::<usize>() + self.buffer.rows()
self.persisting.rows() + self.buffer.rows()
}
/// Return the timestamp min/max values for the data contained within this
@ -188,11 +190,8 @@ impl PartitionData {
/// statistics for each batch is `O(1)`. This method is expected to be fast.
pub(crate) fn timestamp_stats(&self) -> Option<TimestampMinMax> {
self.persisting
.iter()
.map(|(_, v)| {
v.timestamp_stats()
.expect("persisting batches must be non-empty")
})
.timestamp_stats()
.into_iter()
.chain(self.buffer.timestamp_stats())
.reduce(|acc, v| TimestampMinMax {
min: acc.min.min(v.min),
@ -200,6 +199,30 @@ impl PartitionData {
})
}
/// Return the schema of the data currently buffered within this
/// [`PartitionData`].
///
/// This schema is not additive - it is the union of the schemas of the
/// individual batches currently buffered, and as such columns are removed as
/// the batches containing those columns are persisted and dropped.
pub(crate) fn schema(&self) -> Option<Schema> {
if self.persisting.is_empty() && self.buffer.rows() == 0 {
return None;
}
Some(
self.persisting
.schema()
.into_iter()
.cloned()
.chain(self.buffer.schema())
.fold(SchemaMerger::new(), |acc, v| {
acc.merge(&v).expect("schemas are incompatible")
})
.build(),
)
}
/// Return all data for this partition, ordered by the calls to
/// [`PartitionData::buffer_write()`].
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Option<QueryAdaptor> {
@ -213,8 +236,7 @@ impl PartitionData {
// existing rows materialise to the correct output.
let data = self
.persisting
.iter()
.flat_map(|(_, b)| b.get_query_data(projection))
.get_query_data(projection)
.chain(buffered_data)
.collect::<Vec<_>>();
@ -287,7 +309,7 @@ impl PartitionData {
// Increment the "started persist" counter.
//
// This is used to cheaply identify batches given to the
// mark_persisted() call.
// mark_persisted() call and ensure monotonicity.
let batch_ident = self.started_persistence_count.next();
debug!(
@ -310,10 +332,9 @@ impl PartitionData {
batch_ident,
);
// Push the new buffer to the back of the persisting queue, so that
// iterating from back to front during queries iterates over writes from
// oldest to newest.
self.persisting.push_back((batch_ident, fsm));
// Push the buffer into the persisting list (which maintains batch
// order).
self.persisting.push(batch_ident, fsm);
Some(data)
}
@ -328,22 +349,11 @@ impl PartitionData {
/// This method panics if [`Self`] is not marked as undergoing a persist
/// operation, or `batch` is not currently being persisted.
pub(crate) fn mark_persisted(&mut self, batch: PersistingData) -> SequenceNumberSet {
// Find the batch in the persisting queue.
let idx = self
.persisting
.iter()
.position(|(old, _)| *old == batch.batch_ident())
.expect("no currently persisting batch");
// Remove the batch from the queue, preserving the order of the queue
// for batch iteration during queries.
let (old_ident, fsm) = self.persisting.remove(idx).unwrap();
assert_eq!(old_ident, batch.batch_ident());
let fsm = self.persisting.remove(batch.batch_ident());
self.completed_persistence_count += 1;
debug!(
batch_ident = %old_ident,
persistence_count = %self.completed_persistence_count,
namespace_id = %self.namespace_id,
table_id = %self.table_id,

View File

@ -7,7 +7,7 @@ use schema::Projection;
///
/// A [`Buffer`] can contain no writes.
///
/// [`BufferState`]: super::super::BufferState
/// [`BufferState`]: super::BufferState
#[derive(Debug, Default)]
pub(super) struct Buffer {
buffer: Option<MutableBatch>,

View File

@ -77,7 +77,7 @@ pub(crate) struct BufferState<T> {
impl BufferState<Buffering> {
/// Initialise a new buffer state machine.
pub(super) fn new() -> Self {
pub(crate) fn new() -> Self {
Self {
state: Buffering::default(),
sequence_numbers: SequenceNumberSet::default(),

View File

@ -2,14 +2,18 @@ use std::fmt::Display;
use crate::query_adaptor::QueryAdaptor;
/// An opaque generational identifier of a buffer in a [`PartitionData`].
/// An opaque, monotonic generational identifier of a buffer in a
/// [`PartitionData`].
///
/// A [`BatchIdent`] is strictly greater than all those that were obtained
/// before it.
///
/// [`PartitionData`]: super::PartitionData
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub(super) struct BatchIdent(u64);
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd)]
pub(crate) struct BatchIdent(u64);
impl BatchIdent {
/// Return the next unique value.
/// Return the next unique monotonic value.
pub(super) fn next(&mut self) -> Self {
self.0 += 1;
Self(self.0)

View File

@ -0,0 +1,467 @@
use std::collections::VecDeque;
use arrow::record_batch::RecordBatch;
use data_types::TimestampMinMax;
use schema::{merge::SchemaMerger, Schema};
use crate::query::projection::OwnedProjection;
use super::{
buffer::{traits::Queryable, BufferState, Persisting},
persisting::BatchIdent,
};
/// An ordered list of buffered, persisting data as [`BufferState<Persisting>`]
/// FSM instances.
///
/// This type maintains a cache of row count & timestamp min/max statistics
/// across all persisting batches, and performs incremental computation at
/// persist time, moving it out of the query execution path.
#[derive(Debug)]
pub(crate) struct PersistingList {
/// The currently persisting [`DataBuffer`] instances, if any.
///
/// This queue is ordered from oldest at the head, to newest at the tail -
/// forward iteration order matches write order.
///
/// The [`BatchIdent`] is a generational counter that is used to tag each
/// persisting batch with a unique, opaque, monotonic identifier.
///
/// [`DataBuffer`]: super::buffer::DataBuffer
persisting: VecDeque<(BatchIdent, BufferState<Persisting>)>,
cached: Option<CachedStats>,
}
impl Default for PersistingList {
fn default() -> Self {
Self {
persisting: VecDeque::with_capacity(1),
cached: None,
}
}
}
impl PersistingList {
/// Add this `buffer` which was assigned `ident` when marked as persisting
/// to the list.
///
/// This call incrementally recomputes the cached data statistics.
///
/// # Panics
///
/// Panics if a batch with a later `ident` has already been added to this
/// list - calls MUST push ordered buffers/idents to maintain correct
/// ordering of row updates across batches.
///
/// The provided buffer MUST be non-empty (containing a timestamp column,
/// and a schema)
pub(crate) fn push(&mut self, ident: BatchIdent, buffer: BufferState<Persisting>) {
// Recompute the statistics.
match &mut self.cached {
Some(v) => v.push(&buffer),
None => {
// Set the cached stats, as there's no other stats to merge
// with, so skip merging schemas.
self.cached = Some(CachedStats {
rows: buffer.rows(),
timestamps: buffer
.timestamp_stats()
.expect("persisting batch must contain timestamps"),
schema: buffer.schema().expect("persisting batch must have schema"),
});
}
}
// Invariant: the batch being added MUST be ordered strictly after
// existing batches.
//
// The BatchIdent provides this ordering assurance, as it is a monotonic
// (opaque) identifier.
assert!(self
.persisting
.back()
.map(|(last, _)| ident > *last)
.unwrap_or(true));
self.persisting.push_back((ident, buffer));
}
/// Remove the buffer identified by `ident` from the list.
///
/// There is no ordering requirement for this call, but it is more efficient
/// when removals match the order of calls to [`PersistingList::push()`].
///
/// # Panics
///
/// This method panics if there is currently no batch identified by `ident`
/// in the list.
pub(crate) fn remove(&mut self, ident: BatchIdent) -> BufferState<Persisting> {
let idx = self
.persisting
.iter()
.position(|(old, _)| *old == ident)
.expect("no currently persisting batch");
let (old_ident, fsm) = self.persisting.remove(idx).unwrap();
assert_eq!(old_ident, ident);
// Recompute the cache of all remaining persisting batch stats (if any)
self.cached = CachedStats::new(self.persisting.iter().map(|(_, v)| v));
fsm
}
pub(crate) fn is_empty(&self) -> bool {
self.persisting.is_empty()
}
/// Returns the row count sum across all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn rows(&self) -> usize {
self.cached.as_ref().map(|v| v.rows).unwrap_or_default()
}
/// Returns the timestamp min/max values across all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn timestamp_stats(&self) -> Option<TimestampMinMax> {
self.cached.as_ref().map(|v| v.timestamps)
}
/// Returns the merged schema of all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn schema(&self) -> Option<&Schema> {
self.cached.as_ref().map(|v| &v.schema)
}
/// Returns the [`RecordBatch`]es in this list, optionally applying the given
/// projection.
///
/// This is an `O(n)` operation.
pub(crate) fn get_query_data<'a, 'b: 'a>(
&'a self,
projection: &'b OwnedProjection,
) -> impl Iterator<Item = RecordBatch> + 'a {
self.persisting
.iter()
.flat_map(move |(_, b)| b.get_query_data(projection))
}
}
/// The set of cached statistics describing the batches of data within the
/// [`PersistingList`].
#[derive(Debug)]
struct CachedStats {
rows: usize,
timestamps: TimestampMinMax,
/// The merged schema of all the persisting batches.
schema: Schema,
}
impl CachedStats {
/// Generate a new [`CachedStats`] from an iterator of batches, if any.
///
/// # Panics
///
/// If any batches are empty (containing no schema or timestamp column), or
/// the batches do not contain compatible schemas, this call panics.
fn new<'a, T>(mut iter: T) -> Option<Self>
where
T: Iterator<Item = &'a BufferState<Persisting>> + 'a,
{
let v = iter.next()?;
let mut schema = SchemaMerger::new();
schema = schema
.merge(&v.schema().expect("persisting batch must be non-empty"))
.unwrap();
let mut rows = v.rows();
debug_assert!(rows > 0);
let mut timestamps = v
.timestamp_stats()
.expect("unprojected batch should have timestamp");
for buf in iter {
rows += buf.rows();
if let Some(v) = buf.schema() {
debug_assert!(buf.rows() > 0);
schema = schema
.merge(&v)
.expect("persist list contains incompatible schemas");
let ts = buf
.timestamp_stats()
.expect("no timestamp for batch containing rows");
timestamps.min = timestamps.min.min(ts.min);
timestamps.max = timestamps.max.max(ts.max);
}
}
Some(Self {
rows,
timestamps,
schema: schema.build(),
})
}
// Incrementally recompute the cached stats by adding `buffer` to the
// statistics.
fn push(&mut self, buffer: &BufferState<Persisting>) {
// This re-computation below MUST complete - no early exit is allowed or
// the stats will be left in an inconsistent state.
self.rows += buffer.rows();
let ts = buffer
.timestamp_stats()
.expect("persisting batch must contain timestamps");
self.timestamps.min = self.timestamps.min.min(ts.min);
self.timestamps.max = self.timestamps.max.max(ts.max);
let mut schema = SchemaMerger::new();
schema = schema.merge(&self.schema).unwrap();
schema = schema
.merge(&buffer.schema().expect("persisting batch must have schema"))
.expect("incompatible schema");
self.schema = schema.build()
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use arrow_util::assert_batches_eq;
use assert_matches::assert_matches;
use data_types::SequenceNumber;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use crate::buffer_tree::partition::buffer::Transition;
use super::*;
/// Ensure the ordering of yielded batches matches that of the calls to
/// push(), preserving batch ordering, and in turn, causal row ordering.
#[test]
fn test_batch_ordering() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Generate a buffer with a single row.
let buffer = buffer_with_lp(r#"bananas,tag=platanos great="yes" 42"#);
// Add it to the list.
list.push(ident_oracle.next(), buffer);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::default())
.collect::<Vec<_>>();
let expected = vec![
"+-------+----------+--------------------------------+",
"| great | tag | time |",
"+-------+----------+--------------------------------+",
"| yes | platanos | 1970-01-01T00:00:00.000000042Z |",
"+-------+----------+--------------------------------+",
];
assert_eq!(data.len(), 1);
assert_batches_eq!(&expected, &data);
// Push a new buffer updating the last row to check yielded row ordering.
let buffer = buffer_with_lp(r#"bananas,tag=platanos great="definitely" 42"#);
list.push(ident_oracle.next(), buffer);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 2);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::default())
.collect::<Vec<_>>();
let expected = vec![
"+------------+----------+--------------------------------+",
"| great | tag | time |",
"+------------+----------+--------------------------------+",
"| yes | platanos | 1970-01-01T00:00:00.000000042Z |",
"| definitely | platanos | 1970-01-01T00:00:00.000000042Z |",
"+------------+----------+--------------------------------+",
];
assert_eq!(data.len(), 2);
assert_batches_eq!(&expected, &data);
}
/// Assert projection across batches works, and does not panic when given a
/// missing column.
#[test]
fn test_projection() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Populate the list.
list.push(
ident_oracle.next(),
buffer_with_lp(
"\
bananas,tag=platanos v=1 42\n\
bananas,tag=platanos v=2,bananas=100 4242\n\
",
),
);
list.push(
ident_oracle.next(),
buffer_with_lp(
"\
bananas,tag=platanos v=3 424242\n\
bananas v=4,bananas=200 42424242\n\
",
),
);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::from(vec!["time", "tag", "missing"]))
.collect::<Vec<_>>();
let expected = vec![
"+--------------------------------+----------+",
"| time | tag |",
"+--------------------------------+----------+",
"| 1970-01-01T00:00:00.000000042Z | platanos |",
"| 1970-01-01T00:00:00.000004242Z | platanos |",
"| 1970-01-01T00:00:00.000424242Z | platanos |",
"| 1970-01-01T00:00:00.042424242Z | |",
"+--------------------------------+----------+",
];
assert_batches_eq!(&expected, &data);
}
/// Validate the cached statistics as batches are added and removed.
#[test]
fn test_cached_statistics() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Generate a buffer with a single row.
let first_batch = ident_oracle.next();
list.push(
first_batch,
buffer_with_lp(r#"bananas,tag=platanos great="yes" 42"#),
);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Push another row.
let second_batch = ident_oracle.next();
list.push(
second_batch,
buffer_with_lp(r#"bananas,another=yes great="definitely",incremental=true 4242"#),
);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 2);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 4242 })
);
assert_schema_matches(
list.schema().unwrap(),
&["time", "great", "tag", "another", "incremental"],
);
// Remove the first batch.
list.remove(first_batch);
// The statistics must now match the second batch values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax {
min: 4242,
max: 4242
})
);
assert_schema_matches(
list.schema().unwrap(),
&["time", "great", "another", "incremental"],
);
// Remove the second/final batch.
list.remove(second_batch);
assert!(list.is_empty());
assert_eq!(list.rows(), 0);
assert_matches!(list.timestamp_stats(), None);
assert_matches!(list.schema(), None);
}
/// Assert the schema columns match the given names.
fn assert_schema_matches(schema: &Schema, cols: &[&str]) {
let schema = schema.as_arrow();
let got = schema
.all_fields()
.into_iter()
.map(|v| v.name().to_owned())
.collect::<BTreeSet<_>>();
let want = cols
.iter()
.map(ToString::to_string)
.collect::<BTreeSet<_>>();
assert_eq!(got, want);
}
/// Return a persisting buffer containing the given LP content.
fn buffer_with_lp(lp: &str) -> BufferState<Persisting> {
let mut buffer = BufferState::new();
// Write some data to a buffer.
buffer
.write(lp_to_mutable_batch(lp).1, SequenceNumber::new(0))
.expect("write to empty buffer should succeed");
// Convert the buffer into a persisting snapshot.
match buffer.snapshot() {
Transition::Ok(v) => v.into_persisting(),
Transition::Unchanged(_) => panic!("did not transition to snapshot state"),
}
}
}

View File

@ -1,3 +1,5 @@
use gossip::{GossipHandle, NopDispatcher};
/// This needs to be pub for the benchmarks but should not be used outside the crate.
#[cfg(feature = "benches")]
pub use wal_replay::*;
@ -5,7 +7,7 @@ pub use wal_replay::*;
mod graceful_shutdown;
mod wal_replay;
use std::{path::PathBuf, sync::Arc, time::Duration};
use std::{net::SocketAddr, path::PathBuf, sync::Arc, time::Duration};
use arrow_flight::flight_service_server::FlightService;
use backoff::BackoffConfig;
@ -109,6 +111,9 @@ pub struct IngesterGuard<T> {
/// The task handle executing the graceful shutdown once triggered.
graceful_shutdown_handler: tokio::task::JoinHandle<()>,
shutdown_complete: Shared<oneshot::Receiver<()>>,
/// An optional handle to the gossip sub-system, if running.
gossip_handle: Option<GossipHandle>,
}
impl<T> IngesterGuard<T>
@ -137,6 +142,27 @@ impl<T> Drop for IngesterGuard<T> {
}
}
/// Configuration parameters for the optional gossip sub-system.
#[derive(Debug, Default)]
pub enum GossipConfig {
/// Disable the gossip sub-system.
#[default]
Disabled,
/// Enable the gossip sub-system, listening on the specified `bind_addr` and
/// using `peers` as the initial peer seed list.
Enabled {
/// UDP socket address to use for gossip communication.
bind_addr: SocketAddr,
/// Initial peer seed list in the form of either:
///
/// - "dns.address.example:port"
/// - "10.0.0.1:port"
///
peers: Vec<String>,
},
}
/// Errors that occur during initialisation of an `ingester` instance.
#[derive(Debug, Error)]
pub enum InitError {
@ -152,6 +178,10 @@ pub enum InitError {
/// An error replaying the entries in the WAL.
#[error(transparent)]
WalReplay(Box<dyn std::error::Error>),
/// An error binding the UDP socket for gossip communication.
#[error("failed to bind udp gossip socket: {0}")]
GossipBind(std::io::Error),
}
/// Initialise a new `ingester` instance, returning the gRPC service handler
@ -238,6 +268,7 @@ pub async fn new<F>(
persist_queue_depth: usize,
persist_hot_partition_cost: usize,
object_store: ParquetStorage,
gossip: GossipConfig,
shutdown: F,
) -> Result<IngesterGuard<impl IngesterRpcInterface>, InitError>
where
@ -351,11 +382,9 @@ where
// Initialize disk metrics to emit disk capacity / free statistics for the
// WAL directory.
let disk_metric_task = tokio::task::spawn(
DiskSpaceMetrics::new(wal_directory, &metrics)
.expect("failed to resolve WAL directory to disk")
.run(),
);
let (disk_metric_task, _snapshot_rx) = DiskSpaceMetrics::new(wal_directory, &metrics)
.expect("failed to resolve WAL directory to disk");
let disk_metric_task = tokio::task::spawn(disk_metric_task.run());
// Replay the WAL log files, if any.
let max_sequence_number =
@ -422,6 +451,23 @@ where
wal_reference_handle,
));
// Optionally start the gossip subsystem
let gossip_handle = match gossip {
GossipConfig::Disabled => {
info!("gossip disabled");
None
}
GossipConfig::Enabled { bind_addr, peers } => {
// Start the gossip sub-system, which logs during init.
let handle =
gossip::Builder::new(peers, NopDispatcher::default(), Arc::clone(&metrics))
.bind(bind_addr)
.await
.map_err(InitError::GossipBind)?;
Some(handle)
}
};
Ok(IngesterGuard {
rpc: GrpcDelegate::new(
Arc::new(write_path),
@ -438,5 +484,6 @@ where
disk_metric_task,
graceful_shutdown_handler: shutdown_task,
shutdown_complete: shutdown_rx.shared(),
gossip_handle,
})
}
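A hedged sketch of opting in to the new gossip sub-system when constructing an ingester; the bind address and peer seeds are placeholders, and wiring them from CLI configuration is out of scope here.

use std::net::SocketAddr;

use ingester::GossipConfig;

fn example_gossip_config(enable: bool) -> GossipConfig {
    if enable {
        GossipConfig::Enabled {
            // Placeholder UDP bind address and seed peers.
            bind_addr: "0.0.0.0:4242".parse::<SocketAddr>().expect("valid socket address"),
            peers: vec!["10.0.0.1:4242".to_owned(), "gossip.peer.example:4242".to_owned()],
        }
    } else {
        // The default keeps gossip disabled, as the test context below does.
        GossipConfig::default()
    }
}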

View File

@ -200,6 +200,7 @@
unused_crate_dependencies,
missing_docs
)]
#![allow(clippy::default_constructed_unit_structs)]
// Workaround for "unused crate" lint false positives.
#[cfg(test)]

View File

@ -2,7 +2,8 @@ use std::{fmt::Debug, sync::Arc, time::Duration};
use async_trait::async_trait;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, ParquetFileParams, PartitionId, TableId,
sequence_number_set::SequenceNumberSet, NamespaceId, ParquetFileParams, TableId,
TransitionPartitionId,
};
use crate::wal::reference_tracker::WalReferenceHandle;
@ -54,9 +55,9 @@ impl CompletedPersist {
self.meta.table_id
}
/// Returns the [`PartitionId`] of the persisted data.
pub(crate) fn partition_id(&self) -> PartitionId {
self.meta.partition_id
/// Returns the [`TransitionPartitionId`] of the persisted data.
pub(crate) fn partition_id(&self) -> &TransitionPartitionId {
&self.meta.partition_id
}
/// Returns the [`SequenceNumberSet`] of the persisted data.
@ -166,15 +167,16 @@ pub(crate) mod mock {
#[cfg(test)]
mod tests {
use super::*;
use crate::test_util::{ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_TABLE_ID};
use crate::test_util::{
ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, ARBITRARY_TRANSITION_PARTITION_ID,
};
use data_types::{ColumnId, ColumnSet, SequenceNumber, Timestamp};
fn arbitrary_file_meta() -> ParquetFileParams {
ParquetFileParams {
namespace_id: ARBITRARY_NAMESPACE_ID,
table_id: ARBITRARY_TABLE_ID,
partition_id: ARBITRARY_PARTITION_ID,
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),
@ -226,7 +228,7 @@ mod tests {
assert_eq!(note.namespace_id(), meta.namespace_id);
assert_eq!(note.table_id(), meta.table_id);
assert_eq!(note.partition_id(), meta.partition_id);
assert_eq!(note.partition_id(), &meta.partition_id);
assert_eq!(note.column_count(), meta.column_set.len());
assert_eq!(note.row_count(), meta.row_count as usize);

View File

@ -151,7 +151,9 @@ mod tests {
use super::*;
use crate::{
persist::completion_observer::mock::MockCompletionObserver,
test_util::{ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_TABLE_ID},
test_util::{
ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, ARBITRARY_TRANSITION_PARTITION_ID,
},
};
use data_types::{
sequence_number_set::SequenceNumberSet, ColumnId, ColumnSet, ParquetFileParams, Timestamp,
@ -169,8 +171,7 @@ mod tests {
let meta = ParquetFileParams {
namespace_id: ARBITRARY_NAMESPACE_ID,
table_id: ARBITRARY_TABLE_ID,
partition_id: ARBITRARY_PARTITION_ID,
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(Duration::from_secs(1_000).as_nanos() as _),
max_time: Timestamp::new(Duration::from_secs(1_042).as_nanos() as _), // 42 seconds later

View File

@ -16,7 +16,7 @@ mod tests {
use std::{sync::Arc, time::Duration};
use assert_matches::assert_matches;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use data_types::{CompactionLevel, ParquetFile};
use futures::TryStreamExt;
use iox_catalog::{
interface::{get_schema_by_id, Catalog, SoftDeletedRows},
@ -190,7 +190,7 @@ mod tests {
// Generate a partition with data
let partition = partition_with_write(Arc::clone(&catalog)).await;
let table_id = partition.lock().table_id();
let partition_id = partition.lock().partition_id();
let partition_id = partition.lock().transition_partition_id();
let namespace_id = partition.lock().namespace_id();
assert_matches!(partition.lock().sort_key(), SortKeyState::Provided(None));
@ -221,7 +221,7 @@ mod tests {
assert_matches!(&completion_observer.calls().as_slice(), &[n] => {
assert_eq!(n.namespace_id(), namespace_id);
assert_eq!(n.table_id(), table_id);
assert_eq!(n.partition_id(), partition_id);
assert_eq!(n.partition_id(), &partition_id);
assert_eq!(n.sequence_numbers().len(), 1);
});
@ -243,12 +243,12 @@ mod tests {
.repositories()
.await
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
.list_by_partition_not_to_delete(&partition_id)
.await
.expect("query for parquet files failed");
// Validate a single file was inserted with the expected properties.
let (object_store_id, file_size_bytes) = assert_matches!(&*files, &[ParquetFile {
let (object_store_id, file_size_bytes) = assert_matches!(&*files, [ParquetFile {
namespace_id: got_namespace_id,
table_id: got_table_id,
partition_id: got_partition_id,
@ -263,12 +263,12 @@ mod tests {
{
assert_eq!(created_at.get(), max_l0_created_at.get());
assert_eq!(got_namespace_id, namespace_id);
assert_eq!(got_table_id, table_id);
assert_eq!(got_partition_id, partition_id);
assert_eq!(got_namespace_id, &namespace_id);
assert_eq!(got_table_id, &table_id);
assert_eq!(got_partition_id, &partition_id);
assert_eq!(row_count, 1);
assert_eq!(compaction_level, CompactionLevel::Initial);
assert_eq!(*row_count, 1);
assert_eq!(compaction_level, &CompactionLevel::Initial);
(object_store_id, file_size_bytes)
}
@ -292,7 +292,7 @@ mod tests {
}] => {
let want_path = format!("{object_store_id}.parquet");
assert!(location.as_ref().ends_with(&want_path));
assert_eq!(size, file_size_bytes as usize);
assert_eq!(size, *file_size_bytes as usize);
}
)
}
@ -326,8 +326,7 @@ mod tests {
// Generate a partition with data
let partition = partition_with_write(Arc::clone(&catalog)).await;
let table_id = partition.lock().table_id();
let partition_id = partition.lock().partition_id();
let transition_partition_id = partition.lock().transition_partition_id();
let partition_id = partition.lock().transition_partition_id();
let namespace_id = partition.lock().namespace_id();
assert_matches!(partition.lock().sort_key(), SortKeyState::Provided(None));
@ -344,7 +343,7 @@ mod tests {
.await
.partitions()
.cas_sort_key(
&transition_partition_id,
&partition_id,
None,
&["bananas", "are", "good", "for", "you"],
)
@ -367,7 +366,7 @@ mod tests {
assert_matches!(&completion_observer.calls().as_slice(), &[n] => {
assert_eq!(n.namespace_id(), namespace_id);
assert_eq!(n.table_id(), table_id);
assert_eq!(n.partition_id(), partition_id);
assert_eq!(n.partition_id(), &partition_id);
assert_eq!(n.sequence_numbers().len(), 1);
});
@ -392,12 +391,12 @@ mod tests {
.repositories()
.await
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
.list_by_partition_not_to_delete(&partition_id)
.await
.expect("query for parquet files failed");
// Validate a single file was inserted with the expected properties.
let (object_store_id, file_size_bytes) = assert_matches!(&*files, &[ParquetFile {
let (object_store_id, file_size_bytes) = assert_matches!(&*files, [ParquetFile {
namespace_id: got_namespace_id,
table_id: got_table_id,
partition_id: got_partition_id,
@ -412,12 +411,12 @@ mod tests {
{
assert_eq!(created_at.get(), max_l0_created_at.get());
assert_eq!(got_namespace_id, namespace_id);
assert_eq!(got_table_id, table_id);
assert_eq!(got_partition_id, partition_id);
assert_eq!(got_namespace_id, &namespace_id);
assert_eq!(got_table_id, &table_id);
assert_eq!(got_partition_id, &partition_id);
assert_eq!(row_count, 1);
assert_eq!(compaction_level, CompactionLevel::Initial);
assert_eq!(*row_count, 1);
assert_eq!(compaction_level, &CompactionLevel::Initial);
(object_store_id, file_size_bytes)
}
@ -438,18 +437,14 @@ mod tests {
assert_eq!(files.len(), 2, "expected two uploaded files");
// Ensure the catalog record points at a valid file in object storage.
let want_path = ParquetFilePath::new(
namespace_id,
table_id,
&transition_partition_id,
object_store_id,
)
.object_store_path();
let want_path =
ParquetFilePath::new(namespace_id, table_id, &partition_id, *object_store_id)
.object_store_path();
let file = files
.into_iter()
.find(|f| f.location == want_path)
.expect("did not find final file in object storage");
assert_eq!(file.size, file_size_bytes as usize);
assert_eq!(file.size, *file_size_bytes as usize);
}
}

View File

@ -55,7 +55,8 @@ pub(crate) mod mock {
use std::{sync::Arc, time::Duration};
use data_types::{
ColumnId, ColumnSet, NamespaceId, ParquetFileParams, PartitionId, TableId, Timestamp,
ColumnId, ColumnSet, NamespaceId, ParquetFileParams, PartitionHashId, PartitionKey,
TableId, Timestamp, TransitionPartitionId,
};
use test_helpers::timeout::FutureTimeout;
use tokio::task::JoinHandle;
@ -155,13 +156,16 @@ pub(crate) mod mock {
let wait_ms: u64 = rand::random::<u64>() % 100;
tokio::time::sleep(Duration::from_millis(wait_ms)).await;
let sequence_numbers = partition.lock().mark_persisted(data);
let table_id = TableId::new(2);
let partition_hash_id =
PartitionHashId::new(table_id, &PartitionKey::from("arbitrary"));
let partition_id = TransitionPartitionId::Deterministic(partition_hash_id);
completion_observer
.persist_complete(Arc::new(CompletedPersist::new(
ParquetFileParams {
namespace_id: NamespaceId::new(1),
table_id: TableId::new(2),
partition_id: PartitionId::new(3),
partition_hash_id: None,
table_id,
partition_id,
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),

View File

@ -394,8 +394,7 @@ where
ParquetFileParams {
namespace_id: NamespaceId::new(1),
table_id: TableId::new(2),
partition_id: PartitionId::new(3),
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),

View File

@ -30,7 +30,7 @@ use futures::{stream::FuturesUnordered, FutureExt, StreamExt, TryStreamExt};
use generated_types::influxdata::iox::ingester::v1::{
write_service_server::WriteService, WriteRequest,
};
use ingester::{IngesterGuard, IngesterRpcInterface};
use ingester::{GossipConfig, IngesterGuard, IngesterRpcInterface};
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
use iox_catalog::{
interface::{Catalog, SoftDeletedRows},
@ -168,6 +168,7 @@ impl TestContextBuilder {
max_persist_queue_depth,
persist_hot_partition_cost,
storage.clone(),
GossipConfig::default(),
shutdown_rx.map(|v| v.expect("shutdown sender dropped without calling shutdown")),
)
.await

View File

@ -0,0 +1,24 @@
DROP TRIGGER IF EXISTS update_partition ON parquet_file;
ALTER TABLE parquet_file
ALTER COLUMN partition_id
DROP NOT NULL;
CREATE OR REPLACE FUNCTION update_partition_on_new_file_at()
RETURNS TRIGGER
LANGUAGE PLPGSQL
AS $$
BEGIN
UPDATE partition
SET new_file_at = NEW.created_at
WHERE (NEW.partition_id IS NULL OR id = NEW.partition_id)
AND (NEW.partition_hash_id IS NULL OR hash_id = NEW.partition_hash_id);
RETURN NEW;
END;
$$;
CREATE TRIGGER update_partition
AFTER INSERT ON parquet_file
FOR EACH ROW
EXECUTE PROCEDURE update_partition_on_new_file_at();

View File

@ -0,0 +1,98 @@
CREATE TABLE parquet_file_temp
AS SELECT * FROM parquet_file;
DROP TABLE parquet_file;
CREATE TABLE parquet_file
(
id INTEGER
constraint parquet_file_pkey
primary key autoincrement,
shard_id numeric not null
constraint parquet_file_sequencer_id_fkey
references shard,
table_id numeric not null
references table_name,
partition_id numeric
references partition,
partition_hash_id bytea
references partition (hash_id),
object_store_id uuid not null
constraint parquet_location_unique
unique,
max_sequence_number numeric,
min_time numeric,
max_time numeric,
to_delete numeric,
row_count numeric default 0 not null,
file_size_bytes numeric default 0 not null,
compaction_level smallint default 0 not null,
created_at numeric,
namespace_id numeric not null
references namespace
on delete cascade,
column_set numeric[] not null,
max_l0_created_at numeric default 0 not null
);
create index if not exists parquet_file_deleted_at_idx
on parquet_file (to_delete);
create index if not exists parquet_file_partition_idx
on parquet_file (partition_id);
create index if not exists parquet_file_table_idx
on parquet_file (table_id);
create index if not exists parquet_file_shard_compaction_delete_idx
on parquet_file (shard_id, compaction_level, to_delete);
create index if not exists parquet_file_shard_compaction_delete_created_idx
on parquet_file (shard_id, compaction_level, to_delete, created_at);
create index if not exists parquet_file_partition_created_idx
on parquet_file (partition_id, created_at);
CREATE INDEX IF NOT EXISTS parquet_file_partition_hash_id_idx
ON parquet_file (partition_hash_id)
WHERE partition_hash_id IS NOT NULL;
create trigger if not exists update_partition
after insert
on parquet_file
for each row
begin
UPDATE partition
SET new_file_at = NEW.created_at
WHERE (NEW.partition_id IS NULL OR id = NEW.partition_id)
AND (NEW.partition_hash_id IS NULL OR hash_id = NEW.partition_hash_id);
end;
create trigger if not exists update_billing
after insert
on parquet_file
for each row
begin
INSERT INTO billing_summary (namespace_id, total_file_size_bytes)
VALUES (NEW.namespace_id, NEW.file_size_bytes)
ON CONFLICT (namespace_id) DO UPDATE
SET total_file_size_bytes = billing_summary.total_file_size_bytes + NEW.file_size_bytes
WHERE billing_summary.namespace_id = NEW.namespace_id;
end;
create trigger if not exists decrement_summary
after update
on parquet_file
for each row
when OLD.to_delete IS NULL AND NEW.to_delete IS NOT NULL
begin
UPDATE billing_summary
SET total_file_size_bytes = billing_summary.total_file_size_bytes - OLD.file_size_bytes
WHERE billing_summary.namespace_id = OLD.namespace_id;
end;
INSERT INTO parquet_file
SELECT * FROM parquet_file_temp;
DROP TABLE parquet_file_temp;

View File

@ -1865,7 +1865,7 @@ pub(crate) mod test_helpers {
let other_params = ParquetFileParams {
table_id: other_partition.table_id,
partition_id: other_partition.id,
partition_id: other_partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(50),
max_time: Timestamp::new(60),
@ -1978,7 +1978,7 @@ pub(crate) mod test_helpers {
let f1_params = ParquetFileParams {
table_id: partition2.table_id,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),
@ -2449,7 +2449,7 @@ pub(crate) mod test_helpers {
let l0_five_hour_ago_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_five_hour_ago,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
..parquet_file_params.clone()
};
repos
@ -2492,7 +2492,7 @@ pub(crate) mod test_helpers {
let l1_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_now,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
compaction_level: CompactionLevel::FileNonOverlapped,
..parquet_file_params.clone()
};
@ -2578,7 +2578,7 @@ pub(crate) mod test_helpers {
let l2_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_now,
partition_id: partition3.id,
partition_id: partition3.transition_partition_id(),
compaction_level: CompactionLevel::Final,
..parquet_file_params.clone()
};
@ -2619,7 +2619,7 @@ pub(crate) mod test_helpers {
let l0_one_hour_ago_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_one_hour_ago,
partition_id: partition3.id,
partition_id: partition3.transition_partition_id(),
..parquet_file_params.clone()
};
repos
@ -2720,8 +2720,7 @@ pub(crate) mod test_helpers {
level1_file.compaction_level = CompactionLevel::FileNonOverlapped;
let other_partition_params = ParquetFileParams {
partition_id: partition2.id,
partition_hash_id: partition2.hash_id().cloned(),
partition_id: partition2.transition_partition_id(),
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
@ -2744,12 +2743,20 @@ pub(crate) mod test_helpers {
expected_ids.sort();
assert_eq!(file_ids, expected_ids);
// remove namespace to avoid it from affecting later tests
repos
.namespaces()
.soft_delete("namespace_parquet_file_test_list_by_partiton_not_to_delete")
// Using the catalog partition ID should return the same files, even if the Parquet file
// records don't have the partition ID on them (which is the default now)
let files = repos
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition.id))
.await
.expect("delete namespace should succeed");
.unwrap();
assert_eq!(files.len(), 2);
let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
file_ids.sort();
let mut expected_ids = vec![parquet_file.id, level1_file.id];
expected_ids.sort();
assert_eq!(file_ids, expected_ids);
}
async fn test_update_to_compaction_level_1(catalog: Arc<dyn Catalog>) {

View File

@ -396,8 +396,7 @@ pub mod test_helpers {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),

View File

@ -887,14 +887,28 @@ impl ParquetFileRepo for MemTxn {
) -> Result<Vec<ParquetFile>> {
let stage = self.stage();
let partition = stage
.partitions
.iter()
.find(|p| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => p
.hash_id()
.map(|p_hash_id| p_hash_id == hash_id)
.unwrap_or(false),
TransitionPartitionId::Deprecated(id) => id == &p.id,
})
.unwrap()
.clone();
Ok(stage
.parquet_files
.iter()
.filter(|f| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => {
f.partition_hash_id.as_ref().map_or(false, |h| h == hash_id)
}
TransitionPartitionId::Deprecated(id) => f.partition_id == *id,
.filter(|f| match &f.partition_id {
TransitionPartitionId::Deterministic(hash_id) => partition
.hash_id()
.map(|p_hash_id| p_hash_id == hash_id)
.unwrap_or(false),
TransitionPartitionId::Deprecated(id) => id == &partition.id,
})
.filter(|f| f.to_delete.is_none())
.cloned()
@ -996,17 +1010,15 @@ async fn create_parquet_file(
ParquetFileId::new(stage.parquet_files.len() as i64 + 1),
);
let created_at = parquet_file.created_at;
let partition_id = parquet_file.partition_id;
let partition_id = parquet_file.partition_id.clone();
stage.parquet_files.push(parquet_file);
// Update the new_file_at field of its partition to the time of created_at
let partition = stage
.partitions
.iter_mut()
.find(|p| p.id == partition_id)
.ok_or(Error::PartitionNotFound {
id: TransitionPartitionId::Deprecated(partition_id),
})?;
.find(|p| p.transition_partition_id() == partition_id)
.ok_or(Error::PartitionNotFound { id: partition_id })?;
partition.new_file_at = Some(created_at);
Ok(stage.parquet_files.last().unwrap().clone())

View File

@ -1627,22 +1627,26 @@ RETURNING id;
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
@ -1754,7 +1758,6 @@ where
namespace_id,
table_id,
partition_id,
partition_hash_id,
object_store_id,
min_time,
max_time,
@ -1766,6 +1769,11 @@ where
max_l0_created_at,
} = parquet_file_params;
let (partition_id, partition_hash_id) = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)),
TransitionPartitionId::Deprecated(id) => (Some(id), None),
};
let partition_hash_id_ref = &partition_hash_id.as_ref();
let query = sqlx::query_scalar::<_, ParquetFileId>(
r#"
@ -2203,7 +2211,10 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
.create(parquet_file_params)
.await
.unwrap();
assert!(parquet_file.partition_hash_id.is_none());
assert_matches!(
parquet_file.partition_id,
TransitionPartitionId::Deprecated(_)
);
}
#[test]

View File

@ -1221,8 +1221,8 @@ struct ParquetFilePod {
id: ParquetFileId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
partition_hash_id: Option<PartitionHashId>,
#[sqlx(flatten)]
partition_id: TransitionPartitionId,
object_store_id: Uuid,
min_time: Timestamp,
max_time: Timestamp,
@ -1242,7 +1242,6 @@ impl From<ParquetFilePod> for ParquetFile {
namespace_id: value.namespace_id,
table_id: value.table_id,
partition_id: value.partition_id,
partition_hash_id: value.partition_hash_id,
object_store_id: value.object_store_id,
min_time: value.min_time,
max_time: value.max_time,
@ -1395,22 +1394,26 @@ RETURNING id;
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
@ -1533,7 +1536,6 @@ where
namespace_id,
table_id,
partition_id,
partition_hash_id,
object_store_id,
min_time,
max_time,
@ -1545,7 +1547,10 @@ where
max_l0_created_at,
} = parquet_file_params;
let partition_hash_id_ref = &partition_hash_id.as_ref();
let (partition_id, partition_hash_id) = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)),
TransitionPartitionId::Deprecated(id) => (Some(id), None),
};
let res = sqlx::query_as::<_, ParquetFilePod>(
r#"
INSERT INTO parquet_file (
@ -1562,7 +1567,7 @@ RETURNING
.bind(TRANSITION_SHARD_ID) // $1
.bind(table_id) // $2
.bind(partition_id) // $3
.bind(partition_hash_id_ref) // $4
.bind(partition_hash_id.as_ref()) // $4
.bind(object_store_id) // $5
.bind(min_time) // $6
.bind(max_time) // $7
@ -1811,7 +1816,10 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
.create(parquet_file_params)
.await
.unwrap();
assert!(parquet_file.partition_hash_id.is_none());
assert_matches!(
parquet_file.partition_id,
TransitionPartitionId::Deprecated(_)
);
}
macro_rules! test_column_create_or_get_many_unchecked {

View File

@ -1,6 +1,7 @@
use data_types::{
ColumnSet, CompactionLevel, NamespaceId, ParquetFile, ParquetFileId, Partition,
PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, Table, TableId, Timestamp,
TransitionPartitionId,
};
use uuid::Uuid;
@ -20,8 +21,7 @@ impl ParquetFileBuilder {
id: ParquetFileId::new(id),
namespace_id: NamespaceId::new(0),
table_id,
partition_id: PartitionId::new(0),
partition_hash_id: Some(PartitionHashId::new(
partition_id: TransitionPartitionId::Deterministic(PartitionHashId::new(
table_id,
&PartitionKey::from("arbitrary"),
)),
@ -39,11 +39,11 @@ impl ParquetFileBuilder {
}
}
/// Set the partition id
pub fn with_partition(self, id: i64) -> Self {
/// Set the partition identifier
pub fn with_partition(self, partition_id: TransitionPartitionId) -> Self {
Self {
file: ParquetFile {
partition_id: PartitionId::new(id),
partition_id,
..self.file
},
}

View File

@ -602,8 +602,7 @@ impl TestPartition {
let parquet_file_params = ParquetFileParams {
namespace_id: self.namespace.namespace.id,
table_id: self.table.table.id,
partition_id: self.partition.id,
partition_hash_id: self.partition.hash_id().cloned(),
partition_id: self.partition.transition_partition_id(),
object_store_id: object_store_id.unwrap_or_else(Uuid::new_v4),
min_time: Timestamp::new(min_time),
max_time: Timestamp::new(max_time),

View File

@ -17,6 +17,8 @@
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
use data_types::{PartitionHashId, PartitionKey, TableId, TransitionPartitionId};
mod catalog;
pub use catalog::{
TestCatalog, TestNamespace, TestParquetFile, TestParquetFileBuilder, TestPartition, TestTable,
@ -24,3 +26,14 @@ pub use catalog::{
mod builders;
pub use builders::{ParquetFileBuilder, PartitionBuilder, SkippedCompactionBuilder, TableBuilder};
/// Create a partition identifier from an int (which gets used as the table ID) and a partition key
/// with the string "arbitrary". Most useful in cases where there isn't any actual catalog
/// interaction (that is, in mocks) and when the important property of the partition identifiers is
/// that they're either the same as or different from other partition identifiers.
pub fn partition_identifier(table_id: i64) -> TransitionPartitionId {
TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(table_id),
&PartitionKey::from("arbitrary"),
))
}
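A minimal usage sketch of this helper (hypothetical test code, not part of the change), illustrating the property described in the doc comment: the same table ID always yields the same identifier, while different table IDs yield different ones.
#[test]
fn partition_identifier_equality_follows_table_id() {
    // Same table ID plus the fixed "arbitrary" key -> the same deterministic hash ID.
    assert_eq!(partition_identifier(1), partition_identifier(1));
    // A different table ID -> a different identifier.
    assert_ne!(partition_identifier(1), partition_identifier(2));
}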

View File

@ -25,7 +25,7 @@ use generated_types::influxdata::iox::{
},
};
use hyper::{Body, Request, Response};
use ingester::{IngesterGuard, IngesterRpcInterface};
use ingester::{GossipConfig, IngesterGuard, IngesterRpcInterface};
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use ioxd_common::{
@ -210,6 +210,14 @@ pub async fn create_ingester_server_type(
) -> Result<Arc<dyn ServerType>> {
let (shutdown_tx, shutdown_rx) = oneshot::channel();
let gossip = match ingester_config.gossip_config.gossip_bind_address {
None => GossipConfig::Disabled,
Some(v) => GossipConfig::Enabled {
bind_addr: v.into(),
peers: ingester_config.gossip_config.seed_list.clone(),
},
};
let grpc = ingester::new(
catalog,
Arc::clone(&metrics),
@ -221,6 +229,7 @@ pub async fn create_ingester_server_type(
ingester_config.persist_queue_depth,
ingester_config.persist_hot_partition_cost,
object_store,
gossip,
shutdown_rx.map(|v| v.expect("shutdown sender dropped without calling shutdown")),
)
.await?;

View File

@ -10,6 +10,7 @@ async-trait = "0.1"
authz = { path = "../authz" }
clap_blocks = { path = "../clap_blocks" }
data_types = { path = "../data_types" }
gossip = { version = "0.1.0", path = "../gossip" }
hashbrown = { workspace = true }
hyper = "0.14"
iox_catalog = { path = "../iox_catalog" }

View File

@ -10,7 +10,9 @@
missing_debug_implementations,
unused_crate_dependencies
)]
#![allow(clippy::default_constructed_unit_structs)]
use gossip::NopDispatcher;
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
@ -21,7 +23,7 @@ use std::{
use async_trait::async_trait;
use authz::{Authorizer, AuthorizerInstrumentation, IoxAuthorizer};
use clap_blocks::router::RouterConfig;
use clap_blocks::{gossip::GossipConfig, router::RouterConfig};
use data_types::NamespaceName;
use hashbrown::HashMap;
use hyper::{Body, Request, Response};
@ -86,6 +88,10 @@ pub enum Error {
source: Box<dyn std::error::Error>,
addr: String,
},
/// An error binding the UDP socket for gossip communication.
#[error("failed to bind udp gossip socket: {0}")]
GossipBind(std::io::Error),
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -218,6 +224,7 @@ pub async fn create_router_server_type(
catalog: Arc<dyn Catalog>,
object_store: Arc<DynObjectStore>,
router_config: &RouterConfig,
gossip_config: &GossipConfig,
trace_context_header_name: String,
) -> Result<Arc<dyn ServerType>> {
let ingester_connections = router_config.ingester_addresses.iter().map(|addr| {
@ -333,6 +340,28 @@ pub async fn create_router_server_type(
// Record the overall request handling latency
let handler_stack = InstrumentationDecorator::new("request", &metrics, handler_stack);
// Optionally initialise the gossip subsystem.
//
// NOTE: the handle is completely unused, but needs to live as long as the
// server does to do anything useful (RAII), so it is placed in the
// RpcWriteRouterServer, which doesn't need it at all.
//
// TODO: remove handle from RpcWriteRouterServer when using handle
let gossip_handle = match gossip_config.gossip_bind_address {
Some(bind_addr) => {
let handle = gossip::Builder::new(
gossip_config.seed_list.clone(),
NopDispatcher::default(),
Arc::clone(&metrics),
)
.bind(*bind_addr)
.await
.map_err(Error::GossipBind)?;
Some(handle)
}
None => None,
};
// Initialize the HTTP API delegate
let write_request_unifier: Result<Box<dyn WriteRequestUnifier>> = match (
router_config.single_tenant_deployment,
@ -379,8 +408,13 @@ pub async fn create_router_server_type(
// `RpcWriteRouterServerType`.
let grpc = RpcWriteGrpcDelegate::new(catalog, object_store);
let router_server =
RpcWriteRouterServer::new(http, grpc, metrics, common_state.trace_collector());
let router_server = RpcWriteRouterServer::new(
http,
grpc,
metrics,
common_state.trace_collector(),
gossip_handle,
);
let server_type = Arc::new(RpcWriteRouterServerType::new(router_server, common_state));
Ok(server_type)
}

View File

@ -108,7 +108,7 @@ impl From<&ParquetFile> for ParquetFilePath {
Self {
namespace_id: f.namespace_id,
table_id: f.table_id,
partition_id: f.transition_partition_id(),
partition_id: f.partition_id.clone(),
object_store_id: f.object_store_id,
}
}
@ -119,7 +119,7 @@ impl From<&ParquetFileParams> for ParquetFilePath {
Self {
namespace_id: f.namespace_id,
table_id: f.table_id,
partition_id: f.transition_partition_id(),
partition_id: f.partition_id.clone(),
object_store_id: f.object_store_id,
}
}

View File

@ -91,7 +91,7 @@ use bytes::Bytes;
use data_types::{
ColumnId, ColumnSet, ColumnSummary, CompactionLevel, InfluxDbType, NamespaceId,
ParquetFileParams, PartitionHashId, PartitionId, PartitionKey, StatValues, Statistics, TableId,
Timestamp,
Timestamp, TransitionPartitionId,
};
use generated_types::influxdata::iox::ingester::v1 as proto;
use iox_time::Time;
@ -443,6 +443,7 @@ impl IoxMetadata {
where
F: for<'a> Fn(&'a str) -> ColumnId,
{
let partition_id = TransitionPartitionId::from((partition_id, partition_hash_id.as_ref()));
let decoded = metadata.decode().expect("invalid IOx metadata");
trace!(
?partition_id,
@ -487,7 +488,6 @@ impl IoxMetadata {
namespace_id: self.namespace_id,
table_id: self.table_id,
partition_id,
partition_hash_id,
object_store_id: self.object_store_id,
min_time,
max_time,

View File

@ -113,11 +113,13 @@ impl CatalogCache {
"ram_metadata",
RamSize(ram_pool_metadata_bytes),
Arc::clone(&metric_registry),
&Handle::current(),
));
let ram_pool_data = Arc::new(ResourcePool::new(
"ram_data",
RamSize(ram_pool_data_bytes),
Arc::clone(&metric_registry),
&Handle::current(),
));
let partition_cache = PartitionCache::new(

View File

@ -361,8 +361,8 @@ mod tests {
partition.create_parquet_file(builder).await;
let table_id = table.table.id;
let single_file_size = 240;
let two_file_size = 448;
let single_file_size = 256;
let two_file_size = 480;
assert!(single_file_size < two_file_size);
let cache = make_cache(&catalog);

View File

@ -17,7 +17,7 @@ use cache_system::{
};
use data_types::{
partition_template::{build_column_values, ColumnValue},
ColumnId, Partition, PartitionId, TransitionPartitionId,
ColumnId, Partition, TransitionPartitionId,
};
use datafusion::scalar::ScalarValue;
use iox_catalog::{interface::Catalog, partition_lookup_batch};
@ -38,7 +38,7 @@ const CACHE_ID: &str = "partition";
type CacheT = Box<
dyn Cache<
K = PartitionId,
K = TransitionPartitionId,
V = Option<CachedPartition>,
GetExtra = (Arc<CachedTable>, Option<Span>),
PeekExtra = ((), Option<Span>),
@ -49,7 +49,7 @@ type CacheT = Box<
#[derive(Debug)]
pub struct PartitionCache {
cache: CacheT,
remove_if_handle: RemoveIfHandle<PartitionId, Option<CachedPartition>>,
remove_if_handle: RemoveIfHandle<TransitionPartitionId, Option<CachedPartition>>,
flusher: Arc<dyn BatchLoaderFlusher>,
}
@ -64,7 +64,8 @@ impl PartitionCache {
testing: bool,
) -> Self {
let loader = FunctionLoader::new(
move |partition_ids: Vec<PartitionId>, cached_tables: Vec<Arc<CachedTable>>| {
move |partition_ids: Vec<TransitionPartitionId>,
cached_tables: Vec<Arc<CachedTable>>| {
// sanity checks
assert_eq!(partition_ids.len(), cached_tables.len());
@ -75,23 +76,20 @@ impl PartitionCache {
// prepare output buffer
let mut out = (0..partition_ids.len()).map(|_| None).collect::<Vec<_>>();
let mut out_map =
HashMap::<PartitionId, usize>::with_capacity(partition_ids.len());
HashMap::<TransitionPartitionId, usize>::with_capacity(partition_ids.len());
for (idx, id) in partition_ids.iter().enumerate() {
match out_map.entry(*id) {
Entry::Occupied(_) => unreachable!("cache system requested same partition from loader concurrently, this should have been prevented by the CacheDriver"),
match out_map.entry(id.clone()) {
Entry::Occupied(_) => unreachable!(
"cache system requested same partition from loader concurrently, \
this should have been prevented by the CacheDriver"
),
Entry::Vacant(v) => {
v.insert(idx);
}
}
}
// build `&[&TransitionPartitionId]` for batch catalog request
let ids = partition_ids
.iter()
.copied()
.map(TransitionPartitionId::Deprecated)
.collect::<Vec<_>>();
let ids = ids.iter().collect::<Vec<_>>();
let ids: Vec<&TransitionPartitionId> = partition_ids.iter().collect();
// fetch catalog data
let partitions = Backoff::new(&backoff_config)
@ -104,7 +102,7 @@ impl PartitionCache {
// build output
for p in partitions {
let idx = out_map[&p.id];
let idx = out_map[&p.transition_partition_id()];
let cached_table = &cached_tables[idx];
let p = CachedPartition::new(p, cached_table);
out[idx] = Some(p);
@ -180,7 +178,7 @@ impl PartitionCache {
self.remove_if_handle.remove_if_and_get(
&self.cache,
partition_id,
partition_id.clone(),
move |cached_partition| {
let invalidates = if let Some(sort_key) =
&cached_partition.and_then(|p| p.sort_key)
@ -195,7 +193,7 @@ impl PartitionCache {
if invalidates {
debug!(
partition_id = partition_id.get(),
partition_id = %partition_id,
"invalidate partition cache",
);
}
@ -217,13 +215,13 @@ impl PartitionCache {
/// Request for [`PartitionCache::get`].
#[derive(Debug)]
pub struct PartitionRequest {
pub partition_id: PartitionId,
pub partition_id: TransitionPartitionId,
pub sort_key_should_cover: Vec<ColumnId>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CachedPartition {
pub id: PartitionId,
pub id: TransitionPartitionId,
pub sort_key: Option<Arc<PartitionSortKey>>,
pub column_ranges: ColumnRanges,
}
@ -299,7 +297,7 @@ impl CachedPartition {
column_ranges.shrink_to_fit();
Self {
id: partition.id,
id: partition.transition_partition_id(),
sort_key,
column_ranges: Arc::new(column_ranges),
}
@ -368,7 +366,10 @@ mod tests {
ram::test_util::test_ram_pool, test_util::assert_catalog_access_metric_count,
};
use async_trait::async_trait;
use data_types::{partition_template::TablePartitionTemplateOverride, ColumnType};
use data_types::{
partition_template::TablePartitionTemplateOverride, ColumnType, PartitionHashId,
PartitionId, PartitionKey, TableId,
};
use futures::StreamExt;
use generated_types::influxdata::iox::partition_template::v1::{
template_part::Part, PartitionTemplate, TemplatePart,
@ -419,8 +420,11 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let sort_key1a = cache
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p1_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
@ -434,24 +438,24 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
let sort_key2 = cache
.get_one(Arc::clone(&cached_table), p2.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p2_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key2, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
let sort_key1b = cache
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p1_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
@ -461,16 +465,37 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
// non-existing partition
for _ in 0..2 {
// Non-existing partition identified by partition hash ID
let res = cache
.get_one(
Arc::clone(&cached_table),
PartitionId::new(i64::MAX),
&TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
&[],
None,
)
.await;
assert_eq!(res, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_hash_id_batch",
3,
);
// Non-existing partition identified by deprecated catalog IDs; this part can be
// removed when partition identification is fully transitioned to partition hash IDs
let res = cache
.get_one(
Arc::clone(&cached_table),
&TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
&Vec::new(),
None,
)
@ -479,7 +504,7 @@ mod tests {
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
3,
1,
);
}
}
@ -548,8 +573,14 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let p3_id = p3.transition_partition_id();
let p4_id = p4.transition_partition_id();
let p5_id = p5.transition_partition_id();
let ranges1a = cache
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
.get_one(Arc::clone(&cached_table), &p1_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -578,12 +609,12 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
let ranges2 = cache
.get_one(Arc::clone(&cached_table), p2.id, &[], None)
.get_one(Arc::clone(&cached_table), &p2_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -599,12 +630,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
let ranges3 = cache
.get_one(Arc::clone(&cached_table), p3.id, &[], None)
.get_one(Arc::clone(&cached_table), &p3_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -629,12 +660,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
let ranges4 = cache
.get_one(Arc::clone(&cached_table), p4.id, &[], None)
.get_one(Arc::clone(&cached_table), &p4_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -659,12 +690,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
4,
);
let ranges5 = cache
.get_one(Arc::clone(&cached_table), p5.id, &[], None)
.get_one(Arc::clone(&cached_table), &p5_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -680,28 +711,48 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
5,
);
let ranges1b = cache
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
.get_one(Arc::clone(&cached_table), &p1_id, &[], None)
.await
.unwrap()
.column_ranges;
assert!(Arc::ptr_eq(&ranges1a, &ranges1b));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
5,
);
// non-existing partition
for _ in 0..2 {
// Non-existing partition identified by partition hash ID
let res = cache
.get_one(
Arc::clone(&cached_table),
PartitionId::new(i64::MAX),
&TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
&[],
None,
)
.await;
assert_eq!(res, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_hash_id_batch",
6,
);
// Non-existing partition identified by deprecated catalog IDs; this part can be
// removed when partition identification is fully transitioned to partition hash IDs
let res = cache
.get_one(
Arc::clone(&cached_table),
&TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
&[],
None,
)
@ -710,7 +761,7 @@ mod tests {
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
6,
1,
);
}
}
@ -724,7 +775,7 @@ mod tests {
let c1 = t.create_column("foo", ColumnType::Tag).await;
let c2 = t.create_column("time", ColumnType::Time).await;
let p = t.create_partition("k1").await;
let p_id = p.partition.id;
let p_id = p.partition.transition_partition_id();
let p_sort_key = p.partition.sort_key();
let cached_table = Arc::new(CachedTable {
id: t.table.id,
@ -751,41 +802,41 @@ mod tests {
);
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[], None)
.get_one(Arc::clone(&cached_table), &p_id, &[], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
// requesting nothing will not expire
assert!(p_sort_key.is_none());
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[], None)
.get_one(Arc::clone(&cached_table), &p_id, &[], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
// but requesting something will expire
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
.get_one(Arc::clone(&cached_table), &p_id, &[c1.column.id], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
@ -801,7 +852,7 @@ mod tests {
// expire & fetch
let p_sort_key = p.partition.sort_key();
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
.get_one(Arc::clone(&cached_table), &p_id, &[c1.column.id], None)
.await
.unwrap()
.sort_key;
@ -815,7 +866,7 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
@ -827,7 +878,7 @@ mod tests {
vec![c1.column.id, c2.column.id],
] {
let sort_key_2 = cache
.get_one(Arc::clone(&cached_table), p_id, &should_cover, None)
.get_one(Arc::clone(&cached_table), &p_id, &should_cover, None)
.await
.unwrap()
.sort_key;
@ -837,7 +888,7 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
}
@ -847,7 +898,7 @@ mod tests {
let sort_key_2 = cache
.get_one(
Arc::clone(&cached_table),
p_id,
&p_id,
&[c1.column.id, c3.column.id],
None,
)
@ -861,7 +912,7 @@ mod tests {
assert_eq!(sort_key, sort_key_2);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
4,
);
}
@ -892,34 +943,45 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let mut res = cache
.get(
Arc::clone(&cached_table),
vec![
PartitionRequest {
partition_id: p1.id,
partition_id: p1_id.clone(),
sort_key_should_cover: vec![],
},
PartitionRequest {
partition_id: p2.id,
partition_id: p2_id.clone(),
sort_key_should_cover: vec![],
},
PartitionRequest {
partition_id: p1.id,
partition_id: p1_id.clone(),
sort_key_should_cover: vec![],
},
// requesting non-existing partitions is fine, they just don't appear in
// the output
PartitionRequest {
partition_id: TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
sort_key_should_cover: vec![],
},
PartitionRequest {
// requesting non-existing partitions is fine, they just don't appear in the output
partition_id: PartitionId::new(i64::MAX),
partition_id: TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
sort_key_should_cover: vec![],
},
],
None,
)
.await;
res.sort_by_key(|p| p.id);
let ids = res.iter().map(|p| p.id).collect::<Vec<_>>();
assert_eq!(ids, vec![p1.id, p1.id, p2.id]);
res.sort_by(|a, b| a.id.cmp(&b.id));
let ids = res.into_iter().map(|p| p.id).collect::<Vec<_>>();
assert_eq!(ids, vec![p1_id.clone(), p1_id, p2_id]);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
@ -1008,7 +1070,7 @@ mod tests {
c_id: ColumnId,
/// Partitions within that table.
partitions: Vec<PartitionId>,
partitions: Vec<TransitionPartitionId>,
}
impl ConcurrencyTestState {
@ -1032,7 +1094,7 @@ mod tests {
t.create_partition_with_sort_key(&format!("p{i}"), &["time"])
.await
.partition
.id
.transition_partition_id()
}
})
.collect::<Vec<_>>()
@ -1046,7 +1108,8 @@ mod tests {
}
}
/// Perform the actual [`PartitionCache::get`] call and run some basic sanity checks on the result.
/// Perform the actual [`PartitionCache::get`] call and run some basic sanity checks on the
/// result.
async fn run(self, cache: Arc<PartitionCache>) {
let Self {
cached_table,
@ -1060,15 +1123,15 @@ mod tests {
partitions
.iter()
.map(|p| PartitionRequest {
partition_id: *p,
partition_id: p.clone(),
sort_key_should_cover: vec![],
})
.collect(),
None,
)
.await;
results.sort_by_key(|p| p.id);
let partitions_res = results.iter().map(|p| p.id).collect::<Vec<_>>();
results.sort_by(|a, b| a.id.cmp(&b.id));
let partitions_res = results.iter().map(|p| p.id.clone()).collect::<Vec<_>>();
assert_eq!(partitions, partitions_res);
assert!(results
.iter()
@ -1086,7 +1149,7 @@ mod tests {
async fn get_one(
&self,
cached_table: Arc<CachedTable>,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
sort_key_should_cover: &[ColumnId],
span: Option<Span>,
) -> Option<CachedPartition>;
@ -1097,14 +1160,14 @@ mod tests {
async fn get_one(
&self,
cached_table: Arc<CachedTable>,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
sort_key_should_cover: &[ColumnId],
span: Option<Span>,
) -> Option<CachedPartition> {
self.get(
cached_table,
vec![PartitionRequest {
partition_id,
partition_id: partition_id.clone(),
sort_key_should_cover: sort_key_should_cover.to_vec(),
}],
span,

View File

@ -43,12 +43,14 @@ pub mod test_util {
use std::sync::Arc;
use cache_system::backend::policy::lru::ResourcePool;
use tokio::runtime::Handle;
pub fn test_ram_pool() -> Arc<ResourcePool<RamSize>> {
Arc::new(ResourcePool::new(
"pool",
RamSize(usize::MAX),
Arc::new(metric::Registry::new()),
&Handle::current(),
))
}
}

View File

@ -859,10 +859,6 @@ impl IngesterPartition {
}
}
pub(crate) fn partition_id(&self) -> PartitionId {
self.partition_id
}
pub(crate) fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}

View File

@ -1,6 +1,6 @@
use std::{collections::HashMap, sync::Arc};
use data_types::{ChunkId, ChunkOrder, ColumnId, ParquetFile, PartitionId, TransitionPartitionId};
use data_types::{ChunkId, ChunkOrder, ColumnId, ParquetFile, TransitionPartitionId};
use futures::StreamExt;
use hashbrown::HashSet;
use iox_catalog::interface::Catalog;
@ -56,7 +56,7 @@ impl ChunkAdapter {
&self,
cached_table: Arc<CachedTable>,
files: Arc<[Arc<ParquetFile>]>,
cached_partitions: &HashMap<PartitionId, CachedPartition>,
cached_partitions: &HashMap<TransitionPartitionId, CachedPartition>,
span: Option<Span>,
) -> Vec<QuerierParquetChunk> {
let span_recorder = SpanRecorder::new(span);
@ -170,18 +170,13 @@ impl ChunkAdapter {
let order = ChunkOrder::new(parquet_file.file.max_l0_created_at.get());
let partition_id = parquet_file.file.partition_id;
let transition_partition_id = TransitionPartitionId::from((
partition_id,
parquet_file.file.partition_hash_id.as_ref(),
));
let partition_id = parquet_file.file.partition_id.clone();
let meta = Arc::new(QuerierParquetChunkMeta {
chunk_id,
order,
sort_key: Some(sort_key),
partition_id,
transition_partition_id,
});
let parquet_chunk = Arc::new(ParquetChunk::new(

View File

@ -1,6 +1,6 @@
//! Querier Chunks
use data_types::{ChunkId, ChunkOrder, PartitionId, TransitionPartitionId};
use data_types::{ChunkId, ChunkOrder, TransitionPartitionId};
use datafusion::physical_plan::Statistics;
use iox_query::chunk_statistics::{create_chunk_statistics, ColumnRanges};
use parquet_file::chunk::ParquetChunk;
@ -25,10 +25,7 @@ pub struct QuerierParquetChunkMeta {
sort_key: Option<SortKey>,
/// Partition ID.
partition_id: PartitionId,
/// Transition partition ID.
transition_partition_id: TransitionPartitionId,
partition_id: TransitionPartitionId,
}
impl QuerierParquetChunkMeta {
@ -43,13 +40,8 @@ impl QuerierParquetChunkMeta {
}
/// Partition ID.
pub fn partition_id(&self) -> PartitionId {
self.partition_id
}
/// Partition ID.
pub fn transition_partition_id(&self) -> &TransitionPartitionId {
&self.transition_partition_id
pub fn partition_id(&self) -> &TransitionPartitionId {
&self.partition_id
}
}
@ -251,7 +243,7 @@ pub mod tests {
.get(
Arc::clone(&self.cached_table),
vec![PartitionRequest {
partition_id: self.parquet_file.partition_id,
partition_id: self.parquet_file.partition_id.clone(),
sort_key_should_cover: vec![],
}],
None,
@ -261,7 +253,7 @@ pub mod tests {
.next()
.unwrap();
let cached_partitions =
HashMap::from([(self.parquet_file.partition_id, cached_partition)]);
HashMap::from([(self.parquet_file.partition_id.clone(), cached_partition)]);
self.adapter
.new_chunks(
Arc::clone(&self.cached_table),

View File

@ -15,11 +15,11 @@ impl QueryChunk for QuerierParquetChunk {
}
fn partition_id(&self) -> PartitionId {
self.meta().partition_id()
unimplemented!()
}
fn transition_partition_id(&self) -> &TransitionPartitionId {
self.meta().transition_partition_id()
self.meta().partition_id()
}
fn sort_key(&self) -> Option<&SortKey> {

View File

@ -8,7 +8,7 @@ use crate::{
parquet::ChunkAdapter,
IngesterConnection,
};
use data_types::{ColumnId, NamespaceId, ParquetFile, PartitionId, TableId};
use data_types::{ColumnId, NamespaceId, ParquetFile, TableId, TransitionPartitionId};
use datafusion::error::DataFusionError;
use futures::join;
use iox_query::{provider, provider::ChunkPruner, QueryChunk};
@ -282,7 +282,7 @@ impl QuerierTable {
let chunks = partitions
.into_iter()
.filter_map(|mut c| {
let cached_partition = cached_partitions.get(&c.partition_id())?;
let cached_partition = cached_partitions.get(&c.transition_partition_id())?;
c.set_partition_column_ranges(&cached_partition.column_ranges);
Some(c)
})
@ -322,16 +322,16 @@ impl QuerierTable {
ingester_partitions: &[IngesterPartition],
parquet_files: &[Arc<ParquetFile>],
span: Option<Span>,
) -> HashMap<PartitionId, CachedPartition> {
) -> HashMap<TransitionPartitionId, CachedPartition> {
let span_recorder = SpanRecorder::new(span);
let mut should_cover: HashMap<PartitionId, HashSet<ColumnId>> =
let mut should_cover: HashMap<TransitionPartitionId, HashSet<ColumnId>> =
HashMap::with_capacity(ingester_partitions.len());
// For ingester partitions we only need the column ranges -- which are static -- not the sort key. So it is
// sufficient to collect the partition IDs.
for p in ingester_partitions {
should_cover.entry(p.partition_id()).or_default();
should_cover.entry(p.transition_partition_id()).or_default();
}
// For parquet files we must ensure that the -- potentially evolving -- sort key covers the primary key.
@ -342,7 +342,7 @@ impl QuerierTable {
.collect::<HashSet<_>>();
for f in parquet_files {
should_cover
.entry(f.partition_id)
.entry(f.partition_id.clone())
.or_default()
.extend(f.column_set.iter().copied().filter(|id| pk.contains(id)));
}
@ -366,7 +366,7 @@ impl QuerierTable {
)
.await;
partitions.into_iter().map(|p| (p.id, p)).collect()
partitions.into_iter().map(|p| (p.id.clone(), p)).collect()
}
/// Get a chunk pruner that can be used to prune chunks retrieved via [`chunks`](Self::chunks)
@ -889,7 +889,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 2);
@ -899,7 +899,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 4);
@ -912,7 +912,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
@ -922,7 +922,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 6);
@ -936,7 +936,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 8);

View File

@ -15,6 +15,7 @@ dml = { path = "../dml" }
flate2 = "1.0"
futures = "0.3.28"
generated_types = { path = "../generated_types" }
gossip = { version = "0.1.0", path = "../gossip" }
hashbrown = { workspace = true }
hyper = "0.14"
iox_catalog = { path = "../iox_catalog" }

View File

@ -16,6 +16,9 @@ pub struct RpcWriteRouterServer<D, N> {
http: HttpDelegate<D, N>,
grpc: RpcWriteGrpcDelegate,
// TODO: this shouldn't be here but it is here while it's unused elsewhere
_gossip_handle: Option<gossip::GossipHandle>,
}
impl<D, N> RpcWriteRouterServer<D, N> {
@ -26,12 +29,14 @@ impl<D, N> RpcWriteRouterServer<D, N> {
grpc: RpcWriteGrpcDelegate,
metrics: Arc<metric::Registry>,
trace_collector: Option<Arc<dyn TraceCollector>>,
gossip_handle: Option<gossip::GossipHandle>,
) -> Self {
Self {
metrics,
trace_collector,
http,
grpc,
_gossip_handle: gossip_handle,
}
}

View File

@ -18,7 +18,7 @@
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
use data_types::{PartitionId, TableId, TransitionPartitionId};
use data_types::{PartitionHashId, PartitionId, TableId, TransitionPartitionId};
use generated_types::influxdata::iox::catalog::v1::*;
use iox_catalog::interface::{Catalog, SoftDeletedRows};
use observability_deps::tracing::*;
@ -47,14 +47,14 @@ impl catalog_service_server::CatalogService for CatalogService {
) -> Result<Response<GetParquetFilesByPartitionIdResponse>, Status> {
let mut repos = self.catalog.repositories().await;
let req = request.into_inner();
let partition_id = TransitionPartitionId::Deprecated(PartitionId::new(req.partition_id));
let partition_id = to_partition_id(req.partition_identifier)?;
let parquet_files = repos
.parquet_files()
.list_by_partition_not_to_delete(&partition_id)
.await
.map_err(|e| {
warn!(error=%e, %req.partition_id, "failed to get parquet_files for partition");
warn!(error=%e, %partition_id, "failed to get parquet_files for partition");
Status::not_found(e.to_string())
})?;
@ -169,13 +169,52 @@ impl catalog_service_server::CatalogService for CatalogService {
}
}
fn to_partition_identifier(partition_id: &TransitionPartitionId) -> PartitionIdentifier {
match partition_id {
TransitionPartitionId::Deterministic(hash_id) => PartitionIdentifier {
id: Some(partition_identifier::Id::HashId(
hash_id.as_bytes().to_owned(),
)),
},
TransitionPartitionId::Deprecated(id) => PartitionIdentifier {
id: Some(partition_identifier::Id::CatalogId(id.get())),
},
}
}
fn to_partition_id(
partition_identifier: Option<PartitionIdentifier>,
) -> Result<TransitionPartitionId, Status> {
let partition_id =
match partition_identifier
.and_then(|pi| pi.id)
.ok_or(Status::invalid_argument(
"No partition identifier specified",
))? {
partition_identifier::Id::HashId(bytes) => TransitionPartitionId::Deterministic(
PartitionHashId::try_from(&bytes[..]).map_err(|e| {
Status::invalid_argument(format!(
"Could not parse bytes as a `PartitionHashId`: {e}"
))
})?,
),
partition_identifier::Id::CatalogId(id) => {
TransitionPartitionId::Deprecated(PartitionId::new(id))
}
};
Ok(partition_id)
}
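A round-trip sketch (hypothetical test code, assuming the two helpers above are in scope): converting an identifier to its protobuf form and parsing it back should return an equal value; the hash-ID variant follows the same path via PartitionHashId::try_from.
#[test]
fn partition_identifier_round_trips_through_protobuf() {
    // The deprecated catalog-ID variant survives the conversion unchanged.
    let id = TransitionPartitionId::Deprecated(PartitionId::new(42));
    let proto = to_partition_identifier(&id);
    assert_eq!(to_partition_id(Some(proto)).unwrap(), id);
}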
// converts the catalog ParquetFile to protobuf
fn to_parquet_file(p: data_types::ParquetFile) -> ParquetFile {
let partition_identifier = to_partition_identifier(&p.partition_id);
ParquetFile {
id: p.id.get(),
namespace_id: p.namespace_id.get(),
table_id: p.table_id.get(),
partition_id: p.partition_id.get(),
partition_identifier: Some(partition_identifier),
object_store_id: p.object_store_id.to_string(),
min_time: p.min_time.get(),
max_time: p.max_time.get(),
@ -191,8 +230,10 @@ fn to_parquet_file(p: data_types::ParquetFile) -> ParquetFile {
// converts the catalog Partition to protobuf
fn to_partition(p: data_types::Partition) -> Partition {
let identifier = to_partition_identifier(&p.transition_partition_id());
Partition {
id: p.id.get(),
identifier: Some(identifier),
key: p.partition_key.to_string(),
table_id: p.table_id.get(),
array_sort_key: p.sort_key,
@ -230,8 +271,7 @@ mod tests {
let p1params = ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(5),
@ -248,13 +288,15 @@ mod tests {
};
p1 = repos.parquet_files().create(p1params).await.unwrap();
p2 = repos.parquet_files().create(p2params).await.unwrap();
partition_id = partition.id;
partition_id = partition.transition_partition_id();
Arc::clone(&catalog)
};
let partition_identifier = to_partition_identifier(&partition_id);
let grpc = super::CatalogService::new(catalog);
let request = GetParquetFilesByPartitionIdRequest {
partition_id: partition_id.get(),
partition_identifier: Some(partition_identifier),
};
let tonic_response = grpc

View File

@ -75,7 +75,7 @@ impl object_store_service_server::ObjectStoreService for ObjectStoreService {
let path = ParquetFilePath::new(
parquet_file.namespace_id,
parquet_file.table_id,
&parquet_file.transition_partition_id(),
&parquet_file.partition_id.clone(),
parquet_file.object_store_id,
);
let path = path.object_store_path();
@ -128,8 +128,7 @@ mod tests {
let p1params = ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(5),
@ -150,7 +149,7 @@ mod tests {
let path = ParquetFilePath::new(
p1.namespace_id,
p1.table_id,
&p1.transition_partition_id(),
&p1.partition_id.clone(),
p1.object_store_id,
);
let path = path.object_store_path();

View File

@ -25,3 +25,4 @@ sysinfo = "0.29.7"
tempfile = "3.7.0"
# Need the multi-threaded executor for testing
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread", "time"] }
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }

View File

@ -1,7 +1,10 @@
use std::{borrow::Cow, path::PathBuf, time::Duration};
use std::borrow::Cow;
use std::path::PathBuf;
use std::time::Duration;
use metric::{Attributes, U64Gauge};
use sysinfo::{DiskExt, RefreshKind, System, SystemExt};
use tokio::sync::watch;
/// The interval at which disk metrics are updated.
///
@ -9,6 +12,32 @@ use sysinfo::{DiskExt, RefreshKind, System, SystemExt};
/// interval.
const UPDATE_INTERVAL: Duration = Duration::from_secs(13);
/// An immutable snapshot of space and usage statistics for some disk.
#[derive(Clone, Copy, Debug)]
pub struct DiskSpaceSnapshot {
available_disk_space: u64,
total_disk_space: u64,
}
impl DiskSpaceSnapshot {
/// The available space in bytes on the disk.
pub fn available_disk_space(&self) -> u64 {
self.available_disk_space
}
/// The maximum capacity in bytes of the disk.
pub fn total_disk_space(&self) -> u64 {
self.total_disk_space
}
/// Overall usage of the disk, as a percentage [0.0, 1.0].
#[inline]
pub fn disk_usage_ratio(&self) -> f64 {
debug_assert!(self.available_disk_space <= self.total_disk_space);
1.0 - (self.available_disk_space as f64 / self.total_disk_space as f64)
}
}
/// A periodic reporter of disk capacity / free statistics for a given
/// directory.
#[derive(Debug)]
@ -22,12 +51,19 @@ pub struct DiskSpaceMetrics {
/// The index into [`System::disks()`] for the disk containing the observed
/// directory.
disk_idx: usize,
/// A stream of [`DiskSpaceSnapshot`] produced by the metric reporter for
/// consumption by any listeners.
snapshot_tx: watch::Sender<DiskSpaceSnapshot>,
}
impl DiskSpaceMetrics {
/// Create a new [`DiskSpaceMetrics`], returning [`None`] if no disk can be
/// found for the specified `directory`.
pub fn new(directory: PathBuf, registry: &metric::Registry) -> Option<Self> {
pub fn new(
directory: PathBuf,
registry: &metric::Registry,
) -> Option<(Self, watch::Receiver<DiskSpaceSnapshot>)> {
let path: Cow<'static, str> = Cow::from(directory.display().to_string());
let mut directory = directory.canonicalize().ok()?;
@ -52,14 +88,14 @@ impl DiskSpaceMetrics {
// Resolve the mount point once.
// The directory path may be `/path/to/dir` and the mount point is `/`.
let disk_idx = loop {
if let Some((idx, _disk)) = system
let (disk_idx, initial_disk) = loop {
if let Some((idx, disk)) = system
.disks()
.iter()
.enumerate()
.find(|(_idx, disk)| disk.mount_point() == directory)
{
break idx;
break (idx, disk);
}
// The mount point for this directory could not be found.
if !directory.pop() {
@ -67,18 +103,26 @@ impl DiskSpaceMetrics {
}
};
Some(Self {
available_disk_space,
total_disk_space,
system,
disk_idx,
})
let (snapshot_tx, snapshot_rx) = watch::channel(DiskSpaceSnapshot {
available_disk_space: initial_disk.available_space(),
total_disk_space: initial_disk.total_space(),
});
Some((
Self {
available_disk_space,
total_disk_space,
system,
disk_idx,
snapshot_tx,
},
snapshot_rx,
))
}
/// Start the [`DiskSpaceMetrics`] evaluation loop, blocking forever.
pub async fn run(mut self) {
let mut interval = tokio::time::interval(UPDATE_INTERVAL);
loop {
interval.tick().await;
@ -93,6 +137,13 @@ impl DiskSpaceMetrics {
self.available_disk_space.set(disk.available_space());
self.total_disk_space.set(disk.total_space());
// Produce and send a [`DiskSpaceSnapshot`] for any listeners
// that might exist.
_ = self.snapshot_tx.send(DiskSpaceSnapshot {
available_disk_space: disk.available_space(),
total_disk_space: disk.total_space(),
});
}
}
}
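A consumer sketch (hypothetical caller code, not part of this change) showing how the returned watch receiver might be used alongside the metric loop; the directory, the spawning layout, and the 0.9 threshold are illustrative assumptions only.
async fn watch_disk_space() {
    let registry = metric::Registry::new();
    let (metrics, mut snapshot_rx) = DiskSpaceMetrics::new(std::path::PathBuf::from("/"), &registry)
        .expect("a mount point for '/' should resolve");
    // Run the reporter in the background; keep the receiver for snapshots.
    tokio::spawn(metrics.run());
    while snapshot_rx.changed().await.is_ok() {
        // `DiskSpaceSnapshot` is `Copy`, so the watch borrow can be dereferenced.
        let snapshot = *snapshot_rx.borrow();
        if snapshot.disk_usage_ratio() > 0.9 {
            // React to low free space here (illustrative policy only).
        }
    }
}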
@ -103,6 +154,7 @@ mod tests {
use metric::Metric;
use tempfile::tempdir_in;
use test_helpers::timeout::FutureTimeout;
use super::*;
@ -121,11 +173,9 @@ mod tests {
let registry = Arc::new(metric::Registry::new());
let _handle = tokio::spawn(
DiskSpaceMetrics::new(pathbuf, &registry)
.expect("root always exists")
.run(),
);
let (_handle, mut snapshot_rx) =
DiskSpaceMetrics::new(pathbuf, &registry).expect("root always exists");
let _handle = tokio::spawn(_handle.run());
// Wait for the metric to be emitted and non-zero - this should be very
// quick!
@ -151,10 +201,45 @@ mod tests {
.fetch();
if recorded_free_metric > 0 && recorded_total_metric > 0 {
snapshot_rx
.changed()
.with_timeout_panic(Duration::from_secs(5))
.await
.expect("snapshot value should have changed");
let snapshot = *snapshot_rx.borrow();
assert_eq!(snapshot.available_disk_space, recorded_free_metric);
assert_eq!(snapshot.total_disk_space, recorded_total_metric);
return;
}
tokio::time::sleep(Duration::from_millis(50)).await;
}
}
// Token test to assert disk usage ratio
#[test]
fn assert_disk_usage_ratio() {
// 80% used
let snapshot = DiskSpaceSnapshot {
available_disk_space: 2000,
total_disk_space: 10000,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.8);
// 90% used
let snapshot = DiskSpaceSnapshot {
available_disk_space: 2000,
total_disk_space: 20000,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.9);
// Free!
let snapshot = DiskSpaceSnapshot {
available_disk_space: 42,
total_disk_space: 42,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.0);
}
}