feat: Add a function to the catalog to fetch level 1 parquet files

Fixes #3946.
pull/24376/head
Carol (Nichols || Goulding) 2022-03-10 16:06:37 -05:00
parent f184b7023c
commit 1dacf567d9
GPG Key ID: E907EE5A736F87D4
7 changed files with 402 additions and 36 deletions

View File

@@ -2,10 +2,13 @@
use crate::{
query::QueryableParquetChunk,
utils::{CompactedData, ParquetFileWithTombstone, TablePartition},
utils::{CompactedData, ParquetFileWithTombstone},
};
use backoff::BackoffConfig;
use data_types2::{ParquetFile, ParquetFileId, PartitionId, SequencerId, TableId, TombstoneId};
use data_types2::{
ParquetFile, ParquetFileId, PartitionId, SequencerId, TableId, TablePartition, Timestamp,
TombstoneId,
};
use datafusion::error::DataFusionError;
use iox_catalog::interface::Catalog;
use object_store::ObjectStore;
@@ -98,6 +101,11 @@ pub enum Error {
source: iox_catalog::interface::Error,
},
#[snafu(display("Error while requesting level 1 parquet files {}", source))]
Level1 {
source: iox_catalog::interface::Error,
},
#[snafu(display("Error updating catalog {}", source))]
Update {
source: iox_catalog::interface::Error,
@@ -156,6 +164,21 @@ impl Compactor {
.context(Level0Snafu)
}
async fn level_1_parquet_files(
&self,
table_partition: TablePartition,
min_time: Timestamp,
max_time: Timestamp,
) -> Result<Vec<ParquetFile>> {
let mut repos = self.catalog.repositories().await;
repos
.parquet_files()
.level_1(table_partition, min_time, max_time)
.await
.context(Level1Snafu)
}
async fn update_to_level_1(&self, parquet_file_ids: &[ParquetFileId]) -> Result<()> {
let mut repos = self.catalog.repositories().await;
@@ -189,12 +212,29 @@ impl Compactor {
// Read level-0 parquet files
let level_0_files = self.level_0_parquet_files(sequencer_id).await?;
// If there are no level-0 parquet files, return because there's nothing to do
if level_0_files.is_empty() {
return Ok(());
}
// Group files by table partition
let mut partitions = Self::group_parquet_files_into_partition(level_0_files);
// Get level-1 files overlapped with level-0
for (_key, val) in &mut partitions.iter_mut() {
let level_1_files: Vec<ParquetFile> = vec![]; // TODO: #3946
// Get level-1 files overlapped in time with level-0
for (key, val) in &mut partitions.iter_mut() {
let overall_min_time = val
.iter()
.map(|pf| pf.min_time)
.min()
.expect("The list of files was checked for emptiness above");
let overall_max_time = val
.iter()
.map(|pf| pf.max_time)
.max()
.expect("The list of files was checked for emptiness above");
let level_1_files = self
.level_1_parquet_files(*key, overall_min_time, overall_max_time)
.await?;
val.extend(level_1_files);
}
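
The loop above folds the level-0 files of each partition down to a single overall time window before asking the catalog for overlapping level-1 files. A minimal sketch of that window computation, using a hypothetical FileTimes struct with plain i64 timestamps instead of the catalog types:

struct FileTimes {
    min_time: i64,
    max_time: i64,
}

// Returns None for an empty slice instead of panicking like the expect() calls above.
fn overall_window(files: &[FileTimes]) -> Option<(i64, i64)> {
    let overall_min = files.iter().map(|f| f.min_time).min()?;
    let overall_max = files.iter().map(|f| f.max_time).max()?;
    Some((overall_min, overall_max))
}
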
@@ -285,7 +325,8 @@ impl Compactor {
}
// Compact given files. Assume the given files are overlapped in time.
// If the assumption does not meet, we will spend time not to compact anything but put data together
// If the assumption does not hold, we will spend time merging the data without actually
// compacting anything
async fn compact(
&self,
overlapped_files: Vec<ParquetFileWithTombstone>,

View File

@@ -1,38 +1,15 @@
//! Helpers of the Compactor
use std::{collections::HashSet, sync::Arc};
use crate::query::QueryableParquetChunk;
use arrow::record_batch::RecordBatch;
use data_types2::{
ParquetFile, ParquetFileId, PartitionId, SequencerId, TableId, Tombstone, TombstoneId,
};
use data_types2::{ParquetFile, ParquetFileId, Tombstone, TombstoneId};
use iox_object_store::IoxObjectStore;
use object_store::ObjectStore;
use parquet_file::{
chunk::{new_parquet_chunk, ChunkMetrics, DecodedParquetFile},
metadata::IoxMetadata,
};
use crate::query::QueryableParquetChunk;
/// Define table partition
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TablePartition {
sequencer_id: SequencerId,
table_id: TableId,
partition_id: PartitionId,
}
impl TablePartition {
/// Return a new table partition
pub fn new(sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId) -> Self {
Self {
sequencer_id,
table_id,
partition_id,
}
}
}
use std::{collections::HashSet, sync::Arc};
/// Wrapper of a parquet file and its tombstones
#[allow(missing_docs)]

View File

@@ -17,6 +17,7 @@ use std::{
collections::BTreeMap,
convert::TryFrom,
fmt::{Debug, Formatter},
ops::{Add, Sub},
sync::Arc,
};
use uuid::Uuid;
@@ -189,6 +190,29 @@ impl std::fmt::Display for PartitionId {
}
}
/// Combination of Sequencer ID, Table ID, and Partition ID useful for identifying groups of
/// Parquet files to be compacted together.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TablePartition {
/// The sequencer ID
pub sequencer_id: SequencerId,
/// The table ID
pub table_id: TableId,
/// The partition ID
pub partition_id: PartitionId,
}
impl TablePartition {
/// Combine a sequencer ID, table ID, and partition ID into a table partition
pub fn new(sequencer_id: SequencerId, table_id: TableId, partition_id: PartitionId) -> Self {
Self {
sequencer_id,
table_id,
partition_id,
}
}
}
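
As a short usage sketch (the ID values are made up, and this assumes the usual new(i64) constructors on the ID newtypes):

use data_types2::{PartitionId, SequencerId, TableId, TablePartition};

fn table_partition_example() {
    // Identify one partition of one table on one sequencer.
    let table_partition = TablePartition::new(
        SequencerId::new(1),
        TableId::new(2),
        PartitionId::new(3),
    );
    assert_eq!(table_partition.table_id, TableId::new(2));
}
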
/// Unique ID for a `Tombstone`
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)]
#[sqlx(transparent)]
@@ -234,6 +258,22 @@ impl Timestamp {
}
}
impl Add<i64> for Timestamp {
type Output = Self;
fn add(self, other: i64) -> Self {
Self(self.0 + other)
}
}
impl Sub<i64> for Timestamp {
type Output = Self;
fn sub(self, other: i64) -> Self {
Self(self.0 - other)
}
}
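
These operator impls are what let the tests below write expressions like query_min_time + 1. A small illustration, assuming Timestamp's derived PartialEq:

use data_types2::Timestamp;

fn timestamp_arithmetic_example() {
    let query_min_time = Timestamp::new(5);
    let query_max_time = Timestamp::new(10);
    // Adding or subtracting an i64 shifts the wrapped nanosecond value.
    assert_eq!(query_min_time + 1, Timestamp::new(6));
    assert_eq!(query_max_time - 1, Timestamp::new(9));
}
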
/// Unique ID for a `ParquetFile`
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)]
#[sqlx(transparent)]

View File

@@ -5,7 +5,8 @@ use data_types2::{
Column, ColumnSchema, ColumnType, KafkaPartition, KafkaTopic, KafkaTopicId, Namespace,
NamespaceId, NamespaceSchema, ParquetFile, ParquetFileId, ParquetFileParams, Partition,
PartitionId, PartitionInfo, ProcessedTombstone, QueryPool, QueryPoolId, SequenceNumber,
Sequencer, SequencerId, Table, TableId, TableSchema, Timestamp, Tombstone, TombstoneId,
Sequencer, SequencerId, Table, TableId, TablePartition, TableSchema, Timestamp, Tombstone,
TombstoneId,
};
use snafu::{OptionExt, Snafu};
use std::{collections::BTreeMap, convert::TryFrom, fmt::Debug, sync::Arc};
@@ -441,6 +442,16 @@ pub trait ParquetFileRepo: Send + Sync {
/// define a file as a candidate for compaction
async fn level_0(&mut self, sequencer_id: SequencerId) -> Result<Vec<ParquetFile>>;
/// List parquet files for a given table partition, in a given time range, with compaction
/// level 1, and other criteria that define a file as a candidate for compaction with a level 0
/// file
async fn level_1(
&mut self,
table_partition: TablePartition,
min_time: Timestamp,
max_time: Timestamp,
) -> Result<Vec<ParquetFile>>;
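
A rough sketch of how a caller might reach this method through the catalog, mirroring level_1_parquet_files in the compactor change above (the free function and its name are illustrative):

use std::sync::Arc;

use data_types2::{ParquetFile, TablePartition, Timestamp};
use iox_catalog::interface::{Catalog, Error};

async fn fetch_level_1(
    catalog: Arc<dyn Catalog>,
    table_partition: TablePartition,
    min_time: Timestamp,
    max_time: Timestamp,
) -> Result<Vec<ParquetFile>, Error> {
    // Get a repository collection from the catalog and forward to the new method.
    let mut repos = catalog.repositories().await;
    repos
        .parquet_files()
        .level_1(table_partition, min_time, max_time)
        .await
}
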
/// Update the compaction level of the specified parquet files to level 1. Returns the IDs
/// of the files that were successfully updated.
async fn update_to_level_1(
@@ -551,6 +562,7 @@ pub(crate) mod test_helpers {
test_tombstone(Arc::clone(&catalog)).await;
test_parquet_file(Arc::clone(&catalog)).await;
test_parquet_file_compaction_level_0(Arc::clone(&catalog)).await;
test_parquet_file_compaction_level_1(Arc::clone(&catalog)).await;
test_update_to_compaction_level_1(Arc::clone(&catalog)).await;
test_add_parquet_file_with_tombstones(Arc::clone(&catalog)).await;
test_txn_isolation(Arc::clone(&catalog)).await;
@@ -1541,6 +1553,230 @@
);
}
async fn test_parquet_file_compaction_level_1(catalog: Arc<dyn Catalog>) {
let mut repos = catalog.repositories().await;
let kafka = repos.kafka_topics().create_or_get("foo").await.unwrap();
let pool = repos.query_pools().create_or_get("foo").await.unwrap();
let namespace = repos
.namespaces()
.create(
"namespace_parquet_file_compaction_level_1_test",
"inf",
kafka.id,
pool.id,
)
.await
.unwrap();
let table = repos
.tables()
.create_or_get("test_table", namespace.id)
.await
.unwrap();
let other_table = repos
.tables()
.create_or_get("test_table2", namespace.id)
.await
.unwrap();
let sequencer = repos
.sequencers()
.create_or_get(&kafka, KafkaPartition::new(100))
.await
.unwrap();
let other_sequencer = repos
.sequencers()
.create_or_get(&kafka, KafkaPartition::new(101))
.await
.unwrap();
let partition = repos
.partitions()
.create_or_get("one", sequencer.id, table.id)
.await
.unwrap();
let other_partition = repos
.partitions()
.create_or_get("two", sequencer.id, table.id)
.await
.unwrap();
// Set up the time window we're interested in getting level 1 files for
let query_min_time = Timestamp::new(5);
let query_max_time = Timestamp::new(10);
// Create a file with times entirely within the window
let parquet_file_params = ParquetFileParams {
sequencer_id: sequencer.id,
table_id: partition.table_id,
partition_id: partition.id,
object_store_id: Uuid::new_v4(),
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(140),
min_time: query_min_time + 1,
max_time: query_max_time - 1,
file_size_bytes: 1337,
parquet_metadata: b"md1".to_vec(),
row_count: 0,
created_at: Timestamp::new(1),
};
let parquet_file = repos
.parquet_files()
.create(parquet_file_params.clone())
.await
.unwrap();
// Create a file that will remain as level 0
let level_0_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
let _level_0_file = repos.parquet_files().create(level_0_params).await.unwrap();
// Create a file completely before the window
let too_early_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
min_time: query_min_time - 2,
max_time: query_min_time - 1,
..parquet_file_params.clone()
};
let too_early_file = repos
.parquet_files()
.create(too_early_params)
.await
.unwrap();
// Create a file overlapping the window on the lower end
let overlap_lower_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
min_time: query_min_time - 1,
max_time: query_min_time + 1,
..parquet_file_params.clone()
};
let overlap_lower_file = repos
.parquet_files()
.create(overlap_lower_params)
.await
.unwrap();
// Create a file overlapping the window on the upper end
let overlap_upper_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
min_time: query_max_time - 1,
max_time: query_max_time + 1,
..parquet_file_params.clone()
};
let overlap_upper_file = repos
.parquet_files()
.create(overlap_upper_params)
.await
.unwrap();
// Create a file completely after the window
let too_late_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
min_time: query_max_time + 1,
max_time: query_max_time + 2,
..parquet_file_params.clone()
};
let too_late_file = repos.parquet_files().create(too_late_params).await.unwrap();
// Create a file for some other sequencer
let other_sequencer_params = ParquetFileParams {
sequencer_id: other_sequencer.id,
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
let other_sequencer_file = repos
.parquet_files()
.create(other_sequencer_params)
.await
.unwrap();
// Create a file for the same sequencer but a different table
let other_table_params = ParquetFileParams {
table_id: other_table.id,
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
let other_table_file = repos
.parquet_files()
.create(other_table_params)
.await
.unwrap();
// Create a file for the same sequencer and table but a different partition
let other_partition_params = ParquetFileParams {
partition_id: other_partition.id,
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
let other_partition_file = repos
.parquet_files()
.create(other_partition_params)
.await
.unwrap();
// Create a file too big to be considered
let too_big_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
file_size_bytes: MAX_COMPACT_SIZE + 1,
..parquet_file_params.clone()
};
let too_big_file = repos.parquet_files().create(too_big_params).await.unwrap();
// Create a file marked to be deleted
let to_delete_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
let to_delete_file = repos
.parquet_files()
.create(to_delete_params)
.await
.unwrap();
repos
.parquet_files()
.flag_for_delete(to_delete_file.id)
.await
.unwrap();
// Make all but _level_0_file compaction level 1
repos
.parquet_files()
.update_to_level_1(&[
parquet_file.id,
too_early_file.id,
too_late_file.id,
overlap_lower_file.id,
overlap_upper_file.id,
other_sequencer_file.id,
other_table_file.id,
other_partition_file.id,
too_big_file.id,
to_delete_file.id,
])
.await
.unwrap();
// Level 1 parquet files for a table partition should contain only those that match the right
// criteria
let table_partition = TablePartition::new(sequencer.id, table.id, partition.id);
let level_1 = repos
.parquet_files()
.level_1(table_partition, query_min_time, query_max_time)
.await
.unwrap();
let mut level_1_ids: Vec<_> = level_1.iter().map(|pf| pf.id).collect();
level_1_ids.sort();
let expected = vec![parquet_file, overlap_lower_file, overlap_upper_file];
let mut expected_ids: Vec<_> = expected.iter().map(|pf| pf.id).collect();
expected_ids.sort();
assert_eq!(
level_1_ids, expected_ids,
"\nlevel 1: {:#?}\nexpected: {:#?}",
level_1, expected,
);
}
async fn test_update_to_compaction_level_1(catalog: Arc<dyn Catalog>) {
let mut repos = catalog.repositories().await;
let kafka = repos.kafka_topics().create_or_get("foo").await.unwrap();

View File

@@ -15,7 +15,7 @@ use data_types2::{
Column, ColumnId, ColumnType, KafkaPartition, KafkaTopic, KafkaTopicId, Namespace, NamespaceId,
ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionInfo,
ProcessedTombstone, QueryPool, QueryPoolId, SequenceNumber, Sequencer, SequencerId, Table,
TableId, Timestamp, Tombstone, TombstoneId,
TableId, TablePartition, Timestamp, Tombstone, TombstoneId,
};
use observability_deps::tracing::warn;
use std::fmt::Formatter;
@@ -859,6 +859,31 @@ impl ParquetFileRepo for MemTxn {
.collect())
}
async fn level_1(
&mut self,
table_partition: TablePartition,
min_time: Timestamp,
max_time: Timestamp,
) -> Result<Vec<ParquetFile>> {
let stage = self.stage();
Ok(stage
.parquet_files
.iter()
.filter(|f| {
f.sequencer_id == table_partition.sequencer_id
&& f.table_id == table_partition.table_id
&& f.partition_id == table_partition.partition_id
&& f.compaction_level == 1
&& !f.to_delete
&& f.file_size_bytes <= MAX_COMPACT_SIZE
&& ((f.min_time <= min_time && f.max_time >= min_time)
|| (f.min_time > min_time && f.min_time <= max_time))
})
.cloned()
.collect())
}
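
The two-armed time condition here is an overlap check against the [min_time, max_time] window, split on whether a file starts at or before the window or inside it. A small sketch of the same predicate on plain i64 values (the overlaps function is illustrative):

fn overlaps(file_min: i64, file_max: i64, min_time: i64, max_time: i64) -> bool {
    // Either the file starts at or before min_time and is still open at min_time,
    // or it starts somewhere inside the (min_time, max_time] window.
    (file_min <= min_time && file_max >= min_time)
        || (file_min > min_time && file_min <= max_time)
}

// For the window [5, 10] used in the test above:
// overlaps(6, 9, 5, 10)   -> true   (entirely within)
// overlaps(4, 6, 5, 10)   -> true   (overlaps the lower end)
// overlaps(9, 11, 5, 10)  -> true   (overlaps the upper end)
// overlaps(3, 4, 5, 10)   -> false  (entirely before)
// overlaps(11, 12, 5, 10) -> false  (entirely after)
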
async fn update_to_level_1(
&mut self,
parquet_file_ids: &[ParquetFileId],

View File

@@ -10,7 +10,7 @@ use data_types2::{
Column, ColumnType, KafkaPartition, KafkaTopic, KafkaTopicId, Namespace, NamespaceId,
ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionInfo,
ProcessedTombstone, QueryPool, QueryPoolId, SequenceNumber, Sequencer, SequencerId, Table,
TableId, Timestamp, Tombstone, TombstoneId,
TableId, TablePartition, Timestamp, Tombstone, TombstoneId,
};
use metric::{Metric, U64Histogram, U64HistogramOptions};
use std::{fmt::Debug, sync::Arc};
@@ -263,6 +263,7 @@ decorate!(
"parquet_list_by_sequencer_greater_than" = list_by_sequencer_greater_than(&mut self, sequencer_id: SequencerId, sequence_number: SequenceNumber) -> Result<Vec<ParquetFile>>;
"parquet_list_by_namespace_not_to_delete" = list_by_namespace_not_to_delete(&mut self, namespace_id: NamespaceId) -> Result<Vec<ParquetFile>>;
"parquet_level_0" = level_0(&mut self, sequencer_id: SequencerId) -> Result<Vec<ParquetFile>>;
"parquet_level_1" = level_1(&mut self, table_partition: TablePartition, min_time: Timestamp, max_time: Timestamp) -> Result<Vec<ParquetFile>>;
"parquet_update_to_level_1" = update_to_level_1(&mut self, parquet_file_ids: &[ParquetFileId]) -> Result<Vec<ParquetFileId>>;
"parquet_exist" = exist(&mut self, id: ParquetFileId) -> Result<bool>;
"parquet_count" = count(&mut self) -> Result<i64>;

View File

@@ -14,7 +14,7 @@ use data_types2::{
Column, ColumnType, KafkaPartition, KafkaTopic, KafkaTopicId, Namespace, NamespaceId,
ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionInfo,
ProcessedTombstone, QueryPool, QueryPoolId, SequenceNumber, Sequencer, SequencerId, Table,
TableId, Timestamp, Tombstone, TombstoneId,
TableId, TablePartition, Timestamp, Tombstone, TombstoneId,
};
use observability_deps::tracing::{info, warn};
use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Acquire, Executor, Postgres, Row};
@@ -1300,6 +1300,52 @@ WHERE parquet_file.sequencer_id = $1
.map_err(|e| Error::SqlxError { source: e })
}
async fn level_1(
&mut self,
table_partition: TablePartition,
min_time: Timestamp,
max_time: Timestamp,
) -> Result<Vec<ParquetFile>> {
sqlx::query_as::<_, ParquetFile>(
r#"
SELECT
parquet_file.id as id,
parquet_file.sequencer_id as sequencer_id,
parquet_file.table_id as table_id,
parquet_file.partition_id as partition_id,
parquet_file.object_store_id as object_store_id,
parquet_file.min_sequence_number as min_sequence_number,
parquet_file.max_sequence_number as max_sequence_number,
parquet_file.min_time as min_time,
parquet_file.max_time as max_time,
parquet_file.to_delete as to_delete,
parquet_file.file_size_bytes as file_size_bytes,
parquet_file.parquet_metadata as parquet_metadata,
parquet_file.row_count as row_count,
parquet_file.compaction_level as compaction_level,
parquet_file.created_at as created_at
FROM parquet_file
WHERE parquet_file.sequencer_id = $1
AND parquet_file.table_id = $2
AND parquet_file.partition_id = $3
AND parquet_file.compaction_level = 1
AND parquet_file.to_delete = false
AND parquet_file.file_size_bytes <= $4
AND ((parquet_file.min_time <= $5 AND parquet_file.max_time >= $5)
OR (parquet_file.min_time > $5 AND parquet_file.min_time <= $6))
;"#,
)
.bind(&table_partition.sequencer_id) // $1
.bind(&table_partition.table_id) // $2
.bind(&table_partition.partition_id) // $3
.bind(MAX_COMPACT_SIZE) // $4
.bind(min_time) // $5
.bind(max_time) // $6
.fetch_all(&mut self.inner)
.await
.map_err(|e| Error::SqlxError { source: e })
}
async fn update_to_level_1(
&mut self,
parquet_file_ids: &[ParquetFileId],