//! Queryable Compactor Data
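//!
//! Wraps a compactor parquet file in the query traits ([`QueryChunk`] and [`QueryChunkMeta`])
//! needed to build a query plan over it.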

use data_types::{
    ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber,
    TableSummary, Timestamp, TimestampMinMax, Tombstone,
};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
    exec::{stringset::StringSet, IOxSessionContext},
    QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use parquet_file::chunk::ParquetChunk;
use predicate::{delete_predicate::tombstones_to_delete_predicates, Predicate};
use schema::{merge::SchemaMerger, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use std::{any::Any, sync::Arc};
use uuid::Uuid;

#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
    #[snafu(display("Failed to read parquet: {}", source))]
    ReadParquet {
        source: parquet_file::storage::ReadError,
    },

    #[snafu(display("Failed to read IOx metadata from parquet: {}", source))]
    ReadParquetMeta {
        source: parquet_file::storage::ReadError,
    },
}

/// A specialized `Error` for the compactor's query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;

/// QueryableParquetChunk that implements QueryChunk and QueryChunkMeta for building query plans
#[derive(Debug, Clone)]
pub struct QueryableParquetChunk {
    data: Arc<ParquetChunk>,                      // data of the parquet file
    delete_predicates: Vec<Arc<DeletePredicate>>, // converted from tombstones
    table_name: String,                           // needed to build query plan
    partition_id: PartitionId,
    max_sequence_number: SequenceNumber,
    min_time: Timestamp,
    max_time: Timestamp,
    sort_key: Option<SortKey>,
    partition_sort_key: Option<SortKey>,
    compaction_level: CompactionLevel,
    /// The compaction level this operation will produce when finished. Chunks from files already
    /// at this level should get chunk order 0 so that files at a lower compaction level (and thus
    /// created later) have priority in deduplication.
    ///
    /// That is:
    ///
    /// * When compacting L0 + L1, the target level is L1. L0 files should have priority, so all
    ///   L1 files get chunk order 0 to be sorted first.
    /// * When compacting L1 + L2, the target level is L2. L1 files should have priority, so all
    ///   L2 files get chunk order 0 to be sorted first.
    target_level: CompactionLevel,
}

impl QueryableParquetChunk {
    /// Initialize a QueryableParquetChunk
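    ///
    /// A construction sketch with hypothetical argument values (not compiled as a doctest):
    ///
    /// ```ignore
    /// let chunk = QueryableParquetChunk::new(
    ///     "table",
    ///     partition_id,
    ///     parquet_chunk,            // Arc<ParquetChunk>
    ///     &tombstones,              // &[Tombstone], converted to delete predicates
    ///     max_sequence_number,
    ///     min_time,
    ///     max_time,
    ///     None,                     // sort_key
    ///     None,                     // partition_sort_key
    ///     CompactionLevel::Initial,
    ///     CompactionLevel::FileNonOverlapped,
    /// );
    /// ```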
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        table_name: impl Into<String>,
        partition_id: PartitionId,
        data: Arc<ParquetChunk>,
        deletes: &[Tombstone],
        max_sequence_number: SequenceNumber,
        min_time: Timestamp,
        max_time: Timestamp,
        sort_key: Option<SortKey>,
        partition_sort_key: Option<SortKey>,
        compaction_level: CompactionLevel,
        target_level: CompactionLevel,
    ) -> Self {
        let delete_predicates = tombstones_to_delete_predicates(deletes);
        Self {
            data,
            delete_predicates,
            table_name: table_name.into(),
            partition_id,
            max_sequence_number,
            min_time,
            max_time,
            sort_key,
            partition_sort_key,
            compaction_level,
            target_level,
        }
    }

    /// Merge the schemas of the given chunks
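    ///
    /// A minimal usage sketch (hypothetical chunks, not compiled as a doctest):
    ///
    /// ```ignore
    /// let chunks: Vec<Arc<dyn QueryChunk>> = vec![chunk_a, chunk_b];
    /// let merged: Arc<Schema> = QueryableParquetChunk::merge_schemas(&chunks);
    /// ```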
    pub fn merge_schemas(chunks: &[Arc<dyn QueryChunk>]) -> Arc<Schema> {
        let mut merger = SchemaMerger::new();
        for chunk in chunks {
            merger = merger.merge(&chunk.schema()).expect("schemas compatible");
        }
        merger.build()
    }

    /// Return max sequence number
    pub fn max_sequence_number(&self) -> SequenceNumber {
        self.max_sequence_number
    }

    /// Return min time
    pub fn min_time(&self) -> i64 {
        self.min_time.get()
    }

    /// Return max time
    pub fn max_time(&self) -> i64 {
        self.max_time.get()
    }

    /// Return the parquet file's object store id
    pub fn object_store_id(&self) -> Uuid {
        self.data.object_store_id()
    }
}

impl QueryChunkMeta for QueryableParquetChunk {
    fn summary(&self) -> Option<Arc<TableSummary>> {
        None
    }

    fn schema(&self) -> Arc<Schema> {
        self.data.schema()
    }

    fn partition_sort_key(&self) -> Option<&SortKey> {
        self.partition_sort_key.as_ref()
    }

    fn partition_id(&self) -> PartitionId {
        self.partition_id
    }

    fn sort_key(&self) -> Option<&SortKey> {
        self.sort_key.as_ref()
    }

    fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
        self.delete_predicates.as_ref()
    }

    fn timestamp_min_max(&self) -> Option<TimestampMinMax> {
        Some(TimestampMinMax {
            min: self.min_time(),
            max: self.max_time(),
        })
    }
}

impl QueryChunk for QueryableParquetChunk {
    // This function is needed to distinguish ParquetChunks further if they happen to have the
    // same chunk order.
    // Ref: `chunks.sort_unstable_by_key(|c| (c.order(), c.id()));` in provider.rs
    //
    // Note: the order of this QueryableParquetChunk is derived from the parquet file's sequence
    // number, which is the same for all parquet files split from the same compacted data, so the
    // id is needed as a tiebreaker.
    fn id(&self) -> ChunkId {
        // When chunks need to be ordered to split overlapped data, their ChunkOrders already
        // differ; the ChunkId tiebreaker does not matter much, so use the parquet file's
        // (unique) object store id.
        self.object_store_id().into()
    }

    /// Returns the name of the table stored in this chunk
    fn table_name(&self) -> &str {
        &self.table_name
    }

    /// Returns true if the chunk may contain a duplicate "primary key" within itself
    fn may_contain_pk_duplicates(&self) -> bool {
        // data within this parquet chunk was deduplicated
        false
    }

    /// Returns a set of Strings with column names from the specified table that have at least
    /// one row that matches `predicate`, if the predicate can be evaluated entirely on the
    /// metadata of this Chunk. Returns `None` otherwise.
    fn column_names(
        &self,
        _ctx: IOxSessionContext,
        _predicate: &Predicate,
        _columns: Selection<'_>,
    ) -> Result<Option<StringSet>, DataFusionError> {
        Ok(None)
    }

    /// Return a set of Strings containing the distinct values in the specified columns, if the
    /// predicate can be evaluated entirely on the metadata of this Chunk. Returns `None`
    /// otherwise.
    ///
    /// The requested columns must all have String type.
    fn column_values(
        &self,
        _ctx: IOxSessionContext,
        _column_name: &str,
        _predicate: &Predicate,
    ) -> Result<Option<StringSet>, DataFusionError> {
        Ok(None)
    }

    /// Provides access to raw `QueryChunk` data as an asynchronous stream of `RecordBatch`es
    /// filtered by a *required* predicate. Note that not all chunks can evaluate all types of
    /// predicates, and this function will return an error if asked to evaluate a predicate that
    /// is not supported.
    ///
    /// This is the analog of the `TableProvider` in DataFusion.
    ///
    /// The reason we can't simply use the `TableProvider` trait directly is that the data for a
    /// particular table lives in several chunks within a partition, so there needs to be an
    /// implementation of `TableProvider` that stitches together the streams from several
    /// different `QueryChunk`s.
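    ///
    /// A minimal calling sketch (hypothetical context and chunk, not compiled as a doctest;
    /// assumes an empty `Predicate::default()` selects everything):
    ///
    /// ```ignore
    /// let stream = chunk.read_filter(ctx, &Predicate::default(), Selection::All)?;
    /// ```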
    fn read_filter(
        &self,
        mut ctx: IOxSessionContext,
        predicate: &Predicate,
        selection: Selection<'_>,
    ) -> Result<SendableRecordBatchStream, DataFusionError> {
        ctx.set_metadata("storage", "compactor");
        ctx.set_metadata("projection", format!("{}", selection));
        trace!(?selection, "selection");

        self.data
            .read_filter(predicate, selection)
            .context(ReadParquetSnafu)
            .map_err(|e| DataFusionError::External(Box::new(e)))
    }

    /// Returns chunk type
    fn chunk_type(&self) -> &str {
        "QueryableParquetChunk"
    }

    // Order of the chunks so they can be deduplicated correctly
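    //
    // As a quick reference, the mapping implemented by the match below is:
    //
    //   (target level, file level)  -> chunk order
    //   (L1, L1) or (L2, L2)        -> 0
    //   (L1, L0) or (L2, L1)        -> max_sequence_number
    //   any other combination       -> panic (invalid for the current algorithm)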
    fn order(&self) -> ChunkOrder {
        use CompactionLevel::*;
        match (self.target_level, self.compaction_level) {
            // Files of the same level as what they're being compacted into were created earlier,
            // so they should be sorted first so that files created later that haven't yet been
            // compacted to this level will have priority when resolving duplicates.
            (FileNonOverlapped, FileNonOverlapped) => ChunkOrder::new(0),
            (Final, Final) => ChunkOrder::new(0),

            // Files that haven't yet been compacted to the target level were created later and
            // should be sorted based on their max sequence number.
            (FileNonOverlapped, Initial) => ChunkOrder::new(self.max_sequence_number.get()),
            (Final, FileNonOverlapped) => ChunkOrder::new(self.max_sequence_number.get()),

            // These combinations of target compaction level and file compaction level are
            // invalid in this context given the current compaction algorithm.
            (Initial, _) => panic!("Can't compact into CompactionLevel::Initial"),
            (FileNonOverlapped, Final) => panic!(
                "Can't compact CompactionLevel::Final into CompactionLevel::FileNonOverlapped"
            ),
            (Final, Initial) => {
                panic!("Can't compact CompactionLevel::Initial into CompactionLevel::Final")
            }
        }
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use data_types::ColumnType;
    use iox_tests::util::{TestCatalog, TestParquetFileBuilder};
    use parquet_file::storage::ParquetStorage;

    async fn test_setup(
        compaction_level: CompactionLevel,
        target_level: CompactionLevel,
        max_sequence_number: i64,
    ) -> QueryableParquetChunk {
        let catalog = TestCatalog::new();
        let ns = catalog.create_namespace("ns").await;
        let shard = ns.create_shard(1).await;
        let table = ns.create_table("table").await;
        table.create_column("field_int", ColumnType::I64).await;
        table.create_column("tag1", ColumnType::Tag).await;
        table.create_column("time", ColumnType::Time).await;

        let partition = table
            .with_shard(&shard)
            .create_partition("2022-07-13")
            .await;

        let lp = vec!["table,tag1=WA field_int=1000i 8000"].join("\n");
        let builder = TestParquetFileBuilder::default()
            .with_line_protocol(&lp)
            .with_compaction_level(compaction_level)
            .with_max_seq(max_sequence_number);
        let file = partition.create_parquet_file(builder).await;
        let parquet_file = Arc::new(file.parquet_file);

        let parquet_chunk = Arc::new(ParquetChunk::new(
            Arc::clone(&parquet_file),
            Arc::new(table.schema().await),
            ParquetStorage::new(Arc::clone(&catalog.object_store)),
        ));

        QueryableParquetChunk::new(
            "table",
            partition.partition.id,
            parquet_chunk,
            &[],
            parquet_file.max_sequence_number,
            parquet_file.min_time,
            parquet_file.max_time,
            None,
            None,
            parquet_file.compaction_level,
            target_level,
        )
    }

    #[tokio::test]
    async fn chunk_order_is_max_seq_when_compaction_level_0_and_target_level_1() {
        let chunk = test_setup(
            CompactionLevel::Initial,
            CompactionLevel::FileNonOverlapped,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(2));
    }

    #[tokio::test]
    async fn chunk_order_is_0_when_compaction_level_1_and_target_level_1() {
        let chunk = test_setup(
            CompactionLevel::FileNonOverlapped,
            CompactionLevel::FileNonOverlapped,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(0));
    }

    #[tokio::test]
    async fn chunk_order_is_max_seq_when_compaction_level_1_and_target_level_2() {
        let chunk = test_setup(
            CompactionLevel::FileNonOverlapped,
            CompactionLevel::Final,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(2));
    }

    #[tokio::test]
    async fn chunk_order_is_0_when_compaction_level_2_and_target_level_2() {
        let chunk = test_setup(CompactionLevel::Final, CompactionLevel::Final, 2).await;

        assert_eq!(chunk.order(), ChunkOrder::new(0));
    }
}