influxdb/querier/src/table/mod.rs

use self::query_access::QuerierTableChunkPruner;
use crate::{
    chunk::ParquetChunkAdapter,
    ingester::{self, IngesterPartition},
    table::state_reconciler::{filter_parquet_files, tombstone_exclude_list},
    tombstone::QuerierTombstone,
    IngesterConnection,
};
use backoff::{Backoff, BackoffConfig};
use data_types::TableId;
use iox_query::{provider::ChunkPruner, QueryChunk};
use observability_deps::tracing::debug;
use predicate::Predicate;
use schema::Schema;
use snafu::{ResultExt, Snafu};
use std::{collections::HashMap, sync::Arc};

mod query_access;
mod state_reconciler;

#[cfg(test)]
mod test_util;

/// Errors that can occur while gathering the chunks of a [`QuerierTable`].
#[derive(Debug, Snafu)]
// NOTE(review): presumably the wrapped source errors differ a lot in size, which is why the lint is silenced here --
// confirm that the suppression is still required.
#[allow(clippy::large_enum_variant)]
pub enum Error {
    /// The ingester connection returned an error when it was asked for the partitions of the table.
    #[snafu(display("Error getting partitions from ingester: {}", source))]
    GettingIngesterPartitions { source: ingester::Error },

    /// Filtering the parquet files from the catalog against the ingester responses failed.
    #[snafu(display("Cannot combine ingester data with catalog/cache: {}", source))]
    StateFusion {
        source: state_reconciler::FilterParquetError,
    },
}

/// Result type used throughout this module; the error type defaults to [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;

/// Table representation for the querier.
///
/// Combines the partitions reported by the ingesters with the parquet files and tombstones listed in the catalog,
/// see [`QuerierTable::chunks`].
#[derive(Debug)]
pub struct QuerierTable {
    /// Name of the namespace the table is in.
    ///
    /// Used in log messages and passed along when requesting partitions from the ingesters.
    namespace_name: Arc<str>,

    /// Backoff config for IO operations, i.e. for retrying catalog requests.
    backoff_config: BackoffConfig,

    /// Table name.
    name: Arc<str>,

    /// Table ID, used to list the parquet files and tombstones of the table in the catalog.
    id: TableId,

    /// Table schema.
    ///
    /// The names of its columns are the columns that are requested from the ingesters.
    schema: Arc<Schema>,

    /// Connection used to fetch the ingester partitions of this table.
    ingester_connection: Arc<dyn IngesterConnection>,

    /// Interface to create chunks for this table.
    ///
    /// Also provides access to the catalog and the catalog cache.
    chunk_adapter: Arc<ParquetChunkAdapter>,
}

impl QuerierTable {
    /// Create new table.
    pub fn new(
        namespace_name: Arc<str>,
        backoff_config: BackoffConfig,
        id: TableId,
        name: Arc<str>,
        schema: Arc<Schema>,
        ingester_connection: Arc<dyn IngesterConnection>,
        chunk_adapter: Arc<ParquetChunkAdapter>,
    ) -> Self {
        Self {
            namespace_name,
            backoff_config,
            name,
            id,
            schema,
            ingester_connection,
            chunk_adapter,
        }
    }

    /// Name of this table.
    ///
    /// Returns a reference to the shared string, so callers can `Arc::clone` it without copying the name.
    pub fn name(&self) -> &Arc<str> {
        &self.name
    }

    /// ID of this table.
    // NOTE(review): presumably there is no caller outside of tests yet, hence the lint suppression -- confirm and
    // drop the `allow` once the method has a regular caller.
    #[allow(dead_code)]
    pub fn id(&self) -> TableId {
        self.id
    }

    /// Schema of this table.
    ///
    /// Returns a reference to the shared schema, so callers can `Arc::clone` it cheaply.
    pub fn schema(&self) -> &Arc<Schema> {
        &self.schema
    }

    /// Collect all chunks that are currently known for this table.
    ///
    /// The returned list is made up of:
    ///
    /// - one chunk per parquet file that is left after filtering the catalog listing against the ingester
    ///   responses; every such chunk carries the delete predicates of the tombstones that still apply to it
    /// - every ingester partition that contains at least one record batch
    ///
    /// Catalog requests are retried using the configured backoff.
    ///
    /// # Errors
    ///
    /// - [`Error::GettingIngesterPartitions`] if the ingester partitions cannot be fetched
    /// - [`Error::StateFusion`] if the catalog listing cannot be combined with the ingester responses
    pub async fn chunks(&self, predicate: &Predicate) -> Result<Vec<Arc<dyn QueryChunk>>> {
        debug!(
            ?predicate,
            namespace=%self.namespace_name,
            table_name=%self.name(),
            "Fetching all chunks"
        );

        // Step 1: ask the ingesters for their data.
        let ingester_partitions = self.ingester_partitions(predicate).await?;

        debug!(
            namespace=%self.namespace_name,
            table_name=%self.name(),
            num_ingester_partitions=%ingester_partitions.len(),
            "Ingester partitions fetched"
        );

        // Step 2: read parquet files and tombstones within one catalog transaction.
        //
        // IMPORTANT: this must stay AFTER the ingester request above.
        //
        // TODO: figure out some form of caching
        let (catalog_files, tombstones) = Backoff::new(&self.backoff_config)
            .retry_all_errors::<_, _, _, iox_catalog::interface::Error>(
                "get parquet files and tombstones for table",
                || async {
                    let mut transaction = self.chunk_adapter.catalog().start_transaction().await?;

                    let parquet_files = transaction
                        .parquet_files()
                        .list_by_table_not_to_delete_with_metadata(self.id)
                        .await?;
                    debug!(
                        ?parquet_files,
                        namespace=%self.namespace_name,
                        table_name=%self.name(),
                        "Parquet files from catalog"
                    );

                    let tombstones = transaction.tombstones().list_by_table(self.id).await?;

                    transaction.commit().await?;

                    Ok((parquet_files, tombstones))
                },
            )
            .await
            .expect("retry forever");

        // Step 3: reconcile what the catalog lists with what the ingesters reported.
        let parquet_files = filter_parquet_files(&ingester_partitions, catalog_files)
            .context(StateFusionSnafu)?;
        debug!(
            ?parquet_files,
            namespace=%self.namespace_name,
            table_name=%self.name(),
            "Parquet files after filtering"
        );
        let tombstone_exclusion = tombstone_exclude_list(&ingester_partitions, &tombstones);

        // Step 4: turn the remaining parquet files into querier chunks. Files for which the adapter does not
        // produce a chunk are dropped.
        let mut parquet_chunks = Vec::with_capacity(parquet_files.len());
        for file in parquet_files {
            let maybe_chunk = self.chunk_adapter.new_querier_chunk(file).await;
            if let Some(chunk) = maybe_chunk {
                parquet_chunks.push(chunk);
            }
        }
        debug!(num_chunks=%parquet_chunks.len(), "Querier chunks");

        // Step 5: group the tombstones by sequencer so that every chunk only looks at the tombstones of its own
        // sequencer.
        let mut tombstones_by_sequencer: HashMap<_, Vec<_>> = HashMap::new();
        for tombstone in tombstones.into_iter().map(QuerierTombstone::from) {
            tombstones_by_sequencer
                .entry(tombstone.sequencer_id())
                .or_default()
                .push(tombstone);
        }

        // Step 6: attach the delete predicates of all relevant tombstones to the parquet chunks.
        let mut all_chunks = Vec::with_capacity(parquet_chunks.len());
        for chunk in parquet_chunks {
            let sequencer_id = chunk.meta().sequencer_id();

            let chunk = match tombstones_by_sequencer.get(&sequencer_id) {
                // no tombstone for this sequencer at all => chunk stays untouched
                None => chunk,
                Some(candidates) => {
                    let mut delete_predicates = Vec::with_capacity(candidates.len());

                    // The checks are ordered so that the ones that work on data we already hold run first and the
                    // processed-tombstone lookup is only issued when it is really required.
                    for tombstone in candidates {
                        // The ingester response may tell us to ignore this tombstone for the partition.
                        let exclusion_key = (chunk.meta().partition_id(), tombstone.tombstone_id());
                        if tombstone_exclusion.contains(&exclusion_key) {
                            continue;
                        }

                        // Compare the tombstone with the sequence number range of the parquet file:
                        //
                        // a) Tombstone is older than the chunk's min sequence number:
                        //    It cannot affect the chunk in any way.
                        // b) Tombstone lies within the chunk's min-max sequence number range:
                        //    There are no row-level sequence numbers, so the querier CANNOT tell which rows are
                        //    affected. Such files are written either by the ingester or by the compactor. Both
                        //    must have materialized the tombstone while writing the file (the compactor even
                        //    marks it as processed), hence it can be skipped.
                        // c) Tombstone is newer than the chunk's max sequence number:
                        //    It affects the whole chunk, unless it is marked as processed (checked below).
                        //
                        // Only case c) needs further treatment.
                        if tombstone.sequence_number() <= chunk.meta().max_sequence_number() {
                            continue;
                        }

                        // TODO: also consider time ranges (https://github.com/influxdata/influxdb_iox/issues/4086)

                        // Skip tombstones that are already marked as processed for this parquet file.
                        let parquet_file_id = chunk
                            .parquet_file_id()
                            .expect("just created from a parquet file");
                        let already_processed = self
                            .chunk_adapter
                            .catalog_cache()
                            .processed_tombstones()
                            .exists(parquet_file_id, tombstone.tombstone_id())
                            .await;
                        if !already_processed {
                            delete_predicates.push(Arc::clone(tombstone.delete_predicate()));
                        }
                    }

                    chunk.with_delete_predicates(delete_predicates)
                }
            };

            all_chunks.push(Arc::new(chunk) as Arc<dyn QueryChunk>);
        }

        // Step 7: append the ingester data.
        // - partitions without any record batch are left out
        // - no tombstones are attached since the ingester already materialized them
        let ingester_chunks = ingester_partitions
            .into_iter()
            .filter(|partition| partition.has_batches())
            .map(|partition| partition as Arc<dyn QueryChunk>);
        all_chunks.extend(ingester_chunks);

        debug!(num_chunks2=%all_chunks.len(), "Chunks 2");

        Ok(all_chunks)
    }

    /// Get a chunk pruner that can be used to prune chunks retrieved via [`chunks`](Self::chunks).
    ///
    /// Every call creates a new, field-less `QuerierTableChunkPruner`.
    pub fn chunk_pruner(&self) -> Arc<dyn ChunkPruner> {
        Arc::new(QuerierTableChunkPruner {})
    }

    /// Ask the ingester connection for the partitions of this table, passing `predicate` along.
    ///
    /// # Errors
    ///
    /// An error reported by the ingester connection is wrapped into [`Error::GettingIngesterPartitions`].
    async fn ingester_partitions(
        &self,
        predicate: &Predicate,
    ) -> Result<Vec<Arc<IngesterPartition>>> {
        // Request *all* columns of the table for now: at least the primary key columns (time and tags) are
        // needed for deduplication.
        //
        // Possible future optimization: only fetch the fields that the query actually needs.
        let columns = self
            .schema
            .iter()
            .map(|(_, field)| field.name().to_string())
            .collect::<Vec<String>>();

        let namespace_name = Arc::clone(&self.namespace_name);
        let table_name = Arc::clone(&self.name);
        let schema = Arc::clone(&self.schema);

        // fetch whatever the ingester holds for this table
        let response = self
            .ingester_connection
            .partitions(namespace_name, table_name, columns, predicate, schema)
            .await;

        response.context(GettingIngesterPartitionsSnafu)
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::record_batch::RecordBatch;
    use assert_matches::assert_matches;
    use data_types::{ChunkId, ColumnType, SequenceNumber};
    use iox_tests::util::{now, TestCatalog};
    use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
    use predicate::Predicate;
    use schema::{builder::SchemaBuilder, selection::Selection, InfluxFieldType};

    use super::*;
    use crate::{
        ingester::{test_util::MockIngesterConnection, IngesterPartition},
        table::test_util::querier_table,
    };

    /// Checks which parquet files end up as chunks and which tombstones are attached to them as delete predicates.
    ///
    /// The mock ingester response is not set up here, so the test only deals with catalog state. The assertions
    /// rely on the chunk IDs following the creation order of the parquet files.
    #[tokio::test]
    async fn test_parquet_chunks() {
        let pred = Predicate::default();
        let catalog = TestCatalog::new();

        let ns = catalog.create_namespace("ns").await;

        let table1 = ns.create_table("table1").await;
        let table2 = ns.create_table("table2").await;

        let sequencer1 = ns.create_sequencer(1).await;
        let sequencer2 = ns.create_sequencer(2).await;

        let partition11 = table1
            .with_sequencer(&sequencer1)
            .create_partition("k")
            .await;
        let partition12 = table1
            .with_sequencer(&sequencer2)
            .create_partition("k")
            .await;
        let partition21 = table2
            .with_sequencer(&sequencer1)
            .create_partition("k")
            .await;

        table1.create_column("foo", ColumnType::I64).await;
        table2.create_column("foo", ColumnType::I64).await;

        let querier_table = querier_table(&catalog, &table1).await;

        // no parquet files yet
        assert!(querier_table.chunks(&pred).await.unwrap().is_empty());

        // files 111-115 live in the same partition and cover consecutive sequence number ranges
        let file111 = partition11
            .create_parquet_file_with_min_max(
                "table1 foo=1 11",
                1,
                2,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        let file112 = partition11
            .create_parquet_file_with_min_max(
                "table1 foo=2 22",
                3,
                4,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        let file113 = partition11
            .create_parquet_file_with_min_max(
                "table1 foo=3 33",
                5,
                6,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        let file114 = partition11
            .create_parquet_file_with_min_max(
                "table1 foo=4 44",
                7,
                8,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        let file115 = partition11
            .create_parquet_file_with_min_max(
                "table1 foo=5 55",
                9,
                10,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        // same table, but different sequencer
        let file121 = partition12
            .create_parquet_file_with_min_max(
                "table1 foo=5 55",
                1,
                2,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;
        // different table
        let _file211 = partition21
            .create_parquet_file_with_min_max(
                "table2 foo=6 66",
                1,
                2,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        file111.flag_for_delete().await;

        // two tombstones (sequence numbers 7 and 8) on sequencer 1, both marked as processed for file112
        let tombstone1 = table1
            .with_sequencer(&sequencer1)
            .create_tombstone(7, 1, 100, "foo=1")
            .await;
        tombstone1.mark_processed(&file112).await;
        let tombstone2 = table1
            .with_sequencer(&sequencer1)
            .create_tombstone(8, 1, 100, "foo=1")
            .await;
        tombstone2.mark_processed(&file112).await;

        // now we have some files
        // this contains all files except for:
        // - file111: marked for delete
        // - file211: wrong table
        let mut chunks = querier_table.chunks(&pred).await.unwrap();
        chunks.sort_by_key(|c| c.id());
        assert_eq!(chunks.len(), 5);

        // check IDs
        assert_eq!(
            chunks[0].id(),
            ChunkId::new_test(file112.parquet_file.id.get() as u128),
        );
        assert_eq!(
            chunks[1].id(),
            ChunkId::new_test(file113.parquet_file.id.get() as u128),
        );
        assert_eq!(
            chunks[2].id(),
            ChunkId::new_test(file114.parquet_file.id.get() as u128),
        );
        assert_eq!(
            chunks[3].id(),
            ChunkId::new_test(file115.parquet_file.id.get() as u128),
        );
        assert_eq!(
            chunks[4].id(),
            ChunkId::new_test(file121.parquet_file.id.get() as u128),
        );

        // check delete predicates
        // file112: marked as processed
        assert_eq!(chunks[0].delete_predicates().len(), 0);
        // file113: has delete predicate
        assert_eq!(chunks[1].delete_predicates().len(), 2);
        // file114: predicates are directly within the chunk range => assume they are materialized
        assert_eq!(chunks[2].delete_predicates().len(), 0);
        // file115: came after in sequencer
        assert_eq!(chunks[3].delete_predicates().len(), 0);
        // file121: wrong sequencer
        assert_eq!(chunks[4].delete_predicates().len(), 0);
    }

    /// Checks that `chunks` fails with [`Error::StateFusion`] when a parquet file spans sequence numbers on both
    /// sides of the "persisted" mark reported by the ingester.
    #[tokio::test]
    async fn test_compactor_collision() {
        let pred = Predicate::default();
        let catalog = TestCatalog::new();

        let ns = catalog.create_namespace("ns").await;
        let table = ns.create_table("table").await;
        let sequencer = ns.create_sequencer(1).await;
        let partition = table.with_sequencer(&sequencer).create_partition("k").await;
        table.create_column("foo", ColumnType::I64).await;

        // create a parquet file that cannot be processed by the querier:
        //
        //
        // --------------------------- sequence number ----------------------------->
        // |           0           |           1           |           2           |
        //
        //
        //                          Available Information:
        // (        ingester reports as "persisted"        )
        //                                                 ( ingester in-mem data  )
        //                         (                  parquet file                 )
        //
        //
        //                        Desired Information:
        //                         (  wanted parquet data  )
        //                                                 ( ignored parquet data  )
        //                                                 ( ingester in-mem data  )
        //
        //
        // However there is no way to split the parquet data into the "wanted" and "ignored" part because we don't have
        // row-level sequence numbers.

        // parquet file covering sequence numbers 1 to 2
        partition
            .create_parquet_file_with_min_max(
                "table foo=1 11",
                1,
                2,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        let querier_table = querier_table(&catalog, &table).await;

        // ingester partition without record batches that reports sequence number 1 as the max persisted one for
        // parquet files
        querier_table
            .ingester_connection
            .as_any()
            .downcast_ref::<MockIngesterConnection>()
            .unwrap()
            .next_response(Ok(vec![Arc::new(
                IngesterPartition::try_new(
                    ChunkId::new(),
                    Arc::from(ns.namespace.name.clone()),
                    Arc::from(table.table.name.clone()),
                    partition.partition.id,
                    sequencer.sequencer.id,
                    Arc::new(SchemaBuilder::new().build().unwrap()),
                    Some(SequenceNumber::new(1)),
                    None,
                    vec![],
                )
                .unwrap(),
            )]));

        let err = querier_table.chunks(&pred).await.unwrap_err();
        assert_matches!(err, Error::StateFusion { .. });
    }

    /// Checks the reconciliation of catalog state (parquet files, tombstones) with the ingester responses for two
    /// partitions that report different "max persisted" sequence numbers.
    ///
    /// The assertions rely on the chunk IDs of the parquet files following their creation order and on the
    /// ingester chunk IDs sorting last.
    #[tokio::test]
    async fn test_state_reconcile() {
        let pred = Predicate::default();
        let catalog = TestCatalog::new();

        let ns = catalog.create_namespace("ns").await;
        let table = ns.create_table("table").await;
        let sequencer = ns.create_sequencer(1).await;
        let partition1 = table
            .with_sequencer(&sequencer)
            .create_partition("k1")
            .await;
        let partition2 = table
            .with_sequencer(&sequencer)
            .create_partition("k2")
            .await;

        // kept because max sequence number <= 2
        let file1 = partition1
            .create_parquet_file_with_min_max(
                "table foo=1 11",
                1,
                2,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        // pruned because min sequence number > 2
        partition1
            .create_parquet_file_with_min_max(
                "table foo=2 22",
                3,
                3,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        // kept because max sequence number <= 3
        let file2 = partition2
            .create_parquet_file_with_min_max(
                "table foo=1 11",
                1,
                3,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        // pruned because min sequence number > 3
        partition2
            .create_parquet_file_with_min_max(
                "table foo=2 22",
                4,
                4,
                now().timestamp_nanos(),
                now().timestamp_nanos(),
            )
            .await;

        // partition1: kept because sequence number <= 10
        // partition2: kept because sequence number <= 11
        table
            .with_sequencer(&sequencer)
            .create_tombstone(10, 1, 100, "foo=1")
            .await;

        // partition1: pruned because sequence number > 10
        // partition2: kept because sequence number <= 11
        table
            .with_sequencer(&sequencer)
            .create_tombstone(11, 1, 100, "foo=2")
            .await;

        // partition1: pruned because sequence number > 10
        // partition2: pruned because sequence number > 11
        table
            .with_sequencer(&sequencer)
            .create_tombstone(12, 1, 100, "foo=3")
            .await;

        let querier_table = querier_table(&catalog, &table).await;

        // use the largest possible IDs for the ingester chunks so that they sort after the parquet chunks
        let ingester_chunk_id1 = ChunkId::new_test(u128::MAX - 1);
        let ingester_chunk_id2 = ChunkId::new_test(u128::MAX);
        querier_table
            .ingester_connection
            .as_any()
            .downcast_ref::<MockIngesterConnection>()
            .unwrap()
            .next_response(Ok(vec![
                // this chunk is kept
                Arc::new(
                    IngesterPartition::try_new(
                        ingester_chunk_id1,
                        Arc::from(ns.namespace.name.clone()),
                        Arc::from(table.table.name.clone()),
                        partition1.partition.id,
                        sequencer.sequencer.id,
                        Arc::new(
                            SchemaBuilder::new()
                                .influx_field("foo", InfluxFieldType::Integer)
                                .timestamp()
                                .build()
                                .unwrap(),
                        ),
                        // parquet max persisted sequence number
                        Some(SequenceNumber::new(2)),
                        // tombstone max persisted sequence number
                        Some(SequenceNumber::new(10)),
                        vec![lp_to_record_batch("table foo=3i 33")],
                    )
                    .unwrap(),
                ),
                // this chunk is filtered out because it has no record batches but the reconciling still takes place
                Arc::new(
                    IngesterPartition::try_new(
                        ingester_chunk_id2,
                        Arc::from(ns.namespace.name.clone()),
                        Arc::from(table.table.name.clone()),
                        partition2.partition.id,
                        sequencer.sequencer.id,
                        Arc::new(
                            SchemaBuilder::new()
                                .influx_field("foo", InfluxFieldType::Integer)
                                .timestamp()
                                .build()
                                .unwrap(),
                        ),
                        // parquet max persisted sequence number
                        Some(SequenceNumber::new(3)),
                        // tombstone max persisted sequence number
                        Some(SequenceNumber::new(11)),
                        vec![],
                    )
                    .unwrap(),
                ),
            ]));

        let mut chunks = querier_table.chunks(&pred).await.unwrap();
        chunks.sort_by_key(|c| c.id());

        // three chunks (two parquet files and one for the in-mem ingester data)
        assert_eq!(chunks.len(), 3);

        // check IDs
        assert_eq!(
            chunks[0].id(),
            ChunkId::new_test(file1.parquet_file.id.get() as u128),
        );
        assert_eq!(
            chunks[1].id(),
            ChunkId::new_test(file2.parquet_file.id.get() as u128),
        );
        assert_eq!(chunks[2].id(), ingester_chunk_id1);

        // check delete predicates
        // parquet chunks have predicate attached
        assert_eq!(chunks[0].delete_predicates().len(), 1);
        assert_eq!(chunks[1].delete_predicates().len(), 2);
        // ingester in-mem chunk doesn't need predicates, because the ingester has already materialized them for us
        assert_eq!(chunks[2].delete_predicates().len(), 0);
    }

    /// Parse line protocol and convert the resulting mutable batch, with all of its columns, into an arrow
    /// [`RecordBatch`].
    ///
    /// Panics if the conversion to arrow fails.
    fn lp_to_record_batch(lp: &str) -> RecordBatch {
        let parsed = lp_to_mutable_batch(lp);
        // the second element holds the batch; the first one is not needed here
        let mutable_batch = &parsed.1;
        mutable_batch.to_arrow(Selection::All).unwrap()
    }
}