// influxdb/ingester/src/query.rs
//! Module to handle query on Ingester's data
use std::sync::Arc;
use arrow::{
array::BooleanArray, compute::filter_record_batch, error::Result as ArrowResult,
record_batch::RecordBatch,
};
use arrow_util::util::merge_record_batches;
use data_types::{
chunk_metadata::{ChunkAddr, ChunkId, ChunkOrder},
delete_predicate::DeletePredicate,
partition_metadata::TableSummary,
};
use datafusion::{
error::DataFusionError,
logical_plan::ExprRewritable,
physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
PhysicalExpr, SendableRecordBatchStream,
},
};
use iox_catalog::interface::{SequenceNumber, Tombstone};
use observability_deps::tracing::{debug, trace};
use predicate::{delete_predicate::parse_delete_predicate, Predicate, PredicateMatch};
use query::{
exec::stringset::StringSet,
util::{df_physical_expr_from_schema_and_expr, MissingColumnsToNull},
QueryChunk, QueryChunkMeta,
};
use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use crate::data::{QueryableBatch, SnapshotBatch};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
    /// Concatenating the per-snapshot record batches into one batch failed.
    #[snafu(display("Internal error concatenating record batches {}", source))]
    ConcatBatches { source: arrow::error::ArrowError },

    /// Projecting a snapshot down to the selected columns failed.
    #[snafu(display("Internal error filtering columns from a record batch {}", source))]
    FilterColumns { source: crate::data::Error },

    /// Rewriting the filter predicate (replacing missing columns with NULL) failed.
    #[snafu(display("Internal error rewriting predicate for QueryableBatch: {}", source))]
    RewritingFilterPredicate {
        source: datafusion::error::DataFusionError,
    },

    /// Converting the rewritten logical filter expression into a physical one failed.
    #[snafu(display(
        "Internal error converting logical expression to physical one: {}",
        source
    ))]
    ToPhysicalExpr {
        source: datafusion::error::DataFusionError,
    },

    /// Applying the physical filter expression to the merged record batch failed.
    #[snafu(display("Internal error filtering record batch: {}", source))]
    FilterBatch { source: arrow::error::ArrowError },
}

/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
impl QueryableBatch {
    /// Initialize a `QueryableBatch` for `table_name`, converting every
    /// tombstone into a parsed [`DeletePredicate`].
    ///
    /// # Panics
    ///
    /// Panics if a tombstone's serialized predicate cannot be parsed; the
    /// tombstones come from our own catalog, so a parse failure is a bug.
    pub fn new(table_name: &str, data: Vec<Arc<SnapshotBatch>>, deletes: Vec<Tombstone>) -> Self {
        let mut delete_predicates = vec![];
        for delete in &deletes {
            let delete_predicate = Arc::new(
                parse_delete_predicate(
                    &delete.min_time.get().to_string(),
                    &delete.max_time.get().to_string(),
                    &delete.serialized_predicate,
                )
                .expect("Error building delete predicate"),
            );
            delete_predicates.push(delete_predicate);
        }
        Self {
            data,
            deletes,
            delete_predicates,
            table_name: table_name.to_string(),
        }
    }

    /// Return the minimum and maximum sequence numbers across *all* snapshots.
    ///
    /// # Panics
    ///
    /// Panics if the batch holds no snapshots.
    pub fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) {
        // Bug fix: previously both min and max were read from the FIRST
        // snapshot only, contradicting the documented contract. Scan every
        // snapshot so the returned range truly covers the whole batch.
        let first = self
            .data
            .first()
            .expect("The Queryable Batch should not empty");
        let mut min = first.min_sequencer_number;
        let mut max = first.max_sequencer_number;
        for snapshot in &self.data[1..] {
            if snapshot.min_sequencer_number < min {
                min = snapshot.min_sequencer_number;
            }
            if snapshot.max_sequencer_number > max {
                max = snapshot.max_sequencer_number;
            }
        }
        assert!(min <= max);
        (min, max)
    }

    /// Return true if this batch holds no snapshot data.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}
impl QueryChunkMeta for QueryableBatch {
    fn summary(&self) -> Option<&TableSummary> {
        None
    }

    fn schema(&self) -> Arc<Schema> {
        // todo: consider caching the merged schema as a field of
        // QueryableBatch so this merge is only computed once.
        //
        // Merge the schemas of every RecordBatch held by this batch.
        let snapshots: Vec<Arc<RecordBatch>> = self
            .data
            .iter()
            .map(|snapshot| Arc::clone(&snapshot.data))
            .collect();
        merge_record_batch_schemas(&snapshots)
    }

    fn sort_key(&self) -> Option<&SortKey> {
        None
    }

    fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
        &self.delete_predicates
    }
}
impl QueryChunk for QueryableBatch {
    type Error = Error;

    // This function should not be used in QueryBatch context
    fn id(&self) -> ChunkId {
        // always return id 0 for debugging mode
        // todo: need to see if the same id for all chunks will cause any panics
        ChunkId::new_test(0)
    }

    // This function should not be used in PersistingBatch context
    fn addr(&self) -> ChunkAddr {
        unimplemented!()
    }

    /// Returns the name of the table stored in this chunk
    fn table_name(&self) -> &str {
        &self.table_name
    }

    /// Returns true if the chunk may contain a duplicate "primary
    /// key" within itself
    fn may_contain_pk_duplicates(&self) -> bool {
        // always true because they are not deduplicated yet
        true
    }

    /// Apply `predicate` to the chunk's metadata only (inexact); this
    /// implementation has no usable metadata, so the answer is always
    /// `Unknown`.
    fn apply_predicate_to_metadata(
        &self,
        _predicate: &Predicate,
    ) -> Result<PredicateMatch, Self::Error> {
        Ok(PredicateMatch::Unknown)
    }

    /// Column names matching `predicate`, if answerable from metadata
    /// alone. Not answerable here, so always `None`.
    fn column_names(
        &self,
        _predicate: &Predicate,
        _columns: Selection<'_>,
    ) -> Result<Option<StringSet>, Self::Error> {
        Ok(None)
    }

    /// Distinct values of a (String-typed) column, if answerable from
    /// metadata alone. Not answerable here, so always `None`.
    fn column_values(
        &self,
        _column_name: &str,
        _predicate: &Predicate,
    ) -> Result<Option<StringSet>, Self::Error> {
        Ok(None)
    }

    /// Provides access to raw `QueryChunk` data as an asynchronous
    /// stream of `RecordBatch`es filtered by a *required* predicate.
    ///
    /// This is the analog of DataFusion's `TableProvider`; a separate
    /// `TableProvider` implementation stitches together the streams of
    /// the several `QueryChunk`s that make up one table's partition.
    fn read_filter(
        &self,
        predicate: &Predicate,
        selection: Selection<'_>,
    ) -> Result<SendableRecordBatchStream, Self::Error> {
        trace!(?selection, "selection");

        // Project each snapshot down to the selected columns, skipping
        // snapshots that contain none of them.
        let mut projected = vec![];
        for snapshot in &self.data {
            if let Some(batch) = snapshot.scan(selection).context(FilterColumnsSnafu {})? {
                projected.push(batch);
            }
        }

        // Merge the projected batches into a single batch, padding missing
        // columns with NULLs; `schema` is the union schema of the inputs.
        let schema = merge_record_batch_schemas(&projected);
        let merged =
            merge_record_batches(schema.as_arrow(), projected).context(ConcatBatchesSnafu {})?;

        let mut stream_batches = vec![];
        if let Some(mut merged) = merged {
            if let Some(filter_expr) = predicate.filter_expr() {
                // The predicate may reference columns the batch does not
                // carry; rewrite those column references to NULL first.
                let mut rewriter = MissingColumnsToNull::new(&schema);
                let filter_expr = filter_expr
                    .rewrite(&mut rewriter)
                    .context(RewritingFilterPredicateSnafu {})?;
                let df_phy_expr =
                    df_physical_expr_from_schema_and_expr(schema.as_arrow(), filter_expr)
                        .context(ToPhysicalExprSnafu)?;
                let num_rows_before = merged.num_rows();
                merged = batch_filter(&merged, &df_phy_expr).context(FilterBatchSnafu)?;
                let num_rows_after = merged.num_rows();
                debug!(
                    ?num_rows_before,
                    ?num_rows_after,
                    "predicate pushdown for QueryableBatch"
                );
            }
            if merged.num_rows() > 0 {
                stream_batches.push(Arc::new(merged));
            }
        }

        // Wrap the surviving batch (if any) in a record batch stream.
        let dummy_metrics = ExecutionPlanMetricsSet::new();
        let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0);
        let stream = SizedRecordBatchStream::new(schema.as_arrow(), stream_batches, mem_metrics);
        Ok(Box::pin(stream))
    }

    /// Returns chunk type
    fn chunk_type(&self) -> &str {
        "PersistingBatch"
    }

    // This function should not be used in PersistingBatch context
    fn order(&self) -> ChunkOrder {
        unimplemented!()
    }
}
// Filter data from RecordBatch
// Borrow from DF's https://github.com/apache/arrow-datafusion/blob/ecd0081bde98e9031b81aa6e9ae2a4f309fcec12/datafusion/src/physical_plan/filter.rs#L186
// TODO: if we make DF batch_filter public, we can call that function directly
fn batch_filter(
    batch: &RecordBatch,
    predicate: &Arc<dyn PhysicalExpr>,
) -> ArrowResult<RecordBatch> {
    // Evaluate the predicate; the result must be a boolean mask with one
    // entry per row of `batch`.
    let evaluated = predicate
        .evaluate(batch)
        .map(|value| value.into_array(batch.num_rows()))
        .map_err(DataFusionError::into)?;

    match evaluated.as_any().downcast_ref::<BooleanArray>() {
        // Keep only the rows where the mask is true.
        Some(mask) => filter_record_batch(batch, mask),
        None => Err(DataFusionError::Internal(
            "Filter predicate evaluated to non-boolean value".to_string(),
        )
        .into()),
    }
}
#[cfg(test)]
mod tests {
    use crate::test_util::{
        create_batches_with_influxtype_different_columns_different_order,
        create_one_record_batch_with_influxtype_no_duplicates, create_tombstone,
        make_queryable_batch,
    };
    use super::*;
    use arrow::{
        array::{
            ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray,
            TimestampNanosecondArray, UInt64Array,
        },
        datatypes::{DataType, Int32Type, TimeUnit},
    };
    use arrow_util::assert_batches_eq;
    use data_types::{
        delete_predicate::{DeleteExpr, Op, Scalar},
        timestamp::TimestampRange,
    };
    use datafusion::logical_plan::{col, lit};
    use predicate::PredicateBuilder;

    // Merging the schemas of two batches with overlapping columns yields the
    // union schema, sorted by field name.
    #[tokio::test]
    async fn test_merge_batch_schema() {
        // Merge schema of the batches
        // The fields in the schema are sorted by column name
        let batches = create_batches();
        let merged_schema = (&*merge_record_batch_schemas(&batches)).clone();

        // Expected Arrow schema
        let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![
            arrow::datatypes::Field::new(
                "dict",
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                true,
            ),
            arrow::datatypes::Field::new("int64", DataType::Int64, true),
            arrow::datatypes::Field::new("string", DataType::Utf8, true),
            arrow::datatypes::Field::new("bool", DataType::Boolean, true),
            arrow::datatypes::Field::new(
                "time",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
                false,
            ),
            arrow::datatypes::Field::new("uint64", DataType::UInt64, false),
            arrow::datatypes::Field::new("float64", DataType::Float64, true),
        ]));
        let expected_schema = Schema::try_from(arrow_schema)
            .unwrap()
            .sort_fields_by_name();

        assert_eq!(
            expected_schema, merged_schema,
            "\nExpected:\n{:#?}\nActual:\n{:#?}",
            expected_schema, merged_schema
        );
    }

    // QueryableBatch::new must parse each tombstone into an equivalent
    // DeletePredicate (time range + delete expressions).
    #[tokio::test]
    async fn test_tombstones_to_delete_predicates() {
        // create tombstones
        let tombstones = vec![
            create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"),
            create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"),
        ];

        // This new queryable batch will convert tombstone to delete predicates
        let query_batch = QueryableBatch::new("test_table", vec![], tombstones);
        let predicates = query_batch.delete_predicates();
        let expected = vec![
            Arc::new(DeletePredicate {
                range: TimestampRange::new(100, 200),
                exprs: vec![DeleteExpr {
                    column: String::from("temp"),
                    op: Op::Eq,
                    scalar: Scalar::I64(10),
                }],
            }),
            Arc::new(DeletePredicate {
                range: TimestampRange::new(100, 350),
                exprs: vec![
                    DeleteExpr {
                        column: String::from("temp"),
                        op: Op::Ne,
                        scalar: Scalar::I64(10),
                    },
                    DeleteExpr {
                        column: String::from("city"),
                        op: Op::Eq,
                        scalar: Scalar::String(String::from(r#"Boston"#)),
                    },
                ],
            }),
        ];
        assert_eq!(expected, predicates);
    }

    // No predicate + Selection::All returns every row and column unchanged.
    #[tokio::test]
    async fn test_read_filter() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);
        let stream = batch
            .read_filter(&Predicate::default(), Selection::All) // return all columns
            .unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec![
            "+-----------+------+-----------------------------+",
            "| field_int | tag1 | time |",
            "+-----------+------+-----------------------------+",
            "| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
            "| 10 | VT | 1970-01-01T00:00:00.000010Z |",
            "| 70 | UT | 1970-01-01T00:00:00.000020Z |",
            "+-----------+------+-----------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // A column selection projects the stream down to just those columns.
    #[tokio::test]
    async fn test_read_filter_columns() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);
        let stream = batch
            .read_filter(
                &Predicate::default(),
                Selection::Some(&["time", "field_int"]), // return 2 out of 3 columns
            )
            .unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec![
            "+-----------+-----------------------------+",
            "| field_int | time |",
            "+-----------+-----------------------------+",
            "| 1000 | 1970-01-01T00:00:00.000008Z |",
            "| 10 | 1970-01-01T00:00:00.000010Z |",
            "| 70 | 1970-01-01T00:00:00.000020Z |",
            "+-----------+-----------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // An equality predicate on an existing column keeps only matching rows.
    #[tokio::test]
    async fn test_read_filter_predicate() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);

        // tag1 = VT
        let expr = col("tag1").eq(lit("VT"));
        let pred = PredicateBuilder::default().add_expr(expr).build();

        let stream = batch.read_filter(&pred, Selection::All).unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec![
            "+-----------+------+-----------------------------+",
            "| field_int | tag1 | time |",
            "+-----------+------+-----------------------------+",
            "| 10 | VT | 1970-01-01T00:00:00.000010Z |",
            "+-----------+------+-----------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // An equality predicate on a MISSING column is rewritten to NULL = "VT",
    // which is never true, so nothing is returned.
    #[tokio::test]
    async fn test_read_filter_predicate_on_missing_column() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);

        // foo = VT
        let expr = col("foo").eq(lit("VT")); // `foo` column not available
        let pred = PredicateBuilder::default().add_expr(expr).build();

        let stream = batch.read_filter(&pred, Selection::All).unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        // missing_column = "VT" -> return nothing
        let expected = vec!["++", "++"];
        assert_batches_eq!(&expected, &batches);
    }

    // An IS NULL predicate on a MISSING column is rewritten to NULL IS NULL,
    // which is always true, so every row is returned.
    #[tokio::test]
    async fn test_read_filter_predicate_on_missing_column_is_null() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);

        // foo is NULL
        let expr = col("foo").is_null();
        let pred = PredicateBuilder::default().add_expr(expr).build();

        let stream = batch.read_filter(&pred, Selection::All).unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        // missing_column is NULL --> return everything
        let expected = vec![
            "+-----------+------+-----------------------------+",
            "| field_int | tag1 | time |",
            "+-----------+------+-----------------------------+",
            "| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
            "| 10 | VT | 1970-01-01T00:00:00.000010Z |",
            "| 70 | UT | 1970-01-01T00:00:00.000020Z |",
            "+-----------+------+-----------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // Selecting only non-existent columns yields an empty result set.
    #[tokio::test]
    async fn test_read_filter_not_exist_columns() {
        let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
        let batch = make_queryable_batch("test_table", 1, batches);
        let stream = batch
            .read_filter(
                &Predicate::default(),
                Selection::Some(&["foo"]), // column not exist
            )
            .unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec!["++", "++"];
        assert_batches_eq!(&expected, &batches);
    }

    // Batches with different column sets are merged into one union-schema
    // batch, with NULL padding for columns a batch does not carry.
    #[tokio::test]
    async fn test_read_filter_many_batches() {
        let batches = create_batches_with_influxtype_different_columns_different_order().await;
        let batch = make_queryable_batch("test_table", 1, batches);
        let stream = batch
            .read_filter(&Predicate::default(), Selection::All) // return all columns
            .unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec![
            "+-----------+------+------+--------------------------------+",
            "| field_int | tag1 | tag2 | time |",
            "+-----------+------+------+--------------------------------+",
            "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |",
            "| 10 | MT | AL | 1970-01-01T00:00:00.000007Z |",
            "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
            "| 100 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
            "| 5 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
            "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |",
            "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
            "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
            "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
            "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
            "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |",
            "| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
            "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
            "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
            "| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
            "+-----------+------+------+--------------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // Combined column selection + predicate (including a missing column that
    // rewrites to NULL) across multiple merged batches.
    #[tokio::test]
    async fn test_read_filter_many_batches_filer_columns_predicates() {
        let batches = create_batches_with_influxtype_different_columns_different_order().await;
        let batch = make_queryable_batch("test_table", 1, batches);

        // Only read 2 columns: "tag1" and "time"
        let selection = Selection::Some(&["tag1", "time"]);

        // foo is NULL AND tag1=CT
        let expr = col("foo").is_null().and(col("tag1").eq(lit("CT")));
        let pred = PredicateBuilder::default().add_expr(expr).build();

        let stream = batch.read_filter(&pred, selection).unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        // missing_column is NULL AND tag1=CT --> return 2 columns with tag1=CT only
        let expected = vec![
            "+------+--------------------------------+",
            "| tag1 | time |",
            "+------+--------------------------------+",
            "| CT | 1970-01-01T00:00:00.000000100Z |",
            "| CT | 1970-01-01T00:00:00.000000500Z |",
            "+------+--------------------------------+",
        ];
        assert_batches_eq!(&expected, &batches);
    }

    // Selecting only non-existent columns over many batches is still empty.
    #[tokio::test]
    async fn test_read_filter_many_batches_not_exist_columns() {
        let batches = create_batches_with_influxtype_different_columns_different_order().await;
        let batch = make_queryable_batch("test_table", 1, batches);
        let stream = batch
            .read_filter(
                &Predicate::default(),
                Selection::Some(&["foo", "bar"]), // column not exist
            )
            .unwrap();
        let batches = datafusion::physical_plan::common::collect(stream)
            .await
            .unwrap();

        let expected = vec!["++", "++"];
        assert_batches_eq!(&expected, &batches);
    }

    // ----------------------------------------------------------------------------------------------
    // Data for testing

    // Create pure RecordBatches without knowledge of Influx datatype
    fn create_batches() -> Vec<Arc<RecordBatch>> {
        // Batch 1: <dict, i64, str, bool, time> & 3 rows
        let dict_array: ArrayRef = Arc::new(
            vec![Some("a"), None, Some("b")]
                .into_iter()
                .collect::<DictionaryArray<Int32Type>>(),
        );
        let int64_array: ArrayRef =
            Arc::new([Some(-1), None, Some(2)].iter().collect::<Int64Array>());
        let string_array: ArrayRef = Arc::new(
            vec![Some("foo"), Some("and"), Some("bar")]
                .into_iter()
                .collect::<StringArray>(),
        );
        let bool_array: ArrayRef = Arc::new(
            [Some(true), None, Some(false)]
                .iter()
                .collect::<BooleanArray>(),
        );
        let ts_array: ArrayRef = Arc::new(
            [Some(150), Some(200), Some(1526823730000000000)]
                .iter()
                .collect::<TimestampNanosecondArray>(),
        );
        let batch1 = RecordBatch::try_from_iter_with_nullable(vec![
            ("dict", dict_array, true),
            ("int64", int64_array, true),
            ("string", string_array, true),
            ("bool", bool_array, true),
            ("time", ts_array, false), // not null
        ])
        .unwrap();

        // Batch 2: <dict, u64, f64, str, bool, time> & 2 rows
        let dict_array: ArrayRef = Arc::new(
            vec![None, Some("d")]
                .into_iter()
                .collect::<DictionaryArray<Int32Type>>(),
        );
        let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::<UInt64Array>()); // not null
        let float64_array: ArrayRef =
            Arc::new([Some(1.0), Some(2.0)].iter().collect::<Float64Array>());
        let string_array: ArrayRef = Arc::new(
            vec![Some("foo"), Some("bar")]
                .into_iter()
                .collect::<StringArray>(),
        );
        let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::<BooleanArray>());
        let ts_array: ArrayRef = Arc::new(
            [Some(100), Some(1626823730000000000)] // not null
                .iter()
                .collect::<TimestampNanosecondArray>(),
        );
        let batch2 = RecordBatch::try_from_iter_with_nullable(vec![
            ("dict", dict_array, true),
            ("uint64", uint64_array, false), // not null
            ("float64", float64_array, true),
            ("string", string_array, true),
            ("bool", bool_array, true),
            ("time", ts_array, false), // not null
        ])
        .unwrap();

        vec![Arc::new(batch1), Arc::new(batch2)]
    }
}