From a1b29b3ebbcabc7a5ca093a33186e6c7559a35a0 Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Tue, 4 Apr 2023 10:19:50 +1000 Subject: [PATCH] fix: Incorrect results when planning aggregate expressions (#7431) * fix: Incorrect results when mixing non-existent fields in aggregates * chore: Improve comments around aggregates --- .../query_tests2/cases/in/issue_6112.influxql | 18 +++++-- .../cases/in/issue_6112.influxql.expected | 41 ++++++++++++++++ iox_query_influxql/src/plan/planner.rs | 47 +++++++++++++++++-- 3 files changed, 97 insertions(+), 9 deletions(-) diff --git a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql index 793d4decac..473cb712b3 100644 --- a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql +++ b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql @@ -307,12 +307,20 @@ SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk; -- IOX_COMPARE: sorted SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none); --- TODO(sgc): The following two queries produce incorrect results --- See: https://github.com/influxdata/influxdb_iox/issues/7361 -- using aggregates across measurements --- SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk; --- using aggregates with missing fields --- SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu; +SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk; +-- using aggregates with missing fields, should return a single, null value for the row +SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu; +-- should return the mean result and a null result for the second column +SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu; +-- should return no results +SELECT MEAN(foo) FROM cpu; +-- should return a row for each tag key +SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu; +-- should return a row result for each tag key +SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu; +-- should return nothing +SELECT MEAN(foo) FROM cpu GROUP BY cpu; SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none); -- supports offset parameter diff --git a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected index 2b3f0d5e1c..112caa52a1 100644 --- a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected +++ b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected @@ -645,6 +645,47 @@ Error while planning query: Error during planning: invalid number of arguments f | disk | 2022-10-31T02:00:00Z | | 2234.0 | | disk | 2022-10-31T02:00:10Z | | 2239.0 | +------------------+----------------------+--------------------+--------+ +-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk; ++------------------+----------------------+---------------------------------+ +| iox::measurement | time | mean_usage_idle_mean_bytes_free | ++------------------+----------------------+---------------------------------+ +| cpu | 1970-01-01T00:00:00Z | | +| disk | 1970-01-01T00:00:00Z | | ++------------------+----------------------+---------------------------------+ +-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu; ++------------------+----------------------+--------------------------+ +| iox::measurement | time | mean_usage_idle_mean_foo | ++------------------+----------------------+--------------------------+ +| cpu | 1970-01-01T00:00:00Z | | ++------------------+----------------------+--------------------------+ +-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu; ++------------------+----------------------+--------------------+--------------------------+ +| iox::measurement | time | mean | mean_usage_idle_mean_foo | ++------------------+----------------------+--------------------+--------------------------+ +| cpu | 1970-01-01T00:00:00Z | 1.9850000000000003 | | ++------------------+----------------------+--------------------+--------------------------+ +-- InfluxQL: SELECT MEAN(foo) FROM cpu; +++ +++ +-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu; ++------------------+----------------------+-----------+--------------------------+ +| iox::measurement | time | cpu | mean_usage_idle_mean_foo | ++------------------+----------------------+-----------+--------------------------+ +| cpu | 1970-01-01T00:00:00Z | cpu-total | | +| cpu | 1970-01-01T00:00:00Z | cpu0 | | +| cpu | 1970-01-01T00:00:00Z | cpu1 | | ++------------------+----------------------+-----------+--------------------------+ +-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu; ++------------------+----------------------+-----------+--------------------+--------------------------+ +| iox::measurement | time | cpu | mean | mean_usage_idle_mean_foo | ++------------------+----------------------+-----------+--------------------+--------------------------+ +| cpu | 1970-01-01T00:00:00Z | cpu-total | 2.9850000000000003 | | +| cpu | 1970-01-01T00:00:00Z | cpu0 | 0.985 | | +| cpu | 1970-01-01T00:00:00Z | cpu1 | 1.9849999999999999 | | ++------------------+----------------------+-----------+--------------------+--------------------------+ +-- InfluxQL: SELECT MEAN(foo) FROM cpu GROUP BY cpu; +++ +++ -- InfluxQL: SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none); +------------------+----------------------+-------+------+ | iox::measurement | time | count | sum | diff --git a/iox_query_influxql/src/plan/planner.rs b/iox_query_influxql/src/plan/planner.rs index 1b8f131bfa..448a9d83c3 100644 --- a/iox_query_influxql/src/plan/planner.rs +++ b/iox_query_influxql/src/plan/planner.rs @@ -110,7 +110,7 @@ struct Context<'a> { scope: ExprScope, tz: Option, - /// `true` if the query projects aggregates. + /// `true` if the query projection specifies aggregate expressions. is_aggregate: bool, // GROUP BY information @@ -439,6 +439,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> { group_by_tag_set: &[&str], schemas: &Schemas, ) -> Result<(LogicalPlan, Vec)> { + if !ctx.is_aggregate { + return Ok((input, select_exprs)); + } + let Some(time_column_index) = find_time_column_index(fields) else { return Err(DataFusionError::Internal("unable to find time column".to_owned())) }; @@ -452,10 +456,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // will produce two aggregate expressions: // // [SUM(foo), COUNT(foo)] + // + // NOTE: + // + // It is possible this vector is empty, when all the fields in the + // projection refer to columns that do not exist in the current + // table. let aggr_exprs = find_aggregate_exprs(&select_exprs); - if aggr_exprs.is_empty() { - return Ok((input, select_exprs)); - } let aggr_group_by_exprs = if let Some(group_by) = ctx.group_by { let mut group_by_exprs = Vec::new(); @@ -483,6 +490,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> { vec![] }; + if aggr_exprs.is_empty() && aggr_group_by_exprs.is_empty() { + // If there are no aggregate expressions in the projection, because + // they all referred to non-existent columns in the table, and there + // is no GROUP BY, the result set is a single row. + // + // This is required for InfluxQL compatibility. + return Ok((LogicalPlanBuilder::empty(true).build()?, select_exprs)); + } + let plan = LogicalPlanBuilder::from(input) .aggregate(aggr_group_by_exprs.clone(), aggr_exprs.clone())? .build()?; @@ -1960,6 +1976,29 @@ mod test { TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] "###); + // Aggregate expression selecting non-existent field + assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N] + EmptyRelation [] + "###); + + // Aggregate expression with GROUP BY and non-existent field + assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data GROUP BY foo"), @r###" + Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N] + Aggregate: groupBy=[[data.foo]], aggr=[[]] [foo:Dictionary(Int32, Utf8);N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); + + // Aggregate expression selecting tag, should treat as non-existent + assert_snapshot!(plan("SELECT MEAN(f64_field), MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, AVG(data.f64_field) AS mean, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N] + Aggregate: groupBy=[[]], aggr=[[AVG(data.f64_field)]] [AVG(data.f64_field):Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); + // Fallible // Cannot combine aggregate and non-aggregate columns in the projection