fix: Incorrect results when planning aggregate expressions (#7431)

* fix: Incorrect results when mixing non-existent fields in aggregates * chore: Improve comments around aggregates
2023-04-04 10:19:50 +10:00 · 2023-04-04 10:19:50 +10:00 · a1b29b3ebb
parent 839816abf1
commit a1b29b3ebb
3 changed files with 97 additions and 9 deletions
--- a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql
+++ b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql
@ -307,12 +307,20 @@ SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk;
 -- IOX_COMPARE: sorted
 SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none);

-- TODO(sgc): The following two queries produce incorrect results
-- See: https://github.com/influxdata/influxdb_iox/issues/7361
 -- using aggregates across measurements
-- SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
-- using aggregates with missing fields
-- SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
+SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
+-- using aggregates with missing fields, should return a single, null value for the row
+SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
+-- should return the mean result and a null result for the second column
+SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
+-- should return no results
+SELECT MEAN(foo) FROM cpu;
+-- should return a row for each tag key
+SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+-- should return a row result for each tag key
+SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+-- should return nothing
+SELECT MEAN(foo) FROM cpu GROUP BY cpu;

 SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
 -- supports offset parameter
--- a/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected
+++ b/influxdb_iox/tests/query_tests2/cases/in/issue_6112.influxql.expected
@ -645,6 +645,47 @@ Error while planning query: Error during planning: invalid number of arguments f
 | disk             | 2022-10-31T02:00:00Z |                    | 2234.0 |
 | disk             | 2022-10-31T02:00:10Z |                    | 2239.0 |
 +------------------+----------------------+--------------------+--------+
+-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
+------------------+----------------------+---------------------------------+
+| iox::measurement | time                 | mean_usage_idle_mean_bytes_free |
+------------------+----------------------+---------------------------------+
+| cpu              | 1970-01-01T00:00:00Z |                                 |
+| disk             | 1970-01-01T00:00:00Z |                                 |
+------------------+----------------------+---------------------------------+
+-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
+------------------+----------------------+--------------------------+
+| iox::measurement | time                 | mean_usage_idle_mean_foo |
+------------------+----------------------+--------------------------+
+| cpu              | 1970-01-01T00:00:00Z |                          |
+------------------+----------------------+--------------------------+
+-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
+------------------+----------------------+--------------------+--------------------------+
+| iox::measurement | time                 | mean               | mean_usage_idle_mean_foo |
+------------------+----------------------+--------------------+--------------------------+
+| cpu              | 1970-01-01T00:00:00Z | 1.9850000000000003 |                          |
+------------------+----------------------+--------------------+--------------------------+
+-- InfluxQL: SELECT MEAN(foo) FROM cpu;
++
++
+-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+------------------+----------------------+-----------+--------------------------+
+| iox::measurement | time                 | cpu       | mean_usage_idle_mean_foo |
+------------------+----------------------+-----------+--------------------------+
+| cpu              | 1970-01-01T00:00:00Z | cpu-total |                          |
+| cpu              | 1970-01-01T00:00:00Z | cpu0      |                          |
+| cpu              | 1970-01-01T00:00:00Z | cpu1      |                          |
+------------------+----------------------+-----------+--------------------------+
+-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+------------------+----------------------+-----------+--------------------+--------------------------+
+| iox::measurement | time                 | cpu       | mean               | mean_usage_idle_mean_foo |
+------------------+----------------------+-----------+--------------------+--------------------------+
+| cpu              | 1970-01-01T00:00:00Z | cpu-total | 2.9850000000000003 |                          |
+| cpu              | 1970-01-01T00:00:00Z | cpu0      | 0.985              |                          |
+| cpu              | 1970-01-01T00:00:00Z | cpu1      | 1.9849999999999999 |                          |
+------------------+----------------------+-----------+--------------------+--------------------------+
+-- InfluxQL: SELECT MEAN(foo) FROM cpu GROUP BY cpu;
++
++
 -- InfluxQL: SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
 +------------------+----------------------+-------+------+
 | iox::measurement | time                 | count | sum  |
--- a/iox_query_influxql/src/plan/planner.rs
+++ b/iox_query_influxql/src/plan/planner.rs
@ -110,7 +110,7 @@ struct Context<'a> {
    scope: ExprScope,
    tz: Option<Tz>,

-    /// `true` if the query projects aggregates.
+    /// `true` if the query projection specifies aggregate expressions.
    is_aggregate: bool,

    // GROUP BY information
@ -439,6 +439,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
        group_by_tag_set: &[&str],
        schemas: &Schemas,
    ) -> Result<(LogicalPlan, Vec<Expr>)> {
+        if !ctx.is_aggregate {
+            return Ok((input, select_exprs));
+        }
+
        let Some(time_column_index) = find_time_column_index(fields) else {
            return Err(DataFusionError::Internal("unable to find time column".to_owned()))
        };
@ -452,10 +456,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
        // will produce two aggregate expressions:
        //
        // [SUM(foo), COUNT(foo)]
+        //
+        // NOTE:
+        //
+        // It is possible this vector is empty, when all the fields in the
+        // projection refer to columns that do not exist in the current
+        // table.
        let aggr_exprs = find_aggregate_exprs(&select_exprs);
-        if aggr_exprs.is_empty() {
-            return Ok((input, select_exprs));
-        }

        let aggr_group_by_exprs = if let Some(group_by) = ctx.group_by {
            let mut group_by_exprs = Vec::new();
@ -483,6 +490,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
            vec![]
        };

+        if aggr_exprs.is_empty() && aggr_group_by_exprs.is_empty() {
+            // If there are no aggregate expressions in the projection, because
+            // they all referred to non-existent columns in the table, and there
+            // is no GROUP BY, the result set is a single row.
+            //
+            // This is required for InfluxQL compatibility.
+            return Ok((LogicalPlanBuilder::empty(true).build()?, select_exprs));
+        }
+
        let plan = LogicalPlanBuilder::from(input)
            .aggregate(aggr_group_by_exprs.clone(), aggr_exprs.clone())?
            .build()?;
@ -1960,6 +1976,29 @@ mod test {
                      TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
                "###);

+                // Aggregate expression selecting non-existent field
+                assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
+                Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
+                  Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
+                    EmptyRelation []
+                "###);
+
+                // Aggregate expression with GROUP BY and non-existent field
+                assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data GROUP BY foo"), @r###"
+                Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
+                  Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
+                    Aggregate: groupBy=[[data.foo]], aggr=[[]] [foo:Dictionary(Int32, Utf8);N]
+                      TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
+                "###);
+
+                // Aggregate expression selecting tag, should treat as non-existent
+                assert_snapshot!(plan("SELECT MEAN(f64_field), MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
+                Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
+                  Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, AVG(data.f64_field) AS mean, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
+                    Aggregate: groupBy=[[]], aggr=[[AVG(data.f64_field)]] [AVG(data.f64_field):Float64;N]
+                      TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
+                "###);
+
                // Fallible

                // Cannot combine aggregate and non-aggregate columns in the projection