fix: Incorrect results when planning aggregate expressions (#7431)

* fix: Incorrect results when mixing non-existent fields in aggregates

* chore: Improve comments around aggregates
pull/24376/head
Stuart Carnie 2023-04-04 10:19:50 +10:00 committed by GitHub
parent 839816abf1
commit a1b29b3ebb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 97 additions and 9 deletions

View File

@ -307,12 +307,20 @@ SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk;
-- IOX_COMPARE: sorted
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none);
-- TODO(sgc): The following two queries produce incorrect results
-- See: https://github.com/influxdata/influxdb_iox/issues/7361
-- using aggregates across measurements
-- SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
-- using aggregates with missing fields
-- SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
-- using aggregates with missing fields, should return a single, null value for the row
SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
-- should return the mean result and a null result for the second column
SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
-- should return no results
SELECT MEAN(foo) FROM cpu;
-- should return a row for each tag key
SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
-- should return a row result for each tag key
SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
-- should return nothing
SELECT MEAN(foo) FROM cpu GROUP BY cpu;
SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
-- supports offset parameter

View File

@ -645,6 +645,47 @@ Error while planning query: Error during planning: invalid number of arguments f
| disk | 2022-10-31T02:00:00Z | | 2234.0 |
| disk | 2022-10-31T02:00:10Z | | 2239.0 |
+------------------+----------------------+--------------------+--------+
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
+------------------+----------------------+---------------------------------+
| iox::measurement | time | mean_usage_idle_mean_bytes_free |
+------------------+----------------------+---------------------------------+
| cpu | 1970-01-01T00:00:00Z | |
| disk | 1970-01-01T00:00:00Z | |
+------------------+----------------------+---------------------------------+
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
+------------------+----------------------+--------------------------+
| iox::measurement | time | mean_usage_idle_mean_foo |
+------------------+----------------------+--------------------------+
| cpu | 1970-01-01T00:00:00Z | |
+------------------+----------------------+--------------------------+
-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
+------------------+----------------------+--------------------+--------------------------+
| iox::measurement | time | mean | mean_usage_idle_mean_foo |
+------------------+----------------------+--------------------+--------------------------+
| cpu | 1970-01-01T00:00:00Z | 1.9850000000000003 | |
+------------------+----------------------+--------------------+--------------------------+
-- InfluxQL: SELECT MEAN(foo) FROM cpu;
++
++
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+------------------+----------------------+-----------+--------------------------+
| iox::measurement | time | cpu | mean_usage_idle_mean_foo |
+------------------+----------------------+-----------+--------------------------+
| cpu | 1970-01-01T00:00:00Z | cpu-total | |
| cpu | 1970-01-01T00:00:00Z | cpu0 | |
| cpu | 1970-01-01T00:00:00Z | cpu1 | |
+------------------+----------------------+-----------+--------------------------+
-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
+------------------+----------------------+-----------+--------------------+--------------------------+
| iox::measurement | time | cpu | mean | mean_usage_idle_mean_foo |
+------------------+----------------------+-----------+--------------------+--------------------------+
| cpu | 1970-01-01T00:00:00Z | cpu-total | 2.9850000000000003 | |
| cpu | 1970-01-01T00:00:00Z | cpu0 | 0.985 | |
| cpu | 1970-01-01T00:00:00Z | cpu1 | 1.9849999999999999 | |
+------------------+----------------------+-----------+--------------------+--------------------------+
-- InfluxQL: SELECT MEAN(foo) FROM cpu GROUP BY cpu;
++
++
-- InfluxQL: SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
+------------------+----------------------+-------+------+
| iox::measurement | time | count | sum |

View File

@ -110,7 +110,7 @@ struct Context<'a> {
scope: ExprScope,
tz: Option<Tz>,
/// `true` if the query projects aggregates.
/// `true` if the query projection specifies aggregate expressions.
is_aggregate: bool,
// GROUP BY information
@ -439,6 +439,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
group_by_tag_set: &[&str],
schemas: &Schemas,
) -> Result<(LogicalPlan, Vec<Expr>)> {
if !ctx.is_aggregate {
return Ok((input, select_exprs));
}
let Some(time_column_index) = find_time_column_index(fields) else {
return Err(DataFusionError::Internal("unable to find time column".to_owned()))
};
@ -452,10 +456,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
// will produce two aggregate expressions:
//
// [SUM(foo), COUNT(foo)]
//
// NOTE:
//
// It is possible this vector is empty, when all the fields in the
// projection refer to columns that do not exist in the current
// table.
let aggr_exprs = find_aggregate_exprs(&select_exprs);
if aggr_exprs.is_empty() {
return Ok((input, select_exprs));
}
let aggr_group_by_exprs = if let Some(group_by) = ctx.group_by {
let mut group_by_exprs = Vec::new();
@ -483,6 +490,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
vec![]
};
if aggr_exprs.is_empty() && aggr_group_by_exprs.is_empty() {
// If there are no aggregate expressions in the projection, because
// they all referred to non-existent columns in the table, and there
// is no GROUP BY, the result set is a single row.
//
// This is required for InfluxQL compatibility.
return Ok((LogicalPlanBuilder::empty(true).build()?, select_exprs));
}
let plan = LogicalPlanBuilder::from(input)
.aggregate(aggr_group_by_exprs.clone(), aggr_exprs.clone())?
.build()?;
@ -1960,6 +1976,29 @@ mod test {
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
// Aggregate expression selecting non-existent field
assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
EmptyRelation []
"###);
// Aggregate expression with GROUP BY and non-existent field
assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data GROUP BY foo"), @r###"
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
Aggregate: groupBy=[[data.foo]], aggr=[[]] [foo:Dictionary(Int32, Utf8);N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
// Aggregate expression selecting tag, should treat as non-existent
assert_snapshot!(plan("SELECT MEAN(f64_field), MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, AVG(data.f64_field) AS mean, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
Aggregate: groupBy=[[]], aggr=[[AVG(data.f64_field)]] [AVG(data.f64_field):Float64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
// Fallible
// Cannot combine aggregate and non-aggregate columns in the projection