fix: Incorrect results when planning aggregate expressions (#7431)
* fix: Incorrect results when mixing non-existent fields in aggregates * chore: Improve comments around aggregatespull/24376/head
parent
839816abf1
commit
a1b29b3ebb
|
@ -307,12 +307,20 @@ SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk;
|
|||
-- IOX_COMPARE: sorted
|
||||
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none);
|
||||
|
||||
-- TODO(sgc): The following two queries produce incorrect results
|
||||
-- See: https://github.com/influxdata/influxdb_iox/issues/7361
|
||||
-- using aggregates across measurements
|
||||
-- SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
|
||||
-- using aggregates with missing fields
|
||||
-- SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
|
||||
SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
|
||||
-- using aggregates with missing fields, should return a single, null value for the row
|
||||
SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
|
||||
-- should return the mean result and a null result for the second column
|
||||
SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
|
||||
-- should return no results
|
||||
SELECT MEAN(foo) FROM cpu;
|
||||
-- should return a row for each tag key
|
||||
SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
-- should return a row result for each tag key
|
||||
SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
-- should return nothing
|
||||
SELECT MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
|
||||
SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
|
||||
-- supports offset parameter
|
||||
|
|
|
@ -645,6 +645,47 @@ Error while planning query: Error during planning: invalid number of arguments f
|
|||
| disk | 2022-10-31T02:00:00Z | | 2234.0 |
|
||||
| disk | 2022-10-31T02:00:10Z | | 2239.0 |
|
||||
+------------------+----------------------+--------------------+--------+
|
||||
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(bytes_free) FROM cpu, disk;
|
||||
+------------------+----------------------+---------------------------------+
|
||||
| iox::measurement | time | mean_usage_idle_mean_bytes_free |
|
||||
+------------------+----------------------+---------------------------------+
|
||||
| cpu | 1970-01-01T00:00:00Z | |
|
||||
| disk | 1970-01-01T00:00:00Z | |
|
||||
+------------------+----------------------+---------------------------------+
|
||||
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu;
|
||||
+------------------+----------------------+--------------------------+
|
||||
| iox::measurement | time | mean_usage_idle_mean_foo |
|
||||
+------------------+----------------------+--------------------------+
|
||||
| cpu | 1970-01-01T00:00:00Z | |
|
||||
+------------------+----------------------+--------------------------+
|
||||
-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu;
|
||||
+------------------+----------------------+--------------------+--------------------------+
|
||||
| iox::measurement | time | mean | mean_usage_idle_mean_foo |
|
||||
+------------------+----------------------+--------------------+--------------------------+
|
||||
| cpu | 1970-01-01T00:00:00Z | 1.9850000000000003 | |
|
||||
+------------------+----------------------+--------------------+--------------------------+
|
||||
-- InfluxQL: SELECT MEAN(foo) FROM cpu;
|
||||
++
|
||||
++
|
||||
-- InfluxQL: SELECT MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
+------------------+----------------------+-----------+--------------------------+
|
||||
| iox::measurement | time | cpu | mean_usage_idle_mean_foo |
|
||||
+------------------+----------------------+-----------+--------------------------+
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu-total | |
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu0 | |
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu1 | |
|
||||
+------------------+----------------------+-----------+--------------------------+
|
||||
-- InfluxQL: SELECT MEAN(usage_idle), MEAN(usage_idle) + MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
+------------------+----------------------+-----------+--------------------+--------------------------+
|
||||
| iox::measurement | time | cpu | mean | mean_usage_idle_mean_foo |
|
||||
+------------------+----------------------+-----------+--------------------+--------------------------+
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu-total | 2.9850000000000003 | |
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu0 | 0.985 | |
|
||||
| cpu | 1970-01-01T00:00:00Z | cpu1 | 1.9849999999999999 | |
|
||||
+------------------+----------------------+-----------+--------------------+--------------------------+
|
||||
-- InfluxQL: SELECT MEAN(foo) FROM cpu GROUP BY cpu;
|
||||
++
|
||||
++
|
||||
-- InfluxQL: SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
|
||||
+------------------+----------------------+-------+------+
|
||||
| iox::measurement | time | count | sum |
|
||||
|
|
|
@ -110,7 +110,7 @@ struct Context<'a> {
|
|||
scope: ExprScope,
|
||||
tz: Option<Tz>,
|
||||
|
||||
/// `true` if the query projects aggregates.
|
||||
/// `true` if the query projection specifies aggregate expressions.
|
||||
is_aggregate: bool,
|
||||
|
||||
// GROUP BY information
|
||||
|
@ -439,6 +439,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
|
|||
group_by_tag_set: &[&str],
|
||||
schemas: &Schemas,
|
||||
) -> Result<(LogicalPlan, Vec<Expr>)> {
|
||||
if !ctx.is_aggregate {
|
||||
return Ok((input, select_exprs));
|
||||
}
|
||||
|
||||
let Some(time_column_index) = find_time_column_index(fields) else {
|
||||
return Err(DataFusionError::Internal("unable to find time column".to_owned()))
|
||||
};
|
||||
|
@ -452,10 +456,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
|
|||
// will produce two aggregate expressions:
|
||||
//
|
||||
// [SUM(foo), COUNT(foo)]
|
||||
//
|
||||
// NOTE:
|
||||
//
|
||||
// It is possible this vector is empty, when all the fields in the
|
||||
// projection refer to columns that do not exist in the current
|
||||
// table.
|
||||
let aggr_exprs = find_aggregate_exprs(&select_exprs);
|
||||
if aggr_exprs.is_empty() {
|
||||
return Ok((input, select_exprs));
|
||||
}
|
||||
|
||||
let aggr_group_by_exprs = if let Some(group_by) = ctx.group_by {
|
||||
let mut group_by_exprs = Vec::new();
|
||||
|
@ -483,6 +490,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
|
|||
vec![]
|
||||
};
|
||||
|
||||
if aggr_exprs.is_empty() && aggr_group_by_exprs.is_empty() {
|
||||
// If there are no aggregate expressions in the projection, because
|
||||
// they all referred to non-existent columns in the table, and there
|
||||
// is no GROUP BY, the result set is a single row.
|
||||
//
|
||||
// This is required for InfluxQL compatibility.
|
||||
return Ok((LogicalPlanBuilder::empty(true).build()?, select_exprs));
|
||||
}
|
||||
|
||||
let plan = LogicalPlanBuilder::from(input)
|
||||
.aggregate(aggr_group_by_exprs.clone(), aggr_exprs.clone())?
|
||||
.build()?;
|
||||
|
@ -1960,6 +1976,29 @@ mod test {
|
|||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
||||
// Aggregate expression selecting non-existent field
|
||||
assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean_f64_field_mean_non_existent:Null;N]
|
||||
EmptyRelation []
|
||||
"###);
|
||||
|
||||
// Aggregate expression with GROUP BY and non-existent field
|
||||
assert_snapshot!(plan("SELECT MEAN(f64_field) + MEAN(non_existent) FROM data GROUP BY foo"), @r###"
|
||||
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, mean_f64_field_mean_non_existent:Null;N]
|
||||
Aggregate: groupBy=[[data.foo]], aggr=[[]] [foo:Dictionary(Int32, Utf8);N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
||||
// Aggregate expression selecting tag, should treat as non-existent
|
||||
assert_snapshot!(plan("SELECT MEAN(f64_field), MEAN(f64_field) + MEAN(non_existent) FROM data"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, AVG(data.f64_field) AS mean, NULL AS mean_f64_field_mean_non_existent [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), mean:Float64;N, mean_f64_field_mean_non_existent:Null;N]
|
||||
Aggregate: groupBy=[[]], aggr=[[AVG(data.f64_field)]] [AVG(data.f64_field):Float64;N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
||||
// Fallible
|
||||
|
||||
// Cannot combine aggregate and non-aggregate columns in the projection
|
||||
|
|
Loading…
Reference in New Issue