refactor: `IOxReadFilterNode` can always accumulate statistics (#5954)

* refactor: `IOxReadFilterNode` can always accumulate statistics

`IOxReadFilterNode` used to not emit statistics if one chunk has
duplicates or delete predicates. This is wrong (or at least overly
conservative), because the node itself (or the chunks themselves) do NOT
perform dedup or delete predicate filtering. Instead, this is done
by parent nodes (`DeduplicateExec` and `FilterExec`), and it's their
job to propagate statistics correctly.

Helps w/ #5897.

* test: explain setup

Co-authored-by: Andrew Lamb <alamb@influxdata.com>

Co-authored-by: Andrew Lamb <alamb@influxdata.com>
pull/24376/head
Marco Neumann 2022-10-24 13:34:22 +00:00 committed by GitHub
parent 0d4d7e266d
commit 1d440ddb2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 22 additions and 15 deletions

View File

@ -164,7 +164,8 @@ async fn query_after_persist_sees_new_files() {
],
},
// write another parquet file
Step::WriteLineProtocol(setup.lp_to_force_persistence()),
// that has non duplicated data
Step::WriteLineProtocol(setup.lp_to_force_persistence().replace("tag=A", "tag=B")),
Step::WaitForPersisted,
// query should correctly see the data in the second parquet file
Step::Query {

View File

@ -237,15 +237,9 @@ impl ExecutionPlan for DeduplicateExec {
}
fn statistics(&self) -> Statistics {
// TODO: we should account for overlaps at this point -- if
// there is overlap across the chunks, we probably can't
// provide exact statistics without more work
let is_exact = true;
// for now, pass on the input statistics but note they can not
// be exact
// use a guess from our input but they are NOT exact
Statistics {
is_exact,
is_exact: false,
..self.input.statistics()
}
}

View File

@ -174,12 +174,6 @@ impl ExecutionPlan for IOxReadFilterNode {
fn statistics(&self) -> Statistics {
let mut combined_summary_option: Option<TableSummary> = None;
for chunk in &self.chunks {
if chunk.has_delete_predicates() || chunk.may_contain_pk_duplicates() {
// Not use statistics if there is at least one delete predicate or
// if chunk may have duplicates
return Statistics::default();
}
combined_summary_option = match combined_summary_option {
None => Some(
chunk

View File

@ -83,3 +83,9 @@
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | |
+---------------+---------------------------------------------------------------------------------+
-- SQL: select count(*) from h2o;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 18 |
+-----------------+

View File

@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
-- Union plan
EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
select count(*) from h2o;

View File

@ -67,3 +67,9 @@
| | IOxReadFilterNode: table_name=h2o, chunks=2 predicate=Predicate |
| | |
+---------------+---------------------------------------------------------------------------------+
-- SQL: select count(*) from h2o;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 18 |
+-----------------+

View File

@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
-- Union plan
EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
select count(*) from h2o;