From 1d440ddb2dcf6c53e12f9fb20c8e561f353f9b1f Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 24 Oct 2022 13:34:22 +0000 Subject: [PATCH] refactor: `IOxReadFilterNode` can always accumulate statistics (#5954) * refactor: `IOxReadFilterNode` can always accumulate statistics `IOxReadFilterNode` used to not emit statistics if one chunk has duplicates or delete predicates. This is wrong (or at least overly conservative), because the node itself (or the chunks themselves) do NOT perform dedup or delete predicate filtering. Instead this is done is done by parent nodes (`DeduplicateExec` and `FilterExec`) and its their job to propagate statistics correctly. Helps w/ #5897. * test: explain setup Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- influxdb_iox/tests/end_to_end_cases/querier.rs | 3 ++- iox_query/src/provider/deduplicate.rs | 10 ++-------- iox_query/src/provider/physical.rs | 6 ------ query_tests/cases/in/duplicates_ingester.expected | 6 ++++++ query_tests/cases/in/duplicates_ingester.sql | 3 +++ query_tests/cases/in/duplicates_parquet.expected | 6 ++++++ query_tests/cases/in/duplicates_parquet.sql | 3 +++ 7 files changed, 22 insertions(+), 15 deletions(-) diff --git a/influxdb_iox/tests/end_to_end_cases/querier.rs b/influxdb_iox/tests/end_to_end_cases/querier.rs index 38d22746e2..67879c99cc 100644 --- a/influxdb_iox/tests/end_to_end_cases/querier.rs +++ b/influxdb_iox/tests/end_to_end_cases/querier.rs @@ -164,7 +164,8 @@ async fn query_after_persist_sees_new_files() { ], }, // write another parquet file - Step::WriteLineProtocol(setup.lp_to_force_persistence()), + // that has non duplicated data + Step::WriteLineProtocol(setup.lp_to_force_persistence().replace("tag=A", "tag=B")), Step::WaitForPersisted, // query should correctly see the data in the second parquet file Step::Query { diff --git a/iox_query/src/provider/deduplicate.rs b/iox_query/src/provider/deduplicate.rs index 57ff8cdca4..2f765e552d 100644 --- a/iox_query/src/provider/deduplicate.rs +++ b/iox_query/src/provider/deduplicate.rs @@ -237,15 +237,9 @@ impl ExecutionPlan for DeduplicateExec { } fn statistics(&self) -> Statistics { - // TODO: we should acount for overlaps at this point -- if - // there is overlap across the chunks, we probably can't - // provide exact statistics without more work - let is_exact = true; - - // for now, pass on the input statistics but note they can not - // be exact + // use a guess from our input but they are NOT exact Statistics { - is_exact, + is_exact: false, ..self.input.statistics() } } diff --git a/iox_query/src/provider/physical.rs b/iox_query/src/provider/physical.rs index e283e1dfaa..1672dbc70c 100644 --- a/iox_query/src/provider/physical.rs +++ b/iox_query/src/provider/physical.rs @@ -174,12 +174,6 @@ impl ExecutionPlan for IOxReadFilterNode { fn statistics(&self) -> Statistics { let mut combined_summary_option: Option = None; for chunk in &self.chunks { - if chunk.has_delete_predicates() || chunk.may_contain_pk_duplicates() { - // Not use statistics if there is at least one delete predicate or - // if chunk may have duplicates - return Statistics::default(); - } - combined_summary_option = match combined_summary_option { None => Some( chunk diff --git a/query_tests/cases/in/duplicates_ingester.expected b/query_tests/cases/in/duplicates_ingester.expected index fcd47819ad..25cc6e96e8 100644 --- a/query_tests/cases/in/duplicates_ingester.expected +++ b/query_tests/cases/in/duplicates_ingester.expected @@ -83,3 +83,9 @@ | | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | | | | +---------------+---------------------------------------------------------------------------------+ +-- SQL: select count(*) from h2o; ++-----------------+ +| COUNT(UInt8(1)) | ++-----------------+ +| 18 | ++-----------------+ diff --git a/query_tests/cases/in/duplicates_ingester.sql b/query_tests/cases/in/duplicates_ingester.sql index 9ea662b11f..c906978c7b 100644 --- a/query_tests/cases/in/duplicates_ingester.sql +++ b/query_tests/cases/in/duplicates_ingester.sql @@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; -- Union plan EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; + +-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap) +select count(*) from h2o; diff --git a/query_tests/cases/in/duplicates_parquet.expected b/query_tests/cases/in/duplicates_parquet.expected index cfb8a814b9..29f0d42e2a 100644 --- a/query_tests/cases/in/duplicates_parquet.expected +++ b/query_tests/cases/in/duplicates_parquet.expected @@ -67,3 +67,9 @@ | | IOxReadFilterNode: table_name=h2o, chunks=2 predicate=Predicate | | | | +---------------+---------------------------------------------------------------------------------+ +-- SQL: select count(*) from h2o; ++-----------------+ +| COUNT(UInt8(1)) | ++-----------------+ +| 18 | ++-----------------+ diff --git a/query_tests/cases/in/duplicates_parquet.sql b/query_tests/cases/in/duplicates_parquet.sql index 4b0e5f6fd0..b7f15e1b44 100644 --- a/query_tests/cases/in/duplicates_parquet.sql +++ b/query_tests/cases/in/duplicates_parquet.sql @@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; -- Union plan EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; + +-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap) +select count(*) from h2o;