refactor: `IOxReadFilterNode` can always accumulate statistics (#5954)

* refactor: `IOxReadFilterNode` can always accumulate statistics. `IOxReadFilterNode` used to not emit statistics if one chunk has duplicates or delete predicates. This is wrong (or at least overly conservative), because the node itself (or the chunks themselves) do NOT perform dedup or delete predicate filtering. Instead, this is done by parent nodes (`DeduplicateExec` and `FilterExec`), and it's their job to propagate statistics correctly. Helps w/ #5897. * test: explain setup Co-authored-by: Andrew Lamb <alamb@influxdata.com> Co-authored-by: Andrew Lamb <alamb@influxdata.com>
2022-10-24 13:34:22 +00:00 · 2022-10-24 13:34:22 +00:00 · 1d440ddb2d
parent 0d4d7e266d
commit 1d440ddb2d
7 changed files with 22 additions and 15 deletions
--- a/influxdb_iox/tests/end_to_end_cases/querier.rs
+++ b/influxdb_iox/tests/end_to_end_cases/querier.rs
@ -164,7 +164,8 @@ async fn query_after_persist_sees_new_files() {
            ],
        },
        // write another parquet file
-        Step::WriteLineProtocol(setup.lp_to_force_persistence()),
+        // that has non duplicated data
+        Step::WriteLineProtocol(setup.lp_to_force_persistence().replace("tag=A", "tag=B")),
        Step::WaitForPersisted,
        // query should correctly see the data in the second parquet file
        Step::Query {
--- a/iox_query/src/provider/deduplicate.rs
+++ b/iox_query/src/provider/deduplicate.rs
@ -237,15 +237,9 @@ impl ExecutionPlan for DeduplicateExec {
    }

    fn statistics(&self) -> Statistics {
-        // TODO: we should acount for overlaps at this point -- if
-        // there is overlap across the chunks, we probably can't
-        // provide exact statistics without more work
-        let is_exact = true;
-
-        // for now, pass on the input statistics but note they can not
-        // be exact
+        // use a guess from our input but they are NOT exact
        Statistics {
-            is_exact,
+            is_exact: false,
            ..self.input.statistics()
        }
    }
--- a/iox_query/src/provider/physical.rs
+++ b/iox_query/src/provider/physical.rs
@ -174,12 +174,6 @@ impl ExecutionPlan for IOxReadFilterNode {
    fn statistics(&self) -> Statistics {
        let mut combined_summary_option: Option<TableSummary> = None;
        for chunk in &self.chunks {
-            if chunk.has_delete_predicates() || chunk.may_contain_pk_duplicates() {
-                // Not use statistics if there is at least one delete predicate or
-                // if chunk may have duplicates
-                return Statistics::default();
-            }
-
            combined_summary_option = match combined_summary_option {
                None => Some(
                    chunk
--- a/query_tests/cases/in/duplicates_ingester.expected
+++ b/query_tests/cases/in/duplicates_ingester.expected
@ -83,3 +83,9 @@
 |               |       IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate           |
 |               |                                                                                 |
 +---------------+---------------------------------------------------------------------------------+
+-- SQL: select count(*) from h2o;
+-----------------+
+| COUNT(UInt8(1)) |
+-----------------+
+| 18              |
+-----------------+
--- a/query_tests/cases/in/duplicates_ingester.sql
+++ b/query_tests/cases/in/duplicates_ingester.sql
@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;

 -- Union plan
 EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+
+-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
+select count(*) from h2o;
--- a/query_tests/cases/in/duplicates_parquet.expected
+++ b/query_tests/cases/in/duplicates_parquet.expected
@ -67,3 +67,9 @@
 |               |       IOxReadFilterNode: table_name=h2o, chunks=2 predicate=Predicate           |
 |               |                                                                                 |
 +---------------+---------------------------------------------------------------------------------+
+-- SQL: select count(*) from h2o;
+-----------------+
+| COUNT(UInt8(1)) |
+-----------------+
+| 18              |
+-----------------+
--- a/query_tests/cases/in/duplicates_parquet.sql
+++ b/query_tests/cases/in/duplicates_parquet.sql
@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;

 -- Union plan
 EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+
+-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
+select count(*) from h2o;