From 1d440ddb2dcf6c53e12f9fb20c8e561f353f9b1f Mon Sep 17 00:00:00 2001
From: Marco Neumann <marco@crepererum.net>
Date: Mon, 24 Oct 2022 13:34:22 +0000
Subject: [PATCH] refactor: `IOxReadFilterNode` can always accumulate
 statistics (#5954)

* refactor: `IOxReadFilterNode` can always accumulate statistics

`IOxReadFilterNode` used to not emit statistics if one chunk has
duplicates or delete predicates. This is wrong (or at least overly
conservative), because the node itself (or the chunks themselves) do NOT
perform dedup or delete predicate filtering. Instead this is done is
done by parent nodes (`DeduplicateExec` and `FilterExec`) and its their
job to propagate statistics correctly.

Helps w/ #5897.

* test: explain setup

Co-authored-by: Andrew Lamb <alamb@influxdata.com>

Co-authored-by: Andrew Lamb <alamb@influxdata.com>
---
 influxdb_iox/tests/end_to_end_cases/querier.rs    |  3 ++-
 iox_query/src/provider/deduplicate.rs             | 10 ++--------
 iox_query/src/provider/physical.rs                |  6 ------
 query_tests/cases/in/duplicates_ingester.expected |  6 ++++++
 query_tests/cases/in/duplicates_ingester.sql      |  3 +++
 query_tests/cases/in/duplicates_parquet.expected  |  6 ++++++
 query_tests/cases/in/duplicates_parquet.sql       |  3 +++
 7 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/influxdb_iox/tests/end_to_end_cases/querier.rs b/influxdb_iox/tests/end_to_end_cases/querier.rs
index 38d22746e2..67879c99cc 100644
--- a/influxdb_iox/tests/end_to_end_cases/querier.rs
+++ b/influxdb_iox/tests/end_to_end_cases/querier.rs
@@ -164,7 +164,8 @@ async fn query_after_persist_sees_new_files() {
             ],
         },
         // write another parquet file
-        Step::WriteLineProtocol(setup.lp_to_force_persistence()),
+        // that has non duplicated data
+        Step::WriteLineProtocol(setup.lp_to_force_persistence().replace("tag=A", "tag=B")),
         Step::WaitForPersisted,
         // query should correctly see the data in the second parquet file
         Step::Query {
diff --git a/iox_query/src/provider/deduplicate.rs b/iox_query/src/provider/deduplicate.rs
index 57ff8cdca4..2f765e552d 100644
--- a/iox_query/src/provider/deduplicate.rs
+++ b/iox_query/src/provider/deduplicate.rs
@@ -237,15 +237,9 @@ impl ExecutionPlan for DeduplicateExec {
     }
 
     fn statistics(&self) -> Statistics {
-        // TODO: we should acount for overlaps at this point -- if
-        // there is overlap across the chunks, we probably can't
-        // provide exact statistics without more work
-        let is_exact = true;
-
-        // for now, pass on the input statistics but note they can not
-        // be exact
+        // use a guess from our input but they are NOT exact
         Statistics {
-            is_exact,
+            is_exact: false,
             ..self.input.statistics()
         }
     }
diff --git a/iox_query/src/provider/physical.rs b/iox_query/src/provider/physical.rs
index e283e1dfaa..1672dbc70c 100644
--- a/iox_query/src/provider/physical.rs
+++ b/iox_query/src/provider/physical.rs
@@ -174,12 +174,6 @@ impl ExecutionPlan for IOxReadFilterNode {
     fn statistics(&self) -> Statistics {
         let mut combined_summary_option: Option<TableSummary> = None;
         for chunk in &self.chunks {
-            if chunk.has_delete_predicates() || chunk.may_contain_pk_duplicates() {
-                // Not use statistics if there is at least one delete predicate or
-                // if chunk may have duplicates
-                return Statistics::default();
-            }
-
             combined_summary_option = match combined_summary_option {
                 None => Some(
                     chunk
diff --git a/query_tests/cases/in/duplicates_ingester.expected b/query_tests/cases/in/duplicates_ingester.expected
index fcd47819ad..25cc6e96e8 100644
--- a/query_tests/cases/in/duplicates_ingester.expected
+++ b/query_tests/cases/in/duplicates_ingester.expected
@@ -83,3 +83,9 @@
 |               |       IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate           |
 |               |                                                                                 |
 +---------------+---------------------------------------------------------------------------------+
+-- SQL: select count(*) from h2o;
++-----------------+
+| COUNT(UInt8(1)) |
++-----------------+
+| 18              |
++-----------------+
diff --git a/query_tests/cases/in/duplicates_ingester.sql b/query_tests/cases/in/duplicates_ingester.sql
index 9ea662b11f..c906978c7b 100644
--- a/query_tests/cases/in/duplicates_ingester.sql
+++ b/query_tests/cases/in/duplicates_ingester.sql
@@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
 
 -- Union plan
 EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+
+-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
+select count(*) from h2o;
diff --git a/query_tests/cases/in/duplicates_parquet.expected b/query_tests/cases/in/duplicates_parquet.expected
index cfb8a814b9..29f0d42e2a 100644
--- a/query_tests/cases/in/duplicates_parquet.expected
+++ b/query_tests/cases/in/duplicates_parquet.expected
@@ -67,3 +67,9 @@
 |               |       IOxReadFilterNode: table_name=h2o, chunks=2 predicate=Predicate           |
 |               |                                                                                 |
 +---------------+---------------------------------------------------------------------------------+
+-- SQL: select count(*) from h2o;
++-----------------+
+| COUNT(UInt8(1)) |
++-----------------+
+| 18              |
++-----------------+
diff --git a/query_tests/cases/in/duplicates_parquet.sql b/query_tests/cases/in/duplicates_parquet.sql
index 4b0e5f6fd0..b7f15e1b44 100644
--- a/query_tests/cases/in/duplicates_parquet.sql
+++ b/query_tests/cases/in/duplicates_parquet.sql
@@ -10,3 +10,6 @@ EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
 
 -- Union plan
 EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+
+-- count(*) plan that ensures that row count statistics are not used (because we don't know how many rows overlap)
+select count(*) from h2o;