From 4dc5cc46a98c204333180758c3347ddd3809212e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 2 Dec 2020 17:52:49 +0000 Subject: [PATCH] refactor: DRY up the predicate logic --- segment_store/src/column/cmp.rs | 2 +- segment_store/src/segment.rs | 118 ++++++++++---------------------- 2 files changed, 39 insertions(+), 81 deletions(-) diff --git a/segment_store/src/column/cmp.rs b/segment_store/src/column/cmp.rs index 504e62f2f0..5d957235cf 100644 --- a/segment_store/src/column/cmp.rs +++ b/segment_store/src/column/cmp.rs @@ -1,5 +1,5 @@ /// Possible comparison operators -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub enum Operator { Equal, NotEqual, diff --git a/segment_store/src/segment.rs b/segment_store/src/segment.rs index 6b273dba42..4e532950c6 100644 --- a/segment_store/src/segment.rs +++ b/segment_store/src/segment.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::{borrow::Cow, collections::BTreeMap}; use arrow_deps::arrow::datatypes::SchemaRef; @@ -208,13 +208,9 @@ impl Segment { } } - // Determines the set of row ids that satisfy the time range and all of the - // optional predicates. - // - // TODO(edd): right now `time_range` is special cased so we can use the - // optimised execution path in the column to filter on a range. However, - // eventually we should be able to express the time range as just another - // one or two predicates. + // Determines the set of row ids that satisfy the provided predicates. + // If `predicates` contains two predicates on the time column they are + // special-cased. fn row_ids_from_predicates(&self, predicates: &[Predicate<'_>]) -> RowIDsOption { // TODO(edd): perf - potentially pool this so we can re-use it once // rows have been materialised and it's no longer needed. @@ -228,6 +224,7 @@ impl Segment { // segment but not re-used for subsequent calls _to_ the segment. let mut dst = RowIDs::new_bitmap(); + let mut predicates = Cow::Borrowed(predicates); // If there is a time-range in the predicates (two time predicates), then // execute an optimised version that will use a range based predicate // on the time column. @@ -237,10 +234,34 @@ impl Segment { .count() == 2 { - return self.row_ids_from_predicates_with_time_range(predicates, dst, result_row_ids); + // Apply optimised filtering to time column + let time_pred_row_ids = + self.row_ids_from_predicates_with_time_range(predicates.as_ref(), dst); + match time_pred_row_ids { + // No matching rows based on time range + RowIDsOption::None(_) => return time_pred_row_ids, + + // all rows match - continue to apply other predicates + RowIDsOption::All(_dst) => { + dst = _dst; // hand buffer back + } + + // some rows match - continue to apply predicates + RowIDsOption::Some(row_ids) => { + // fill the result row id set with the matching rows from the + // time column. + result_row_ids.union(&row_ids); + dst = row_ids // hand buffer back + } + } + + // remove time predicates so they're not processed again + let mut filtered_predicates = predicates.to_vec(); + filtered_predicates.retain(|(col, _)| *col != TIME_COLUMN_NAME); + predicates = Cow::Owned(filtered_predicates); } - for (col_name, (op, value)) in predicates { + for (col_name, (op, value)) in predicates.iter() { // N.B column should always exist because validation of // predicates is not the responsibility of the `Segment`. let col = self.column_by_name(col_name).unwrap(); @@ -273,24 +294,19 @@ impl Segment { } if result_row_ids.is_empty() { - // All rows matched all predicates - return the empty buffer. + // All rows matched all predicates because any predictates not matching + // any rows would have resulted in an early return. return RowIDsOption::All(result_row_ids); } RowIDsOption::Some(result_row_ids) } - // Determines the set of row ids that satisfy the time range and all of the - // optional predicates. - // - // TODO(edd): right now `time_range` is special cased so we can use the - // optimised execution path in the column to filter on a range. However, - // eventually we should be able to express the time range as just another - // one or two predicates. + // An optimised function for applying two comparison predicates to a time + // column at once. fn row_ids_from_predicates_with_time_range( &self, predicates: &[Predicate<'_>], - mut dst: RowIDs, - mut result_row_ids: RowIDs, + dst: RowIDs, ) -> RowIDsOption { // find the time range predicates and execute a specialised range based // row id lookup. @@ -300,69 +316,11 @@ impl Segment { .collect::>(); assert!(time_predicates.len() == 2); - let time_row_ids = self.time_column().row_ids_filter_range( + self.time_column().row_ids_filter_range( &time_predicates[0].1, // min time &time_predicates[1].1, // max time dst, - ); - - match time_row_ids { - // No matching rows based on time range - return buffer - RowIDsOption::None(_) => return time_row_ids, - - // all rows match - continue to apply predicates - RowIDsOption::All(_dst) => { - dst = _dst; // hand buffer back - } - - // some rows match - continue to apply predicates - RowIDsOption::Some(row_ids) => { - // union empty result set with matching timestamp rows - result_row_ids.union(&row_ids); - dst = row_ids // hand buffer back - } - } - - for (col_name, (op, value)) in predicates { - if col_name == &TIME_COLUMN_NAME { - continue; // we already processed the time column as a special case. - } - // N.B column should always exist because validation of - // predicates should happen at the `Table` level. - let col = self.column_by_name(col_name).unwrap(); - - // Explanation of how this buffer pattern works here. The idea is - // that the buffer should be returned to the caller so it can be - // re-used on other columns. To do that we need to hand the buffer - // back even if we haven't populated it with any results. - match col.row_ids_filter(op, value, dst) { - // No rows will be returned for the segment because this column - // doe not match any rows. - RowIDsOption::None(_dst) => return RowIDsOption::None(_dst), - - // Intersect the row ids found at this column with all those - // found on other column predicates. - RowIDsOption::Some(row_ids) => { - if result_row_ids.is_empty() { - result_row_ids.union(&row_ids) - } - result_row_ids.intersect(&row_ids); - dst = row_ids; // hand buffer back - } - - // This is basically a no-op because all rows match the - // predicate on this column. - RowIDsOption::All(_dst) => { - dst = _dst; // hand buffer back - } - } - } - - if result_row_ids.is_empty() { - // All rows matched all predicates - return the empty buffer. - return RowIDsOption::All(result_row_ids); - } - RowIDsOption::Some(result_row_ids) + ) } }