From c4176a3f46058d3d4dcb21cd950c36ccb7ff926a Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Oct 2020 12:25:54 +0000 Subject: [PATCH 1/6] refactor: better API for row ids --- .../src/column/dictionary.rs | 119 +++--------- delorean_segment_store/src/column/fixed.rs | 183 ++++++++++-------- .../src/column/fixed_null.rs | 4 + 3 files changed, 136 insertions(+), 170 deletions(-) diff --git a/delorean_segment_store/src/column/dictionary.rs b/delorean_segment_store/src/column/dictionary.rs index c855540a52..a6569d842c 100644 --- a/delorean_segment_store/src/column/dictionary.rs +++ b/delorean_segment_store/src/column/dictionary.rs @@ -193,6 +193,11 @@ impl RLE { self.num_rows } + /// Determine if NULL is encoded in the column. + pub fn contains_null(&self) -> bool { + self.contains_null + } + // // // ---- Methods for getting row ids from values. @@ -201,7 +206,7 @@ impl RLE { /// Populates the provided destination container with the row ids satisfying /// the provided predicate. - pub fn row_ids_filter(&self, value: String, op: cmp::Operator, dst: RowIDs) -> RowIDs { + pub fn row_ids_filter(&self, value: &str, op: cmp::Operator, dst: RowIDs) -> RowIDs { match op { cmp::Operator::Equal | cmp::Operator::NotEqual => self.row_ids_equal(value, op, dst), cmp::Operator::LT | cmp::Operator::LTE | cmp::Operator::GT | cmp::Operator::GTE => { @@ -211,7 +216,7 @@ impl RLE { } // Finds row ids based on = or != operator. 
- fn row_ids_equal(&self, value: String, op: cmp::Operator, mut dst: RowIDs) -> RowIDs { + fn row_ids_equal(&self, value: &str, op: cmp::Operator, mut dst: RowIDs) -> RowIDs { dst.clear(); let include = match op { cmp::Operator::Equal => true, @@ -219,7 +224,7 @@ impl RLE { _ => unreachable!("invalid operator"), }; - if let Some(encoded_id) = self.entry_index.get(&value) { + if let Some(encoded_id) = self.entry_index.get(value) { let mut index: u32 = 0; for (other_encoded_id, other_rl) in &self.run_lengths { let start = index; @@ -249,11 +254,11 @@ impl RLE { } // Finds row ids based on <, <=, > or >= operator. - fn row_ids_cmp(&self, value: String, op: cmp::Operator, mut dst: RowIDs) -> RowIDs { + fn row_ids_cmp(&self, value: &str, op: cmp::Operator, mut dst: RowIDs) -> RowIDs { dst.clear(); // happy path - the value exists in the column - if let Some(encoded_id) = self.entry_index.get(&value) { + if let Some(encoded_id) = self.entry_index.get(value) { let cmp = match op { cmp::Operator::GT => PartialOrd::gt, cmp::Operator::GTE => PartialOrd::ge, @@ -280,9 +285,9 @@ impl RLE { cmp::Operator::GT | cmp::Operator::GTE => { // find the first decoded value that satisfies the predicate. for (other, other_encoded_id) in &self.entry_index { - if other > &value { + if other.as_str() > value { // change filter from either `x > value` or `x >= value` to `x >= other` - return self.row_ids_cmp(other.clone(), cmp::Operator::GTE, dst); + return self.row_ids_cmp(other, cmp::Operator::GTE, dst); } } } @@ -290,9 +295,9 @@ impl RLE { // find the first decoded value that satisfies the predicate. 
// Note iteration is in reverse for (other, other_encoded_id) in self.entry_index.iter().rev() { - if other < &value { + if other.as_str() < value { // change filter from either `x < value` or `x <= value` to `x <= other` - return self.row_ids_cmp(other.clone(), cmp::Operator::LTE, dst); + return self.row_ids_cmp(other, cmp::Operator::LTE, dst); } } } @@ -858,40 +863,20 @@ mod test { drle.push_none(); // 9 drle.push_additional(Some("south".to_string()), 2); // 10, 11 - let ids = drle.row_ids_filter( - "east".to_string(), - cmp::Operator::Equal, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east", cmp::Operator::Equal, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8])); - let ids = drle.row_ids_filter( - "south".to_string(), - cmp::Operator::Equal, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"south", cmp::Operator::Equal, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![10, 11])); - let ids = drle.row_ids_filter( - "foo".to_string(), - cmp::Operator::Equal, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"foo", cmp::Operator::Equal, RowIDs::Vector(vec![])); assert!(ids.is_empty()); // != some value not in the column should exclude the NULL value. 
- let ids = drle.row_ids_filter( - "foo".to_string(), - cmp::Operator::NotEqual, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"foo", cmp::Operator::NotEqual, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11])); - let ids = drle.row_ids_filter( - "east".to_string(), - cmp::Operator::NotEqual, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east", cmp::Operator::NotEqual, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![3, 10, 11])); } @@ -901,11 +886,7 @@ mod test { drle.push_additional(Some("east".to_string()), 2); drle.push_additional(Some("west".to_string()), 1); - let ids = drle.row_ids_filter( - "abba".to_string(), - cmp::Operator::NotEqual, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"abba", cmp::Operator::NotEqual, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2])); } @@ -921,43 +902,23 @@ mod test { drle.push_none(); // 13 drle.push_additional(Some("west".to_string()), 5); // 14, 15, 16, 17, 18 - let ids = drle.row_ids_filter( - "east".to_string(), - cmp::Operator::LTE, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east", cmp::Operator::LTE, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8])); - let ids = drle.row_ids_filter( - "east".to_string(), - cmp::Operator::LT, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east", cmp::Operator::LT, RowIDs::Vector(vec![])); assert!(ids.is_empty()); - let ids = drle.row_ids_filter( - "north".to_string(), - cmp::Operator::GT, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"north", cmp::Operator::GT, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![9, 10, 11, 14, 15, 16, 17, 18])); - let ids = drle.row_ids_filter( - "north".to_string(), - cmp::Operator::GTE, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"north", cmp::Operator::GTE, RowIDs::Vector(vec![])); assert_eq!( 
ids, RowIDs::Vector(vec![3, 9, 10, 11, 12, 14, 15, 16, 17, 18]) ); // The encoding also supports comparisons on values that don't directly exist in the column. - let ids = drle.row_ids_filter( - "abba".to_string(), - cmp::Operator::GT, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"abba", cmp::Operator::GT, RowIDs::Vector(vec![])); assert_eq!( ids, RowIDs::Vector(vec![ @@ -965,45 +926,25 @@ mod test { ]) ); - let ids = drle.row_ids_filter( - "east1".to_string(), - cmp::Operator::GT, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east1", cmp::Operator::GT, RowIDs::Vector(vec![])); assert_eq!( ids, RowIDs::Vector(vec![3, 9, 10, 11, 12, 14, 15, 16, 17, 18]) ); - let ids = drle.row_ids_filter( - "east1".to_string(), - cmp::Operator::GTE, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east1", cmp::Operator::GTE, RowIDs::Vector(vec![])); assert_eq!( ids, RowIDs::Vector(vec![3, 9, 10, 11, 12, 14, 15, 16, 17, 18]) ); - let ids = drle.row_ids_filter( - "east1".to_string(), - cmp::Operator::LTE, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"east1", cmp::Operator::LTE, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 4, 5, 6, 7, 8])); - let ids = drle.row_ids_filter( - "region".to_string(), - cmp::Operator::LT, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"region", cmp::Operator::LT, RowIDs::Vector(vec![])); assert_eq!(ids, RowIDs::Vector(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 12])); - let ids = drle.row_ids_filter( - "zoo".to_string(), - cmp::Operator::LTE, - RowIDs::Vector(vec![]), - ); + let ids = drle.row_ids_filter(&"zoo", cmp::Operator::LTE, RowIDs::Vector(vec![])); assert_eq!( ids, RowIDs::Vector(vec![ diff --git a/delorean_segment_store/src/column/fixed.rs b/delorean_segment_store/src/column/fixed.rs index da16d110c5..332510a15e 100644 --- a/delorean_segment_store/src/column/fixed.rs +++ b/delorean_segment_store/src/column/fixed.rs @@ -19,7 +19,7 @@ use 
std::ops::AddAssign; use croaring::Bitmap; -use crate::column::cmp; +use crate::column::{cmp, RowIDs}; #[derive(Debug, Default)] /// A Fixed encoding is one in which every value has a fixed width, and is @@ -270,17 +270,17 @@ where /// representation. /// /// Essentially, this supports `value {=, !=, >, >=, <, <=} x`. - pub fn row_ids_filter(&self, value: U, op: cmp::Operator, bm: Bitmap) -> Bitmap + pub fn row_ids_filter(&self, value: U, op: cmp::Operator, dst: RowIDs) -> RowIDs where T: From, { let physical_value = T::from(value); match op { - cmp::Operator::GT => self.row_ids_cmp_order_bm(&physical_value, PartialOrd::gt, bm), - cmp::Operator::GTE => self.row_ids_cmp_order_bm(&physical_value, PartialOrd::ge, bm), - cmp::Operator::LT => self.row_ids_cmp_order_bm(&physical_value, PartialOrd::lt, bm), - cmp::Operator::LTE => self.row_ids_cmp_order_bm(&physical_value, PartialOrd::le, bm), - _ => self.row_ids_equal_bm(&physical_value, op, bm), + cmp::Operator::GT => self.row_ids_cmp_order(&physical_value, PartialOrd::gt, dst), + cmp::Operator::GTE => self.row_ids_cmp_order(&physical_value, PartialOrd::ge, dst), + cmp::Operator::LT => self.row_ids_cmp_order(&physical_value, PartialOrd::lt, dst), + cmp::Operator::LTE => self.row_ids_cmp_order(&physical_value, PartialOrd::le, dst), + _ => self.row_ids_equal(&physical_value, op, dst), } } @@ -298,8 +298,8 @@ where // Handles finding all rows that match the provided operator on `value`. // For performance reasons ranges of matching values are collected up and // added in bulk to the bitmap. 
- fn row_ids_equal_bm(&self, value: &T, op: cmp::Operator, mut bm: Bitmap) -> Bitmap { - bm.clear(); + fn row_ids_equal(&self, value: &T, op: cmp::Operator, mut dst: RowIDs) -> RowIDs { + dst.clear(); let desired; if let cmp::Operator::Equal = op { @@ -314,8 +314,8 @@ where let cmp_result = next == value; if cmp_result != desired && found { - let (min, max) = (i as u64 - count as u64, i as u64); - bm.add_range(min..max); + let (min, max) = (i as u32 - count as u32, i as u32); + dst.add_range(min, max); found = false; count = 0; continue; @@ -332,12 +332,12 @@ where // add any remaining range. if found { let (min, max) = ( - (self.values.len()) as u64 - count as u64, - (self.values.len()) as u64, + (self.values.len()) as u32 - count as u32, + (self.values.len()) as u32, ); - bm.add_range(min..max); + dst.add_range(min, max); } - bm + dst } // Handles finding all rows that match the provided operator on `value`. @@ -346,11 +346,11 @@ where // // `op` is a tuple of comparisons where at least one of them must be // satisfied to satisfy the overall operator. - fn row_ids_cmp_order_bm(&self, value: &T, op: F, mut bm: Bitmap) -> Bitmap + fn row_ids_cmp_order(&self, value: &T, op: F, mut dst: RowIDs) -> RowIDs where F: Fn(&T, &T) -> bool, { - bm.clear(); + dst.clear(); let mut found = false; let mut count = 0; @@ -358,8 +358,8 @@ where let cmp_result = op(next, value); if !cmp_result && found { - let (min, max) = (i as u64 - count as u64, i as u64); - bm.add_range(min..max); + let (min, max) = (i as u32 - count as u32, i as u32); + dst.add_range(min, max); found = false; count = 0; continue; @@ -376,12 +376,12 @@ where // add any remaining range. 
if found { let (min, max) = ( - (self.values.len()) as u64 - count as u64, - (self.values.len()) as u64, + (self.values.len()) as u32 - count as u32, + (self.values.len()) as u32, ); - bm.add_range(min..max); + dst.add_range(min, max); } - bm + dst } /// Returns the set of row ids that satisfy a pair of binary operators @@ -399,8 +399,8 @@ where &self, left: (U, cmp::Operator), right: (U, cmp::Operator), - bm: Bitmap, - ) -> Bitmap + dst: RowIDs, + ) -> RowIDs where T: From, { @@ -415,10 +415,10 @@ where | (cmp::Operator::LT, cmp::Operator::GT) | (cmp::Operator::LT, cmp::Operator::GTE) | (cmp::Operator::LTE, cmp::Operator::GT) - | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order_bm( + | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order( (&left_physical, Self::ord_from_op(&left.1)), (&right_physical, Self::ord_from_op(&right.1)), - bm, + dst, ), (_, _) => panic!("unsupported operators provided"), @@ -442,13 +442,13 @@ where // For performance reasons ranges of matching values are collected up and // added in bulk to the bitmap. // - fn row_ids_cmp_range_order_bm( + fn row_ids_cmp_range_order( &self, left: (&T, (std::cmp::Ordering, std::cmp::Ordering)), right: (&T, (std::cmp::Ordering, std::cmp::Ordering)), - mut bm: Bitmap, - ) -> Bitmap { - bm.clear(); + mut dst: RowIDs, + ) -> RowIDs { + dst.clear(); let left_op = left.1; let right_op = right.1; @@ -465,8 +465,8 @@ where right_cmp_result != Some(right_op.0) && left_cmp_result != Some(right_op.1); if (left_result_no || right_result_no) && found { - let (min, max) = (i as u64 - count as u64, i as u64); - bm.add_range(min..max); + let (min, max) = (i as u32 - count as u32, i as u32); + dst.add_range(min, max); found = false; count = 0; continue; @@ -483,12 +483,12 @@ where // add any remaining range. 
if found { let (min, max) = ( - (self.values.len()) as u64 - count as u64, - (self.values.len()) as u64, + (self.values.len()) as u32 - count as u32, + (self.values.len()) as u32, ); - bm.add_range(min..max); + dst.add_range(min, max); } - bm + dst } } @@ -665,17 +665,17 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = v.row_ids_filter(100, Operator::Equal, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 2, 12]); + let dst = v.row_ids_filter(100, Operator::Equal, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![0, 2, 12]); - let bm = v.row_ids_filter(101, Operator::Equal, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![1, 8]); + let dst = v.row_ids_filter(101, Operator::Equal, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![1, 8]); - let bm = v.row_ids_filter(2030, Operator::Equal, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![6]); + let dst = v.row_ids_filter(2030, Operator::Equal, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![6]); - let bm = v.row_ids_filter(194, Operator::Equal, Bitmap::create()); - assert!(bm.is_empty()); + let dst = v.row_ids_filter(194, Operator::Equal, RowIDs::new_vector()); + assert!(dst.is_empty()); } #[test] @@ -683,17 +683,23 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = v.row_ids_filter(100, Operator::NotEqual, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![1, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + let dst = v.row_ids_filter(100, Operator::NotEqual, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![1, 3, 4, 5, 6, 7, 8, 9, 10, 11]); - let bm = v.row_ids_filter(101, Operator::NotEqual, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]); + let dst = v.row_ids_filter(101, Operator::NotEqual, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + 
&vec![0, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12] + ); - let bm = v.row_ids_filter(2030, Operator::NotEqual, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12]); + let dst = v.row_ids_filter(2030, Operator::NotEqual, RowIDs::new_vector()); + assert_eq!( + dst.unwrap_vector(), + &vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12] + ); - let bm = v.row_ids_filter(194, Operator::NotEqual, Bitmap::create()); - assert_eq!(bm.to_vec(), (0..13).collect::>()); + let dst = v.row_ids_filter(194, Operator::NotEqual, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &(0..13).collect::>()); } #[test] @@ -701,11 +707,11 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = v.row_ids_filter(100, Operator::LT, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![7, 9, 10, 11]); + let dst = v.row_ids_filter(100, Operator::LT, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![7, 9, 10, 11]); - let bm = v.row_ids_filter(3, Operator::LT, Bitmap::create()); - assert_eq!(bm.to_vec(), Vec::::new()); + let dst = v.row_ids_filter(3, Operator::LT, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &Vec::::new()); } #[test] @@ -713,11 +719,11 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = v.row_ids_filter(100, Operator::LTE, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 2, 7, 9, 10, 11, 12]); + let dst = v.row_ids_filter(100, Operator::LTE, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![0, 2, 7, 9, 10, 11, 12]); - let bm = v.row_ids_filter(2, Operator::LTE, Bitmap::create()); - assert!(bm.is_empty()); + let dst = v.row_ids_filter(2, Operator::LTE, RowIDs::new_vector()); + assert!(dst.is_empty()); } #[test] @@ -725,11 +731,11 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; 
- let bm = v.row_ids_filter(100, Operator::GT, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![1, 3, 4, 5, 6, 8]); + let dst = v.row_ids_filter(100, Operator::GT, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![1, 3, 4, 5, 6, 8]); - let bm = v.row_ids_filter(2030, Operator::GT, Bitmap::create()); - assert!(bm.is_empty()); + let dst = v.row_ids_filter(2030, Operator::GT, RowIDs::new_vector()); + assert!(dst.is_empty()); } #[test] @@ -737,11 +743,11 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = v.row_ids_filter(100, Operator::GTE, Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 1, 2, 3, 4, 5, 6, 8, 12]); + let dst = v.row_ids_filter(100, Operator::GTE, RowIDs::new_vector()); + assert_eq!(dst.unwrap_vector(), &vec![0, 1, 2, 3, 4, 5, 6, 8, 12]); - let bm = v.row_ids_filter(2031, Operator::GTE, Bitmap::create()); - assert!(bm.is_empty()); + let dst = v.row_ids_filter(2031, Operator::GTE, RowIDs::new_vector()); + assert!(dst.is_empty()); } #[test] @@ -749,24 +755,39 @@ mod test { let mut v: Fixed = Fixed::default(); v.values = vec![100, 101, 100, 102, 1000, 300, 2030, 3, 101, 4, 5, 21, 100]; - let bm = - v.row_ids_filter_range((100, Operator::GTE), (240, Operator::LT), Bitmap::create()); - assert_eq!(bm.to_vec(), vec![0, 1, 2, 3, 8, 12]); + let dst = v.row_ids_filter_range( + (100, Operator::GTE), + (240, Operator::LT), + RowIDs::new_vector(), + ); + assert_eq!(dst.unwrap_vector(), &vec![0, 1, 2, 3, 8, 12]); - let bm = v.row_ids_filter_range((100, Operator::GT), (240, Operator::LT), Bitmap::create()); - assert_eq!(bm.to_vec(), vec![1, 3, 8]); + let dst = v.row_ids_filter_range( + (100, Operator::GT), + (240, Operator::LT), + RowIDs::new_vector(), + ); + assert_eq!(dst.unwrap_vector(), &vec![1, 3, 8]); - let bm = v.row_ids_filter_range((10, Operator::LT), (-100, Operator::GT), Bitmap::create()); - assert_eq!(bm.to_vec(), vec![7, 9, 10]); + let dst = 
v.row_ids_filter_range( + (10, Operator::LT), + (-100, Operator::GT), + RowIDs::new_vector(), + ); + assert_eq!(dst.unwrap_vector(), &vec![7, 9, 10]); - let bm = v.row_ids_filter_range((21, Operator::GTE), (21, Operator::LTE), Bitmap::create()); - assert_eq!(bm.to_vec(), vec![11]); + let dst = v.row_ids_filter_range( + (21, Operator::GTE), + (21, Operator::LTE), + RowIDs::new_vector(), + ); + assert_eq!(dst.unwrap_vector(), &vec![11]); - let bm = v.row_ids_filter_range( + let dst = v.row_ids_filter_range( (10000, Operator::LTE), (3999, Operator::GT), - Bitmap::create(), + RowIDs::new_bitmap(), ); - assert!(bm.is_empty()); + assert!(dst.is_empty()); } } diff --git a/delorean_segment_store/src/column/fixed_null.rs b/delorean_segment_store/src/column/fixed_null.rs index 3e341647c5..dac6da76b8 100644 --- a/delorean_segment_store/src/column/fixed_null.rs +++ b/delorean_segment_store/src/column/fixed_null.rs @@ -63,6 +63,10 @@ where self.arr.is_empty() } + pub fn contains_null(&self) -> bool { + self.arr.null_count() == 0 + } + /// Returns the total size in bytes of the encoded data. Note, this method /// is really an "accurate" estimation. It doesn't include for example the /// size of the `Plain` struct receiver. From 93dde00d04fd7dd8884893cc926c1b370978c3e3 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Oct 2020 12:36:28 +0000 Subject: [PATCH 2/6] refactor: change filter API --- delorean_segment_store/src/column/fixed.rs | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/delorean_segment_store/src/column/fixed.rs b/delorean_segment_store/src/column/fixed.rs index 332510a15e..5ec64b9bee 100644 --- a/delorean_segment_store/src/column/fixed.rs +++ b/delorean_segment_store/src/column/fixed.rs @@ -264,23 +264,17 @@ where // // - /// Returns the set of row ids that satisfy a binary operator on a logical - /// value. 
Note, it is the caller's responsibility to ensure the value - /// provided can be correctly converted from the logical to physical - /// representation. + /// Returns the set of row ids that satisfy a binary operator on a physical + /// value. /// /// Essentially, this supports `value {=, !=, >, >=, <, <=} x`. - pub fn row_ids_filter(&self, value: U, op: cmp::Operator, dst: RowIDs) -> RowIDs - where - T: From, - { - let physical_value = T::from(value); + pub fn row_ids_filter(&self, value: T, op: cmp::Operator, dst: RowIDs) -> RowIDs { match op { - cmp::Operator::GT => self.row_ids_cmp_order(&physical_value, PartialOrd::gt, dst), - cmp::Operator::GTE => self.row_ids_cmp_order(&physical_value, PartialOrd::ge, dst), - cmp::Operator::LT => self.row_ids_cmp_order(&physical_value, PartialOrd::lt, dst), - cmp::Operator::LTE => self.row_ids_cmp_order(&physical_value, PartialOrd::le, dst), - _ => self.row_ids_equal(&physical_value, op, dst), + cmp::Operator::GT => self.row_ids_cmp_order(&value, PartialOrd::gt, dst), + cmp::Operator::GTE => self.row_ids_cmp_order(&value, PartialOrd::ge, dst), + cmp::Operator::LT => self.row_ids_cmp_order(&value, PartialOrd::lt, dst), + cmp::Operator::LTE => self.row_ids_cmp_order(&value, PartialOrd::le, dst), + _ => self.row_ids_equal(&value, op, dst), } } From 34ba183a3ee4878bfc73aef98fecab4dc6dac42d Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 30 Oct 2020 15:24:17 +0000 Subject: [PATCH 3/6] feat: implement getting row ids from predicate --- delorean_segment_store/src/column.rs | 633 ++++++++++++++++++++- delorean_segment_store/src/column/cmp.rs | 1 + delorean_segment_store/src/column/fixed.rs | 2 - 3 files changed, 629 insertions(+), 7 deletions(-) diff --git a/delorean_segment_store/src/column.rs b/delorean_segment_store/src/column.rs index 20f8784374..fd7ada83a2 100644 --- a/delorean_segment_store/src/column.rs +++ b/delorean_segment_store/src/column.rs @@ -4,6 +4,7 @@ pub mod fixed; pub mod fixed_null; use 
std::collections::BTreeSet; +use std::convert::TryFrom; use croaring::Bitmap; @@ -180,8 +181,54 @@ impl Column { // /// Determine the set of row ids that satisfy the predicate. + /// + /// TODO(edd): row ids pooling. pub fn row_ids_filter(&self, op: cmp::Operator, value: Value<'_>) -> RowIDsOption { - todo!() + match op { + // When the predicate is == and the metadata range indicates the column + // can't contain `value` then the column doesn't need to be read. + cmp::Operator::Equal => { + if !self.might_contain_value(&value) { + return RowIDsOption::None; // no rows are going to match. + } + } + + // When the predicate is one of {<, <=, >, >=} and the column doesn't + // contain any null values, and the entire range of values satisfies the + // predicate then the column doesn't need to be read. + cmp::Operator::GT | cmp::Operator::GTE | cmp::Operator::LT | cmp::Operator::LTE => { + if self.predicate_matches_all_values(&op, &value) { + return RowIDsOption::All; + } + } + + // When the predicate is != and the metadata range indicates that the + // column can't possibly contain `value` then the predicate must + // match all rows on the column. + cmp::Operator::NotEqual => { + if !self.might_contain_value(&value) { + return RowIDsOption::All; // all rows are going to match. + } + } + } + + // TODO(edd): figure out pooling of these + let dst = RowIDs::Bitmap(Bitmap::create()); + + // Check the column for all rows that satisfy the predicate. 
+ let row_ids = match &self { + Column::String(_, data) => data.row_ids_filter(op, value.string().as_str(), dst), + Column::Float(_, data) => data.row_ids_filter(op, value.scalar(), dst), + Column::Integer(_, data) => data.row_ids_filter(op, value.scalar(), dst), + Column::Unsigned(_, data) => data.row_ids_filter(op, value.scalar(), dst), + Column::Bool => todo!(), + Column::ByteArray(_, data) => todo!(), + }; + + if row_ids.is_empty() { + return RowIDsOption::None; + } + RowIDsOption::Some(row_ids) } /// Determine the set of row ids that satisfy both of the predicates. @@ -196,6 +243,102 @@ impl Column { todo!() } + // Helper method to determine if the column possibly contains this value + fn might_contain_value(&self, value: &Value<'_>) -> bool { + match &self { + Column::String(meta, _) => { + if let Value::String(other) = value { + meta.might_contain_value(&other) + } else { + unreachable!("impossible value comparison"); + } + } + // breaking this down: + // * Extract a Scalar variant from `value`, which should panic if + // that's not possible; + // * Try to safely convert that scalar to a primitive value based + // on the logical type used for the metadata on the column. + // * If the value can't be safely converted then there is no way + // that said value could be stored in the column at all -> false. + // * Otherwise if the value falls inside the range of values in + // the column range then it may be in the column -> true. + Column::Float(meta, _) => value + .scalar() + .try_as_f64() + .map_or_else(|| false, |v| meta.might_contain_value(&v)), + Column::Integer(meta, _) => value + .scalar() + .try_as_i64() + .map_or_else(|| false, |v| meta.might_contain_value(&v)), + Column::Unsigned(meta, _) => value + .scalar() + .try_as_u64() + .map_or_else(|| false, |v| meta.might_contain_value(&v)), + Column::Bool => todo!(), + Column::ByteArray(meta, _) => todo!(), + } + } + + // Helper method to determine if the predicate matches all the values in + // the column. 
+ fn predicate_matches_all_values(&self, op: &cmp::Operator, value: &Value<'_>) -> bool { + match &self { + Column::String(meta, data) => { + if data.contains_null() { + false + } else if let Value::String(other) = value { + meta.might_match_all_values(op, other) + } else { + unreachable!("impossible value comparison"); + } + } + // breaking this down: + // * If the column contains null values then it's not possible for + // all values in the column to match the predicate. + // * Extract a Scalar variant from `value`, which should panic if + // that's not possible; + // * Try to safely convert that scalar to a primitive value based + // on the logical type used for the metadata on the column. + // * If the value can't be safely converted then -> false. + // * Otherwise if the value falls inside the range of values in + // the column range then check if all values satisfy the + // predicate. + // + Column::Float(meta, data) => { + if data.contains_null() { + return false; + } + + value + .scalar() + .try_as_f64() + .map_or_else(|| false, |v| meta.might_match_all_values(op, &v)) + } + Column::Integer(meta, data) => { + if data.contains_null() { + return false; + } + + value + .scalar() + .try_as_i64() + .map_or_else(|| false, |v| meta.might_match_all_values(op, &v)) + } + Column::Unsigned(meta, data) => { + if data.contains_null() { + return false; + } + + value + .scalar() + .try_as_u64() + .map_or_else(|| false, |v| meta.might_match_all_values(op, &v)) + } + Column::Bool => todo!(), + Column::ByteArray(meta, _) => todo!(), + } + } + // // Methods for selecting // @@ -242,7 +385,10 @@ impl Column { #[derive(Default, Debug, PartialEq)] // The meta-data for a column -pub struct MetaData { +pub struct MetaData +where + T: PartialOrd + std::fmt::Debug, +{ // The total size of the column in bytes. 
size: u64, @@ -253,6 +399,38 @@ pub struct MetaData { range: Option<(T, T)>, } +impl MetaData { + fn might_contain_value(&self, v: &T) -> bool { + match &self.range { + Some(range) => &range.0 <= v && v <= &range.1, + None => false, + } + } + + // Determines if it's possible that predicate would match all rows in the + // column. It is up to the caller to determine if the column contains null + // values, which would invalidate a truthful result. + fn might_match_all_values(&self, op: &cmp::Operator, v: &T) -> bool { + println!("comparing {:?} {:?}, {:?}", op, v, self.range); + match &self.range { + Some(range) => match op { + // all values in column equal to v + cmp::Operator::Equal => range.0 == range.1 && &range.1 == v, + // all values larger or smaller than v so can't contain v + cmp::Operator::NotEqual => v < &range.0 || v > &range.1, + // all values in column > v + cmp::Operator::GT => &range.0 > v, + // all values in column >= v + cmp::Operator::GTE => &range.0 >= v, + // all values in column < v + cmp::Operator::LT => &range.1 < v, + // all values in column <= v + cmp::Operator::LTE => &range.1 <= v, + }, + None => false, // only null values in column. + } + } +} pub enum StringEncoding { RLE(dictionary::RLE), // TODO - simple array encoding, e.g., via Arrow String array. @@ -261,6 +439,13 @@ pub enum StringEncoding { /// This implementation is concerned with how to produce string columns with /// different encodings. impl StringEncoding { + /// Determines if the column contains a NULL value. + pub fn contains_null(&self) -> bool { + match &self { + Self::RLE(c) => c.contains_null(), + } + } + /// Returns the logical value found at the provided row id. pub fn value(&self, row_id: u32) -> Value<'_> { match &self { @@ -289,6 +474,13 @@ impl StringEncoding { } } + /// Returns the row ids that satisfy the provided predicate. 
+ pub fn row_ids_filter(&self, op: cmp::Operator, value: &str, dst: RowIDs) -> RowIDs { + match &self { + Self::RLE(c) => c.row_ids_filter(value, op, dst), + } + } + fn from_arrow_string_array(arr: arrow::array::StringArray) -> Self { // // TODO(edd): potentially switch on things like cardinality in the input @@ -513,6 +705,14 @@ pub enum IntegerEncoding { } impl IntegerEncoding { + /// Determines if the column contains a NULL value. + pub fn contains_null(&self) -> bool { + if let Self::I64I64N(c) = &self { + return c.contains_null(); + } + false + } + /// Returns the logical value found at the provided row id. pub fn value(&self, row_id: u32) -> Value<'_> { match &self { @@ -641,6 +841,8 @@ impl IntegerEncoding { } } + /// All encoded values for the column. For `IntegerEncoding` this is + /// typically equivalent to `all_values`. pub fn all_encoded_values(&self, dst: EncodedValues) -> EncodedValues { // Right now the use-case for encoded values on non-string columns is // that it's used for grouping with timestamp columns, which should be @@ -659,6 +861,43 @@ impl IntegerEncoding { _ => unreachable!("currently only support encoded values as i64"), } } + + /// Returns the row ids that satisfy the provided predicate. + /// + /// Note: it is the caller's responsibility to ensure that the provided + /// `Scalar` value will fit within the physical type of the encoded column. + /// `row_ids_filter` will panic if this invariant is broken. 
+ pub fn row_ids_filter(&self, op: cmp::Operator, value: &Scalar, dst: RowIDs) -> RowIDs { + match &self { + IntegerEncoding::I64I64(c) => c.row_ids_filter(value.as_i64(), op, dst), + IntegerEncoding::I64I32(c) => c.row_ids_filter(value.as_i32(), op, dst), + IntegerEncoding::I64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + IntegerEncoding::I64I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + IntegerEncoding::I64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + IntegerEncoding::I64I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + IntegerEncoding::I64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::I32I32(c) => c.row_ids_filter(value.as_i32(), op, dst), + IntegerEncoding::I32I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + IntegerEncoding::I32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + IntegerEncoding::I32I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + IntegerEncoding::I32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::I16I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + IntegerEncoding::I16I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + IntegerEncoding::I16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::I8I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + IntegerEncoding::U64U64(c) => c.row_ids_filter(value.as_u64(), op, dst), + IntegerEncoding::U64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + IntegerEncoding::U64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + IntegerEncoding::U64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::U32U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + IntegerEncoding::U32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + IntegerEncoding::U32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::U16U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + IntegerEncoding::U16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::U8U8(c) => 
c.row_ids_filter(value.as_u8(), op, dst), + IntegerEncoding::I64I64N(c) => todo!(), + } + } } pub enum FloatEncoding { @@ -668,6 +907,13 @@ pub enum FloatEncoding { } impl FloatEncoding { + /// Determines if the column contains a NULL value. + pub fn contains_null(&self) -> bool { + // TODO(edd): when adding the nullable columns then ask the nullable + // encoding if it has any null values. + false + } + /// Returns the logical value found at the provided row id. pub fn value(&self, row_id: u32) -> Value<'_> { match &self { @@ -687,6 +933,18 @@ impl FloatEncoding { Self::Fixed32(c) => Values::F32(Float32Array::from(c.values::(row_ids, vec![]))), } } + + /// Returns the row ids that satisfy the provided predicate. + /// + /// Note: it is the caller's responsibility to ensure that the provided + /// `Scalar` value will fit within the physical type of the encoded column. + /// `row_ids_filter` will panic if this invariant is broken. + pub fn row_ids_filter(&self, op: cmp::Operator, value: &Scalar, dst: RowIDs) -> RowIDs { + match &self { + FloatEncoding::Fixed64(c) => c.row_ids_filter(value.as_f64(), op, dst), + FloatEncoding::Fixed32(c) => c.row_ids_filter(value.as_f32(), op, dst), + } + } } // Converts an Arrow `StringArray` into a column, currently using the RLE @@ -1254,7 +1512,6 @@ pub enum AggregateResult<'a> { #[derive(Debug, PartialEq)] pub enum Scalar { - // TODO(edd): flesh out more logical types. I64(i64), I32(i32), I16(i16), @@ -1269,6 +1526,86 @@ pub enum Scalar { F32(f32), } +macro_rules! 
typed_scalar_converters { + ($(($name:ident, $try_name:ident, $type:ident),)*) => { + $( + fn $name(&self) -> $type { + match &self { + Self::I64(v) => $type::try_from(*v).unwrap(), + Self::I32(v) => $type::try_from(*v).unwrap(), + Self::I16(v) => $type::try_from(*v).unwrap(), + Self::I8(v) => $type::try_from(*v).unwrap(), + Self::U64(v) => $type::try_from(*v).unwrap(), + Self::U32(v) => $type::try_from(*v).unwrap(), + Self::U16(v) => $type::try_from(*v).unwrap(), + Self::U8(v) => $type::try_from(*v).unwrap(), + Self::F64(v) => panic!("cannot convert Self::F64"), + Self::F32(v) => panic!("cannot convert Scalar::F32"), + } + } + + fn $try_name(&self) -> Option<$type> { + match &self { + Self::I64(v) => $type::try_from(*v).ok(), + Self::I32(v) => $type::try_from(*v).ok(), + Self::I16(v) => $type::try_from(*v).ok(), + Self::I8(v) => $type::try_from(*v).ok(), + Self::U64(v) => $type::try_from(*v).ok(), + Self::U32(v) => $type::try_from(*v).ok(), + Self::U16(v) => $type::try_from(*v).ok(), + Self::U8(v) => $type::try_from(*v).ok(), + Self::F64(v) => panic!("cannot convert Self::F64"), + Self::F32(v) => panic!("cannot convert Scalar::F32"), + } + } + )* + }; +} + +impl Scalar { + // Implementations of all the integer accessors for the variants of `Scalar`. + typed_scalar_converters! 
{ + (as_i64, try_as_i64, i64), + (as_i32, try_as_i32, i32), + (as_i16, try_as_i16, i16), + (as_i8, try_as_i8, i8), + (as_u64, try_as_u64, u64), + (as_u32, try_as_u32, u32), + (as_u16, try_as_u16, u16), + (as_u8, try_as_u8, u8), + } + + fn as_f32(&self) -> f32 { + if let Scalar::F32(v) = &self { + return *v; + } + panic!("cannot convert Self to f32"); + } + + fn try_as_f32(&self) -> Option { + if let Scalar::F32(v) = &self { + return Some(*v); + } + None + } + + fn as_f64(&self) -> f64 { + match &self { + Scalar::F64(v) => *v, + Scalar::F32(v) => f64::from(*v), + _ => unimplemented!("converting integer Scalar to f64 unsupported"), + } + } + + fn try_as_f64(&self) -> Option { + match &self { + Scalar::F64(v) => Some(*v), + Scalar::F32(v) => Some(f64::from(*v)), + _ => unimplemented!("converting integer Scalar to f64 unsupported"), + } + } +} + /// Each variant is a possible value type that can be returned from a column. #[derive(Debug, PartialEq)] pub enum Value<'a> { @@ -1276,7 +1613,7 @@ pub enum Value<'a> { Null, // A UTF-8 valid string. - String(&'a str), + String(&'a String), // An arbitrary byte array. ByteArray(&'a [u8]), @@ -1288,6 +1625,22 @@ pub enum Value<'a> { Scalar(Scalar), } +impl Value<'_> { + fn scalar(&self) -> &Scalar { + if let Self::Scalar(s) = self { + return s; + } + panic!("cannot unwrap Value to Scalar"); + } + + fn string(&self) -> &String { + if let Self::String(s) = self { + return s; + } + panic!("cannot unwrap Value to String"); + } +} + /// Each variant is a typed vector of materialised values for a column. NULL /// values are represented as None #[derive(Debug, PartialEq)] @@ -1363,6 +1716,7 @@ impl EncodedValues { /// A specific type of Option for `RowIDs` where the notion of all rows ids is /// represented. +#[derive(Debug)] pub enum RowIDsOption { None, Some(RowIDs), @@ -1372,6 +1726,16 @@ pub enum RowIDsOption { All, } +impl RowIDsOption { + /// Returns the `Some` variant or panics. 
+ pub fn unwrap(&self) -> &RowIDs { + if let Self::Some(ids) = self { + return ids; + } + panic!("cannot unwrap RowIDsOption to RowIDs"); + } +} + /// Represents vectors of row IDs, which are usually used for intermediate /// results as a method of late materialisation. #[derive(PartialEq, Debug)] @@ -1381,6 +1745,37 @@ pub enum RowIDs { } impl RowIDs { + pub fn new_bitmap() -> Self { + Self::Bitmap(Bitmap::create()) + } + + pub fn new_vector() -> Self { + Self::Vector(vec![]) + } + + pub fn unwrap_bitmap(&self) -> &Bitmap { + if let Self::Bitmap(bm) = self { + return bm; + } + panic!("cannot unwrap RowIDs to Bitmap"); + } + + pub fn unwrap_vector(&self) -> &Vec { + if let Self::Vector(arr) = self { + return arr; + } + panic!("cannot unwrap RowIDs to Vector"); + } + + // Converts the RowIDs to a Vec. This is expensive and should only be + // used for testing. + pub fn to_vec(&self) -> Vec { + match self { + RowIDs::Bitmap(bm) => bm.to_vec(), + RowIDs::Vector(arr) => arr.clone(), + } + } + pub fn len(&self) -> usize { match self { RowIDs::Bitmap(ids) => ids.cardinality() as usize, @@ -1767,7 +2162,7 @@ mod test { assert_eq!(col.value(0), Value::Scalar(Scalar::F64(-19.2))); let col = Column::from(&[Some("a"), Some("b"), None, Some("c")][..]); - assert_eq!(col.value(1), Value::String("b")); + assert_eq!(col.value(1), Value::String(&"b".to_owned())); assert_eq!(col.value(2), Value::Null); } @@ -1943,4 +2338,232 @@ mod test { ]) ); } + + #[test] + fn row_ids_filter_str() { + let input = &[ + Some("Badlands"), + None, + Some("Racing in the Street"), + Some("Streets of Fire"), + None, + None, + Some("Darkness on the Edge of Town"), + ]; + + let col = Column::from(&input[..]); + let mut row_ids = + col.row_ids_filter(cmp::Operator::Equal, Value::String(&"Badlands".to_string())); + assert_eq!(row_ids.unwrap().to_vec(), vec![0]); + + row_ids = col.row_ids_filter(cmp::Operator::Equal, Value::String(&"Factory".to_string())); + assert!(matches!(row_ids, RowIDsOption::None)); + 
+ row_ids = col.row_ids_filter( + cmp::Operator::GT, + Value::String(&"Adam Raised a Cain".to_string()), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 2, 3, 6]); + + row_ids = col.row_ids_filter( + cmp::Operator::LTE, + Value::String(&"Streets of Fire".to_string()), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 2, 3, 6]); + + row_ids = col.row_ids_filter( + cmp::Operator::LT, + Value::String(&"Something in the Night".to_string()), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 2, 6]); + + // when the column doesn't contain any NULL values the `All` variant + // might be returned. + let input = &[ + Some("Badlands"), + Some("Racing in the Street"), + Some("Streets of Fire"), + Some("Darkness on the Edge of Town"), + ]; + + let col = Column::from(&input[..]); + row_ids = col.row_ids_filter( + cmp::Operator::NotEqual, + Value::String(&"Adam Raised a Cain".to_string()), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + } + + #[test] + fn row_ids_filter_int() { + let input = &[100, 200, 300, 2, 200, 22, 30]; + + let col = Column::from(&input[..]); + let mut row_ids = col.row_ids_filter(cmp::Operator::Equal, Value::Scalar(Scalar::I32(200))); + assert_eq!(row_ids.unwrap().to_vec(), vec![1, 4]); + + row_ids = col.row_ids_filter(cmp::Operator::Equal, Value::Scalar(Scalar::I32(2000))); + assert!(matches!(row_ids, RowIDsOption::None)); + + row_ids = col.row_ids_filter(cmp::Operator::GT, Value::Scalar(Scalar::I32(2))); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 1, 2, 4, 5, 6]); + + row_ids = col.row_ids_filter(cmp::Operator::GTE, Value::Scalar(Scalar::I32(2))); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter(cmp::Operator::NotEqual, Value::Scalar(Scalar::I32(-1257))); + assert!(matches!(row_ids, RowIDsOption::All)); + } + + #[test] + fn row_ids_filter_uint() { + let input = &[100_u32, 200, 300, 2, 200, 22, 30]; + + let col = Column::from(&input[..]); + let mut row_ids = col.row_ids_filter(cmp::Operator::Equal, 
Value::Scalar(Scalar::I32(200))); + assert_eq!(row_ids.unwrap().to_vec(), vec![1, 4]); + + row_ids = col.row_ids_filter(cmp::Operator::Equal, Value::Scalar(Scalar::U16(2000))); + assert!(matches!(row_ids, RowIDsOption::None)); + + row_ids = col.row_ids_filter(cmp::Operator::GT, Value::Scalar(Scalar::U32(2))); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 1, 2, 4, 5, 6]); + + row_ids = col.row_ids_filter(cmp::Operator::GTE, Value::Scalar(Scalar::U64(2))); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter(cmp::Operator::NotEqual, Value::Scalar(Scalar::I32(-1257))); + assert!(matches!(row_ids, RowIDsOption::All)); + } + + #[test] + fn row_ids_filter_float() { + let input = &[100.2, 200.0, 300.1, 2.22, -200.2, 22.2, 30.2]; + + let col = Column::from(&input[..]); + let mut row_ids = + col.row_ids_filter(cmp::Operator::Equal, Value::Scalar(Scalar::F32(200.0))); + assert_eq!(row_ids.unwrap().to_vec(), vec![1]); + + row_ids = col.row_ids_filter(cmp::Operator::Equal, Value::Scalar(Scalar::F64(2000.0))); + assert!(matches!(row_ids, RowIDsOption::None)); + + row_ids = col.row_ids_filter(cmp::Operator::GT, Value::Scalar(Scalar::F64(-200.0))); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 1, 2, 3, 5, 6]); + + row_ids = col.row_ids_filter(cmp::Operator::GTE, Value::Scalar(Scalar::F64(-200.2))); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter( + cmp::Operator::NotEqual, + Value::Scalar(Scalar::F32(-1257.029)), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + } + + #[test] + fn might_contain_value() { + let input = &[100i64, 200, 300, 2, 200, 22, 30]; + let col = Column::from(&input[..]); + + let cases: Vec<(Scalar, bool)> = vec![ + (Scalar::U64(200), true), + (Scalar::U32(200), true), + (Scalar::U8(200), true), + (Scalar::I64(100), true), + (Scalar::I32(30), true), + (Scalar::I8(2), true), + (Scalar::U64(100000000), false), + (Scalar::I64(-1), false), + (Scalar::U64(u64::MAX), false), + ]; + + for 
(scalar, result) in cases { + assert_eq!(col.might_contain_value(&Value::Scalar(scalar)), result); + } + + let input = &[100i16, 200, 300, 2, 200, 22, 30]; + let col = Column::from(&input[..]); + + let cases: Vec<(Scalar, bool)> = vec![ + (Scalar::U64(200), true), + (Scalar::U16(200), true), + (Scalar::U8(200), true), + (Scalar::I64(100), true), + (Scalar::I32(30), true), + (Scalar::I8(2), true), + (Scalar::U64(100000000), false), + (Scalar::I64(-1), false), + (Scalar::U64(u64::MAX), false), + ]; + + for (scalar, result) in cases { + assert_eq!(col.might_contain_value(&Value::Scalar(scalar)), result); + } + + let input = &[100u64, 200, 300, 2, 200, 22, 30]; + let col = Column::from(&input[..]); + + let cases: Vec<(Scalar, bool)> = vec![ + (Scalar::U64(200), true), + (Scalar::U32(200), true), + (Scalar::U8(200), true), + (Scalar::I64(100), true), + (Scalar::I32(30), true), + (Scalar::I8(2), true), + (Scalar::U64(100000000), false), + (Scalar::I64(-1), false), + ]; + + for (scalar, result) in cases { + assert_eq!(col.might_contain_value(&Value::Scalar(scalar)), result); + } + + let input = &[100.0, 200.2, 300.2]; + let col = Column::from(&input[..]); + + let cases: Vec<(Scalar, bool)> = + vec![(Scalar::F64(100.0), true), (Scalar::F32(100.0), true)]; + + for (scalar, result) in cases { + assert_eq!(col.might_contain_value(&Value::Scalar(scalar)), result); + } + } + + #[test] + fn predicate_matches_all_values() { + let input = &[100i64, 200, 300, 2, 200, 22, 30]; + let col = Column::from(&input[..]); + + let cases: Vec<(cmp::Operator, Scalar, bool)> = vec![ + (cmp::Operator::GT, Scalar::U64(100), false), + (cmp::Operator::GT, Scalar::I64(100), false), + (cmp::Operator::GT, Scalar::I8(-99), true), + (cmp::Operator::GT, Scalar::I64(100), false), + (cmp::Operator::LT, Scalar::I64(300), false), + (cmp::Operator::LTE, Scalar::I32(300), true), + (cmp::Operator::Equal, Scalar::I32(2), false), + (cmp::Operator::NotEqual, Scalar::I32(2), false), + (cmp::Operator::NotEqual, 
Scalar::I64(1), true), + (cmp::Operator::NotEqual, Scalar::I64(301), true), + ]; + + for (op, scalar, result) in cases { + assert_eq!( + col.predicate_matches_all_values(&op, &Value::Scalar(scalar)), + result + ); + } + + // Future improvement would be to support this type of check. + let input = &[100i8, -20]; + let col = Column::from(&input[..]); + assert_eq!( + col.predicate_matches_all_values( + &cmp::Operator::LT, + &Value::Scalar(Scalar::U64(u64::MAX)) + ), + false + ); + } } diff --git a/delorean_segment_store/src/column/cmp.rs b/delorean_segment_store/src/column/cmp.rs index 96233e7989..77f9392ab9 100644 --- a/delorean_segment_store/src/column/cmp.rs +++ b/delorean_segment_store/src/column/cmp.rs @@ -1,4 +1,5 @@ /// Possible comparison operators +#[derive(Debug)] pub enum Operator { Equal, NotEqual, diff --git a/delorean_segment_store/src/column/fixed.rs b/delorean_segment_store/src/column/fixed.rs index 5ec64b9bee..4d9d5ec33b 100644 --- a/delorean_segment_store/src/column/fixed.rs +++ b/delorean_segment_store/src/column/fixed.rs @@ -17,8 +17,6 @@ use std::fmt::{Debug, Display}; use std::mem::size_of; use std::ops::AddAssign; -use croaring::Bitmap; - use crate::column::{cmp, RowIDs}; #[derive(Debug, Default)] From 952959cd5fb142db7d89514dcf1a4b68b9d5440d Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 30 Oct 2020 17:10:40 +0000 Subject: [PATCH 4/6] fix: fix bug with range predicate method --- delorean_segment_store/src/column/fixed.rs | 63 +++++++++++++------ .../src/column/fixed_null.rs | 20 +++++- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/delorean_segment_store/src/column/fixed.rs b/delorean_segment_store/src/column/fixed.rs index 4d9d5ec33b..ee77bf3692 100644 --- a/delorean_segment_store/src/column/fixed.rs +++ b/delorean_segment_store/src/column/fixed.rs @@ -30,7 +30,7 @@ use crate::column::{cmp, RowIDs}; /// pub struct Fixed where - T: PartialOrd, + T: PartialOrd + std::fmt::Debug, { // backing data values: Vec, @@ -42,7 
+42,7 @@ where impl std::fmt::Display for Fixed where - T: Display + PartialOrd + Copy, + T: std::fmt::Debug + Display + PartialOrd + Copy, { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -56,7 +56,7 @@ where impl Fixed where - T: PartialOrd + Copy, + T: std::fmt::Debug + PartialOrd + Copy, { pub fn num_rows(&self) -> u32 { self.values.len() as u32 @@ -377,28 +377,19 @@ where } /// Returns the set of row ids that satisfy a pair of binary operators - /// against two values of the same logical type. - /// - /// Note, it is the caller's responsibility to provide values that can - /// safely be converted from the logical type `U` to the physical type `T`. + /// against two values of the same physical type. /// /// This method is a special case optimisation for common cases where one /// wishes to do the equivalent of WHERE x > y AND x <= y` for example. /// /// Essentially, this supports: /// `x {>, >=, <, <=} value1 AND x {>, >=, <, <=} value2`. - pub fn row_ids_filter_range( + pub fn row_ids_filter_range( &self, - left: (U, cmp::Operator), - right: (U, cmp::Operator), + left: (T, cmp::Operator), + right: (T, cmp::Operator), dst: RowIDs, - ) -> RowIDs - where - T: From, - { - let left_physical = T::from(left.0); - let right_physical = T::from(right.0); - + ) -> RowIDs { match (&left.1, &right.1) { (cmp::Operator::GT, cmp::Operator::LT) | (cmp::Operator::GT, cmp::Operator::LTE) @@ -408,8 +399,8 @@ where | (cmp::Operator::LT, cmp::Operator::GTE) | (cmp::Operator::LTE, cmp::Operator::GT) | (cmp::Operator::LTE, cmp::Operator::GTE) => self.row_ids_cmp_range_order( - (&left_physical, Self::ord_from_op(&left.1)), - (&right_physical, Self::ord_from_op(&right.1)), + (&left.0, Self::ord_from_op(&left.1)), + (&right.0, Self::ord_from_op(&right.1)), dst, ), @@ -454,7 +445,7 @@ where let left_result_no = left_cmp_result != Some(left_op.0) && left_cmp_result != Some(left_op.1); let right_result_no = - right_cmp_result != Some(right_op.0) && 
left_cmp_result != Some(right_op.1); + right_cmp_result != Some(right_op.0) && right_cmp_result != Some(right_op.1); if (left_result_no || right_result_no) && found { let (min, max) = (i as u32 - count as u32, i as u32); @@ -652,6 +643,29 @@ mod test { // assert!(v.max::(&[0, 1, 2, 3]).is_nan()); } + #[test] + fn ord_from_op() { + assert_eq!( + Fixed::::ord_from_op(&cmp::Operator::LT), + (Ordering::Less, Ordering::Less) + ); + + assert_eq!( + Fixed::::ord_from_op(&cmp::Operator::GT), + (Ordering::Greater, Ordering::Greater) + ); + + assert_eq!( + Fixed::::ord_from_op(&cmp::Operator::LTE), + (Ordering::Less, Ordering::Equal) + ); + + assert_eq!( + Fixed::::ord_from_op(&cmp::Operator::GTE), + (Ordering::Greater, Ordering::Equal) + ); + } + #[test] fn row_ids_filter_eq() { let mut v: Fixed = Fixed::default(); @@ -781,5 +795,14 @@ mod test { RowIDs::new_bitmap(), ); assert!(dst.is_empty()); + + let mut v: Fixed = Fixed::default(); + v.values = vec![100, 200, 300, 2, 200, 22, 30]; + let dst = v.row_ids_filter_range( + (200, Operator::GTE), + (300, Operator::LTE), + RowIDs::new_vector(), + ); + assert_eq!(dst.unwrap_vector(), &vec![1, 2, 4]); } } diff --git a/delorean_segment_store/src/column/fixed_null.rs b/delorean_segment_store/src/column/fixed_null.rs index dac6da76b8..94785a2705 100644 --- a/delorean_segment_store/src/column/fixed_null.rs +++ b/delorean_segment_store/src/column/fixed_null.rs @@ -449,7 +449,7 @@ where let left_result_no = left_cmp_result != Some(left_op.0) && left_cmp_result != Some(left_op.1); let right_result_no = - right_cmp_result != Some(right_op.0) && left_cmp_result != Some(right_op.1); + right_cmp_result != Some(right_op.0) && right_cmp_result != Some(right_op.1); if (self.arr.is_null(i) || left_result_no || right_result_no) && found { let (min, max) = (i as u64 - count as u64, i as u64); @@ -770,7 +770,7 @@ mod test { #[test] fn row_ids_filter_range() { - let v = super::FixedNull::::from( + let v = FixedNull::::from( vec![ Some(100), 
Some(101), @@ -815,5 +815,21 @@ mod test { Bitmap::create(), ); assert_eq!(bm.to_vec(), Vec::::new()); + + let v = FixedNull::::from( + vec![ + Some(100), + Some(200), + Some(300), + Some(2), + Some(200), + Some(22), + Some(30), + ] + .as_slice(), + ); + let bm = + v.row_ids_filter_range((200, Operator::GTE), (300, Operator::LTE), Bitmap::create()); + assert_eq!(bm.to_vec(), vec![1, 2, 4]); } } From c5dc48db04c6ceb3adb2fed77ffa95faf7277b75 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 30 Oct 2020 19:03:11 +0000 Subject: [PATCH 5/6] feat: implement range based predicate on column --- delorean_segment_store/src/column.rs | 512 +++++++++++++++++++++++---- 1 file changed, 442 insertions(+), 70 deletions(-) diff --git a/delorean_segment_store/src/column.rs b/delorean_segment_store/src/column.rs index fd7ada83a2..d3f8a76cf3 100644 --- a/delorean_segment_store/src/column.rs +++ b/delorean_segment_store/src/column.rs @@ -184,32 +184,12 @@ impl Column { /// /// TODO(edd): row ids pooling. pub fn row_ids_filter(&self, op: cmp::Operator, value: Value<'_>) -> RowIDsOption { - match op { - // When the predicate is == and the metadata range indicates the column - // can't contain `value` then the column doesn't need to be read. - cmp::Operator::Equal => { - if !self.might_contain_value(&value) { - return RowIDsOption::None; // no rows are going to match. - } - } - - // When the predicate is one of {<, <=, >, >=} and the column doesn't - // contain any null values, and the entire range of values satisfies the - // predicate then the column doesn't need to be read. - cmp::Operator::GT | cmp::Operator::GTE | cmp::Operator::LT | cmp::Operator::LTE => { - if self.predicate_matches_all_values(&op, &value) { - return RowIDsOption::All; - } - } - - // When the predicate is != and the metadata range indicates that the - // column can't possibly contain `value` then the predicate must - // match all rows on the column. 
- cmp::Operator::NotEqual => { - if !self.might_contain_value(&value) { - return RowIDsOption::All; // all rows are going to match. - } - } + // If we can get an answer using only the meta-data on the column then + // return that answer. + match self.evaluate_predicate_on_meta(&op, &value) { + PredicateMatch::None => return RowIDsOption::None, + PredicateMatch::All => return RowIDsOption::All, + PredicateMatch::SomeMaybe => {} // have to apply predicate to column } // TODO(edd): figure out pooling of these @@ -240,7 +220,95 @@ impl Column { low: (cmp::Operator, Value<'_>), high: (cmp::Operator, Value<'_>), ) -> RowIDsOption { - todo!() + let l = self.evaluate_predicate_on_meta(&low.0, &low.1); + let h = self.evaluate_predicate_on_meta(&high.0, &high.1); + match (l, h) { + (PredicateMatch::All, PredicateMatch::All) => return RowIDsOption::All, + + // One of the predicates can't be satisfied, therefore no rows will + // match both predicates. + (PredicateMatch::None, _) | (_, PredicateMatch::None) => return RowIDsOption::None, + + // One of the predicates matches all rows so reduce the operation + // to the other side. + (PredicateMatch::SomeMaybe, PredicateMatch::All) => { + return self.row_ids_filter(low.0, low.1); + } + (PredicateMatch::All, PredicateMatch::SomeMaybe) => { + return self.row_ids_filter(high.0, high.1); + } + + // Have to apply the predicates to the column to identify correct + // set of rows. + (PredicateMatch::SomeMaybe, PredicateMatch::SomeMaybe) => {} + } + + // TODO(edd): figure out pooling of these + let dst = RowIDs::Bitmap(Bitmap::create()); + + // Check the column for all rows that satisfy the predicate. 
+ let row_ids = match &self { + Column::String(_, data) => unimplemented!("not supported on string columns yet"), + Column::Float(_, data) => { + data.row_ids_filter_range((low.0, low.1.scalar()), (high.0, high.1.scalar()), dst) + } + Column::Integer(_, data) => { + data.row_ids_filter_range((low.0, low.1.scalar()), (high.0, high.1.scalar()), dst) + } + Column::Unsigned(_, data) => { + data.row_ids_filter_range((low.0, low.1.scalar()), (high.0, high.1.scalar()), dst) + } + Column::Bool => todo!(), + Column::ByteArray(_, data) => todo!(), + }; + + if row_ids.is_empty() { + return RowIDsOption::None; + } + RowIDsOption::Some(row_ids) + } + + // Helper function to determine if the predicate matches either no rows or + // all the rows in a column. This is determined by looking at the metadata + // on the column. + // + // `SomeMaybe` indicates that the column may contain some matching rows and the + // predicate should be directly applied to the column. + fn evaluate_predicate_on_meta(&self, op: &cmp::Operator, value: &Value<'_>) -> PredicateMatch { + match op { + // When the predicate is == and the metadata range indicates the column + // can't contain `value` then the column doesn't need to be read. + cmp::Operator::Equal => { + if !self.might_contain_value(&value) { + return PredicateMatch::None; // no rows are going to match. + } + } + + // When the predicate is one of {<, <=, >, >=} and the column doesn't + // contain any null values, and the entire range of values satisfies the + // predicate then the column doesn't need to be read. + cmp::Operator::GT | cmp::Operator::GTE | cmp::Operator::LT | cmp::Operator::LTE => { + if self.predicate_matches_all_values(&op, &value) { + return PredicateMatch::All; + } + } + + // When the predicate is != and the metadata range indicates that the + // column can't possibly contain `value` then the predicate must + // match all rows on the column. 
+ cmp::Operator::NotEqual => { + if !self.might_contain_value(&value) { + return PredicateMatch::All; // all rows are going to match. + } + } + } + + if self.predicate_matches_no_values(&op, &value) { + return PredicateMatch::None; + } + + // The predicate could match some values + PredicateMatch::SomeMaybe } // Helper method to determine if the column possibly contains this value @@ -339,6 +407,32 @@ impl Column { } } + // Helper method to determine if the predicate can not possibly match any + // values in the column. + fn predicate_matches_no_values(&self, op: &cmp::Operator, value: &Value<'_>) -> bool { + match &self { + Column::String(meta, data) => { + if let Value::String(other) = value { + meta.match_no_values(op, other) + } else { + unreachable!("impossible value comparison"); + } + } + // breaking this down: + // * Extract a Scalar variant from `value`, which should panic if + // that's not possible; + // * Convert that scalar to a primitive value based + // on the logical type used for the metadata on the column. + // * See if one can prove none of the column can match the predicate. + // + Column::Float(meta, data) => meta.match_no_values(op, &value.scalar().as_f64()), + Column::Integer(meta, data) => meta.match_no_values(op, &value.scalar().as_i64()), + Column::Unsigned(meta, data) => meta.match_no_values(op, &value.scalar().as_u64()), + Column::Bool => todo!(), + Column::ByteArray(meta, _) => todo!(), + } + } + // // Methods for selecting // @@ -411,7 +505,6 @@ impl MetaData { // column. It is up to the caller to determine if the column contains null // values, which would invalidate a truthful result. fn might_match_all_values(&self, op: &cmp::Operator, v: &T) -> bool { - println!("comparing {:?} {:?}, {:?}", op, v, self.range); match &self.range { Some(range) => match op { // all values in column equal to v @@ -430,6 +523,28 @@ impl MetaData { None => false, // only null values in column. 
} } + + // Determines if it can be shown that the predicate would not match any rows + // in the column. + fn match_no_values(&self, op: &cmp::Operator, v: &T) -> bool { + match &self.range { + Some(range) => match op { + // no values are `v` so no rows will match `== v` + cmp::Operator::Equal => range.0 == range.1 && &range.1 != v, + // all values are `v` so no rows will match `!= v` + cmp::Operator::NotEqual => range.0 == range.1 && &range.1 == v, + // max value in column is `<= v` so no values can be `> v` + cmp::Operator::GT => &range.1 <= v, + // max value in column is `< v` so no values can be `>= v` + cmp::Operator::GTE => &range.1 < v, + // min value in column is `>= v` so no values can be `< v` + cmp::Operator::LT => &range.0 >= v, + // min value in column is `> v` so no values can be `<= v` + cmp::Operator::LTE => &range.0 > v, + }, + None => true, // only null values in column so no values satisfy `v` + } + } } pub enum StringEncoding { RLE(dictionary::RLE), @@ -828,13 +943,13 @@ impl IntegerEncoding { // non-null signed 64-bit integers. 
match dst { EncodedValues::I64(dst) => match &self { - IntegerEncoding::I64I64(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64I32(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64U32(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64I16(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64U16(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64I8(data) => EncodedValues::I64(data.values(row_ids, dst)), - IntegerEncoding::I64U8(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64I64(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64I32(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64U32(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64I16(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64U16(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64I8(data) => EncodedValues::I64(data.values(row_ids, dst)), + Self::I64U8(data) => EncodedValues::I64(data.values(row_ids, dst)), _ => unreachable!("encoded values on encoding type not supported"), }, _ => unreachable!("currently only support encoded values as i64"), @@ -849,13 +964,13 @@ impl IntegerEncoding { // non-null signed 64-bit integers. 
match dst { EncodedValues::I64(dst) => match &self { - IntegerEncoding::I64I64(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64I32(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64U32(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64I16(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64U16(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64I8(data) => EncodedValues::I64(data.all_values(dst)), - IntegerEncoding::I64U8(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64I64(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64I32(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64U32(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64I16(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64U16(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64I8(data) => EncodedValues::I64(data.all_values(dst)), + Self::I64U8(data) => EncodedValues::I64(data.all_values(dst)), _ => unreachable!("encoded values on encoding type not supported"), }, _ => unreachable!("currently only support encoded values as i64"), @@ -869,33 +984,127 @@ impl IntegerEncoding { /// `row_ids_filter` will panic if this invariant is broken. 
pub fn row_ids_filter(&self, op: cmp::Operator, value: &Scalar, dst: RowIDs) -> RowIDs { match &self { - IntegerEncoding::I64I64(c) => c.row_ids_filter(value.as_i64(), op, dst), - IntegerEncoding::I64I32(c) => c.row_ids_filter(value.as_i32(), op, dst), - IntegerEncoding::I64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), - IntegerEncoding::I64I16(c) => c.row_ids_filter(value.as_i16(), op, dst), - IntegerEncoding::I64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), - IntegerEncoding::I64I8(c) => c.row_ids_filter(value.as_i8(), op, dst), - IntegerEncoding::I64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::I32I32(c) => c.row_ids_filter(value.as_i32(), op, dst), - IntegerEncoding::I32I16(c) => c.row_ids_filter(value.as_i16(), op, dst), - IntegerEncoding::I32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), - IntegerEncoding::I32I8(c) => c.row_ids_filter(value.as_i8(), op, dst), - IntegerEncoding::I32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::I16I16(c) => c.row_ids_filter(value.as_i16(), op, dst), - IntegerEncoding::I16I8(c) => c.row_ids_filter(value.as_i8(), op, dst), - IntegerEncoding::I16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::I8I8(c) => c.row_ids_filter(value.as_i8(), op, dst), - IntegerEncoding::U64U64(c) => c.row_ids_filter(value.as_u64(), op, dst), - IntegerEncoding::U64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), - IntegerEncoding::U64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), - IntegerEncoding::U64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::U32U32(c) => c.row_ids_filter(value.as_u32(), op, dst), - IntegerEncoding::U32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), - IntegerEncoding::U32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::U16U16(c) => c.row_ids_filter(value.as_u16(), op, dst), - IntegerEncoding::U16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::U8U8(c) => 
c.row_ids_filter(value.as_u8(), op, dst), - IntegerEncoding::I64I64N(c) => todo!(), + Self::I64I64(c) => c.row_ids_filter(value.as_i64(), op, dst), + Self::I64I32(c) => c.row_ids_filter(value.as_i32(), op, dst), + Self::I64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + Self::I64I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + Self::I64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + Self::I64I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + Self::I64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::I32I32(c) => c.row_ids_filter(value.as_i32(), op, dst), + Self::I32I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + Self::I32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + Self::I32I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + Self::I32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::I16I16(c) => c.row_ids_filter(value.as_i16(), op, dst), + Self::I16I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + Self::I16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::I8I8(c) => c.row_ids_filter(value.as_i8(), op, dst), + Self::U64U64(c) => c.row_ids_filter(value.as_u64(), op, dst), + Self::U64U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + Self::U64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + Self::U64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::U32U32(c) => c.row_ids_filter(value.as_u32(), op, dst), + Self::U32U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + Self::U32U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::U16U16(c) => c.row_ids_filter(value.as_u16(), op, dst), + Self::U16U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::U8U8(c) => c.row_ids_filter(value.as_u8(), op, dst), + Self::I64I64N(c) => todo!(), + } + } + + /// Returns the row ids that satisfy both the provided predicates. + /// + /// Note: it is the caller's responsibility to ensure that the provided + /// `Scalar` value will fit within the physical type of the encoded column. 
+ /// `row_ids_filter_range` will panic if this invariant is broken. + pub fn row_ids_filter_range( + &self, + low: (cmp::Operator, &Scalar), + high: (cmp::Operator, &Scalar), + dst: RowIDs, + ) -> RowIDs { + match &self { + Self::I64I64(c) => { + c.row_ids_filter_range((low.1.as_i64(), low.0), (high.1.as_i64(), high.0), dst) + } + Self::I64I32(c) => { + c.row_ids_filter_range((low.1.as_i32(), low.0), (high.1.as_i32(), high.0), dst) + } + Self::I64U32(c) => { + c.row_ids_filter_range((low.1.as_u32(), low.0), (high.1.as_u32(), high.0), dst) + } + Self::I64I16(c) => { + c.row_ids_filter_range((low.1.as_i16(), low.0), (high.1.as_i16(), high.0), dst) + } + Self::I64U16(c) => { + c.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::I64I8(c) => { + c.row_ids_filter_range((low.1.as_i8(), low.0), (high.1.as_i8(), high.0), dst) + } + Self::I64U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::I32I32(c) => { + c.row_ids_filter_range((low.1.as_i32(), low.0), (high.1.as_i32(), high.0), dst) + } + Self::I32I16(c) => { + c.row_ids_filter_range((low.1.as_i16(), low.0), (high.1.as_i16(), high.0), dst) + } + Self::I32U16(c) => { + c.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::I32I8(c) => { + c.row_ids_filter_range((low.1.as_i8(), low.0), (high.1.as_i8(), high.0), dst) + } + Self::I32U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::I16I16(c) => { + c.row_ids_filter_range((low.1.as_i16(), low.0), (high.1.as_i16(), high.0), dst) + } + Self::I16I8(c) => { + c.row_ids_filter_range((low.1.as_i8(), low.0), (high.1.as_i8(), high.0), dst) + } + Self::I16U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::I8I8(c) => { + c.row_ids_filter_range((low.1.as_i8(), low.0), (high.1.as_i8(), high.0), dst) + } + Self::U64U64(c) => { +
c.row_ids_filter_range((low.1.as_u64(), low.0), (high.1.as_u64(), high.0), dst) + } + Self::U64U32(c) => { + c.row_ids_filter_range((low.1.as_u32(), low.0), (high.1.as_u32(), high.0), dst) + } + Self::U64U16(c) => { + c.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::U64U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::U32U32(c) => { + c.row_ids_filter_range((low.1.as_u32(), low.0), (high.1.as_u32(), high.0), dst) + } + Self::U32U16(c) => { + c.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::U32U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::U16U16(c) => { + c.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::U16U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::U8U8(c) => { + c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + Self::I64I64N(c) => todo!(), } } } @@ -945,6 +1154,27 @@ impl FloatEncoding { FloatEncoding::Fixed32(c) => c.row_ids_filter(value.as_f32(), op, dst), } } + + /// Returns the row ids that satisfy both the provided predicates. + /// + /// Note: it is the caller's responsibility to ensure that the provided + /// `Scalar` value will fit within the physical type of the encoded column. + /// `row_ids_filter_range` will panic if this invariant is broken.
+ pub fn row_ids_filter_range( + &self, + low: (cmp::Operator, &Scalar), + high: (cmp::Operator, &Scalar), + dst: RowIDs, + ) -> RowIDs { + match &self { + FloatEncoding::Fixed64(c) => { + c.row_ids_filter_range((low.1.as_f64(), low.0), (high.1.as_f64(), high.0), dst) + } + FloatEncoding::Fixed32(c) => { + c.row_ids_filter_range((low.1.as_f32(), low.0), (high.1.as_f32(), high.0), dst) + } + } + } } // Converts an Arrow `StringArray` into a column, currently using the RLE @@ -1714,9 +1944,16 @@ impl EncodedValues { } } +#[derive(Debug, PartialEq)] +enum PredicateMatch { + None, + SomeMaybe, + All, +} + /// A specific type of Option for `RowIDs` where the notion of all rows ids is /// represented. -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum RowIDsOption { None, Some(RowIDs), @@ -2392,6 +2629,18 @@ mod test { Value::String(&"Adam Raised a Cain".to_string()), ); assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter( + cmp::Operator::GT, + Value::String(&"Adam Raised a Cain".to_string()), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter( + cmp::Operator::NotEqual, + Value::String(&"Thunder Road".to_string()), + ); + assert!(matches!(row_ids, RowIDsOption::All)); } #[test] @@ -2413,6 +2662,9 @@ mod test { row_ids = col.row_ids_filter(cmp::Operator::NotEqual, Value::Scalar(Scalar::I32(-1257))); assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter(cmp::Operator::LT, Value::Scalar(Scalar::I64(i64::MAX))); + assert!(matches!(row_ids, RowIDsOption::All)); } #[test] @@ -2461,6 +2713,63 @@ mod test { assert!(matches!(row_ids, RowIDsOption::All)); } + #[test] + fn row_ids_range() { + let input = &[100, 200, 300, 2, 200, 22, 30]; + + let col = Column::from(&input[..]); + let mut row_ids = col.row_ids_filter_range( + (cmp::Operator::GT, Value::Scalar(Scalar::I32(100))), + (cmp::Operator::LT, Value::Scalar(Scalar::I32(300))), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![1, 
4]); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(200))), + (cmp::Operator::LTE, Value::Scalar(Scalar::I32(300))), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![1, 2, 4]); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(23333))), + (cmp::Operator::LTE, Value::Scalar(Scalar::I32(999999))), + ); + assert!(matches!(row_ids, RowIDsOption::None)); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GT, Value::Scalar(Scalar::I32(-100))), + (cmp::Operator::LT, Value::Scalar(Scalar::I32(301))), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(2))), + (cmp::Operator::LTE, Value::Scalar(Scalar::I32(300))), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(87))), + (cmp::Operator::LTE, Value::Scalar(Scalar::I32(999999))), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 1, 2, 4]); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(0))), + ( + cmp::Operator::NotEqual, + Value::Scalar(Scalar::I64(i64::MAX)), + ), + ); + assert!(matches!(row_ids, RowIDsOption::All)); + + row_ids = col.row_ids_filter_range( + (cmp::Operator::GTE, Value::Scalar(Scalar::I32(0))), + (cmp::Operator::NotEqual, Value::Scalar(Scalar::I64(99))), + ); + assert_eq!(row_ids.unwrap().to_vec(), vec![0, 1, 2, 3, 4, 5, 6]); + } + #[test] fn might_contain_value() { let input = &[100i64, 200, 300, 2, 200, 22, 30]; @@ -2566,4 +2875,67 @@ mod test { false ); } + + #[test] + fn evaluate_predicate_on_meta() { + let input = &[100i64, 200, 300, 2, 200, 22, 30]; + let col = Column::from(&input[..]); + + let cases: Vec<(cmp::Operator, Scalar, PredicateMatch)> = vec![ + ( + cmp::Operator::GT, + Scalar::U64(100), + PredicateMatch::SomeMaybe, + ), + ( + cmp::Operator::GT, + Scalar::I64(100), + 
PredicateMatch::SomeMaybe, + ), + (cmp::Operator::GT, Scalar::I8(-99), PredicateMatch::All), + ( + cmp::Operator::GT, + Scalar::I64(100), + PredicateMatch::SomeMaybe, + ), + ( + cmp::Operator::LT, + Scalar::I64(300), + PredicateMatch::SomeMaybe, + ), + (cmp::Operator::LTE, Scalar::I32(300), PredicateMatch::All), + ( + cmp::Operator::Equal, + Scalar::I32(2), + PredicateMatch::SomeMaybe, + ), + ( + cmp::Operator::NotEqual, + Scalar::I32(2), + PredicateMatch::SomeMaybe, + ), + (cmp::Operator::NotEqual, Scalar::I64(1), PredicateMatch::All), + ( + cmp::Operator::NotEqual, + Scalar::I64(301), + PredicateMatch::All, + ), + (cmp::Operator::GT, Scalar::I64(100000), PredicateMatch::None), + (cmp::Operator::GTE, Scalar::I64(301), PredicateMatch::None), + (cmp::Operator::LT, Scalar::I64(2), PredicateMatch::None), + (cmp::Operator::LTE, Scalar::I8(-100), PredicateMatch::None), + ( + cmp::Operator::Equal, + Scalar::I64(100000), + PredicateMatch::None, + ), + ]; + + for (op, scalar, result) in cases { + assert_eq!( + col.evaluate_predicate_on_meta(&op, &Value::Scalar(scalar)), + result + ); + } + } } From 2027b1f8da62a174ce7471851f0982d6d5364d41 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 2 Nov 2020 13:24:29 +0000 Subject: [PATCH 6/6] refactor: PR feedback Co-authored-by: Andrew Lamb --- delorean_segment_store/src/column.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delorean_segment_store/src/column.rs b/delorean_segment_store/src/column.rs index d3f8a76cf3..1f7ef0c65d 100644 --- a/delorean_segment_store/src/column.rs +++ b/delorean_segment_store/src/column.rs @@ -1793,7 +1793,7 @@ macro_rules! typed_scalar_converters { } impl Scalar { - // Implementations of all the accessors for the variants of `Packers`. + // Implementations of all the accessors for the variants of `Scalar`. typed_scalar_converters! { (as_i64, try_as_i64, i64), (as_i32, try_as_i32, i32),