diff --git a/read_buffer/src/column.rs b/read_buffer/src/column.rs index 484ab06055..0cad1770f5 100644 --- a/read_buffer/src/column.rs +++ b/read_buffer/src/column.rs @@ -1,3 +1,4 @@ +pub mod bool; pub mod cmp; pub mod dictionary; pub mod fixed; diff --git a/read_buffer/src/column/bool.rs b/read_buffer/src/column/bool.rs new file mode 100644 index 0000000000..2f17e1ca03 --- /dev/null +++ b/read_buffer/src/column/bool.rs @@ -0,0 +1,440 @@ +//! An encoding nullable bool, by an Arrow array. +use std::cmp::Ordering; +use std::fmt::Debug; + +use arrow_deps::arrow::array::{Array, BooleanArray}; + +use crate::column::{cmp, RowIDs}; + +#[derive(Debug)] +pub struct Bool { + arr: BooleanArray, +} + +impl std::fmt::Display for Bool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "[Bool] rows: {:?}, nulls: {:?}, size: {}", + self.arr.len(), + self.arr.null_count(), + self.size() + ) + } +} +impl Bool { + pub fn num_rows(&self) -> u32 { + self.arr.len() as u32 + } + + pub fn is_empty(&self) -> bool { + self.arr.is_empty() + } + + pub fn contains_null(&self) -> bool { + self.arr.null_count() > 0 + } + + /// Returns the total size in bytes of the encoded data. Note, this method + /// is really an "accurate" estimation. It doesn't include for example the + /// size of the `Plain` struct receiver. + pub fn size(&self) -> u64 { + unimplemented!("not yet implemented") + } + + // + // + // ---- Methods for getting row ids from values. + // + // + + /// Returns the first logical row that contains a value `v`. + pub fn first_row_id_eq_value(&self, v: bool) -> Option { + for i in 0..self.arr.len() { + if self.arr.is_null(i) { + continue; + } else if self.arr.value(i) == v { + return Some(i); + } + } + None + } + + // + // + // ---- Methods for getting decoded values. + // + // + + /// Return the logical value at the provided row ID. A NULL value + /// is represented by None. + pub fn value(&self, row_id: u32) -> Option { + if self.arr.is_null(row_id as usize) { + return None; + } + Some(self.arr.value(row_id as usize)) + } + + /// Returns the logical values for the provided row IDs. + /// + /// NULL values are represented by None. + pub fn values(&self, row_ids: &[u32], mut dst: Vec>) -> Vec> { + dst.clear(); + dst.reserve(row_ids.len()); + + for &row_id in row_ids { + if self.arr.is_null(row_id as usize) { + dst.push(None) + } else { + dst.push(Some(self.arr.value(row_id as usize))) + } + } + assert_eq!(dst.len(), row_ids.len()); + dst + } + + /// Returns the logical values for all the rows in the column. + /// + /// NULL values are represented by None. + pub fn all_values(&self, mut dst: Vec>) -> Vec> { + dst.clear(); + dst.reserve(self.arr.len()); + + for i in 0..self.num_rows() as usize { + if self.arr.is_null(i) { + dst.push(None) + } else { + dst.push(Some(self.arr.value(i))) + } + } + assert_eq!(dst.len(), self.num_rows() as usize); + dst + } + + // + // + // ---- Methods for aggregation. + // + // + + /// Returns the count of the non-null values for the provided + /// row IDs. + pub fn count(&self, row_ids: &[u32]) -> u32 { + if self.arr.null_count() == 0 { + return row_ids.len() as u32; + } + + let mut count = 0; + for &i in row_ids { + if self.arr.is_null(i as usize) { + continue; + } + count += 1; + } + count + } + + /// Returns the first logical (decoded) value from the provided + /// row IDs. + pub fn first(&self, row_ids: &[u32]) -> Option { + self.value(row_ids[0]) + } + + /// Returns the last logical (decoded) value from the provided + /// row IDs. + pub fn last(&self, row_ids: &[u32]) -> Option { + self.value(row_ids[row_ids.len() - 1]) + } + + /// Returns the minimum logical (decoded) non-null value from the provided + /// row IDs. + pub fn min(&self, row_ids: &[u32]) -> Option { + let mut min: Option = self.value(row_ids[0]); + for &v in row_ids.iter().skip(1) { + if self.arr.is_null(v as usize) { + continue; + } + + if self.value(v) < min { + min = self.value(v); + } + } + min + } + + /// Returns the maximum logical (decoded) non-null value from the provided + /// row IDs. + pub fn max(&self, row_ids: &[u32]) -> Option { + let mut max: Option = self.value(row_ids[0]); + for &v in row_ids.iter().skip(1) { + if self.arr.is_null(v as usize) { + continue; + } + + if self.value(v) > max { + max = self.value(v); + } + } + max + } + + // + // + // ---- Methods for filtering via operators. + // + // + + /// Returns the set of row ids that satisfy a binary operator on a logical + /// value. + /// + /// Essentially, this supports `value {=, !=, >, >=, <, <=} x`. + /// + /// The equivalent of `IS NULL` is not currently supported via this method. + pub fn row_ids_filter(&self, value: bool, op: &cmp::Operator, dst: RowIDs) -> RowIDs { + match op { + cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst), + _ => self.row_ids_equal(value, op, dst), + } + } + + // Helper function to convert comparison operators to cmp orderings. + fn ord_from_op(op: &cmp::Operator) -> (Ordering, Ordering) { + match op { + cmp::Operator::GT => (Ordering::Greater, Ordering::Greater), + cmp::Operator::GTE => (Ordering::Greater, Ordering::Equal), + cmp::Operator::LT => (Ordering::Less, Ordering::Less), + cmp::Operator::LTE => (Ordering::Less, Ordering::Equal), + _ => panic!("cannot convert operator to ordering"), + } + } + + // Handles finding all rows that match the provided operator on `value`. + // For performance reasons ranges of matching values are collected up and + // added in bulk to the bitmap. + fn row_ids_equal(&self, value: bool, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs { + dst.clear(); + + let desired; + if let cmp::Operator::Equal = op { + desired = true; // == operator + } else { + desired = false; // != operator + } + + let mut found = false; + let mut count = 0; + for i in 0..self.num_rows() as usize { + let mut cmp_result: bool; + let cmp_result = self.arr.value(i) == value; + + if (self.arr.is_null(i) || cmp_result != desired) && found { + let (min, max) = (i as u32 - count, i as u32); + dst.add_range(min, max); + found = false; + count = 0; + continue; + } else if self.arr.is_null(i) || cmp_result != desired { + continue; + } + + if !found { + found = true; + } + count += 1; + } + + // add any remaining range. + if found { + let (min, max) = (self.num_rows() - count, self.num_rows()); + dst.add_range(min, max); + } + dst + } + + // Handles finding all rows that match the provided operator on `value`. + // For performance reasons ranges of matching values are collected up and + // added in bulk to the bitmap. + // + // `op` is a tuple of comparisons where at least one of them must be + // satisfied to satisfy the overall operator. + fn row_ids_cmp_order( + &self, + value: bool, + op: (std::cmp::Ordering, std::cmp::Ordering), + mut dst: RowIDs, + ) -> RowIDs { + dst.clear(); + + let mut found = false; + let mut count = 0; + for i in 0..self.num_rows() as usize { + let cmp_result = self.arr.value(i).partial_cmp(&value); + + if (self.arr.is_null(i) || (cmp_result != Some(op.0) && cmp_result != Some(op.1))) + && found + { + let (min, max) = (i as u32 - count, i as u32); + dst.add_range(min, max); + found = false; + count = 0; + continue; + } else if self.arr.is_null(i) || (cmp_result != Some(op.0) && cmp_result != Some(op.1)) + { + continue; + } + + if !found { + found = true; + } + count += 1; + } + + // add any remaining range. + if found { + let (min, max) = (self.num_rows() - count, self.num_rows()); + dst.add_range(min, max); + } + dst + } +} + +impl From<&[bool]> for Bool { + fn from(v: &[bool]) -> Self { + Self { + arr: BooleanArray::from(v.to_vec()), + } + } +} + +impl From<&[Option]> for Bool { + fn from(v: &[Option]) -> Self { + Self { + arr: BooleanArray::from(v.to_vec()), + } + } +} + +impl From for Bool { + fn from(arr: BooleanArray) -> Self { + Self { arr } + } +} + +#[cfg(test)] +mod test { + use super::cmp::Operator; + use super::*; + + fn some_vec(v: Vec) -> Vec> { + v.iter().map(|x| Some(*x)).collect() + } + + #[test] + fn first_row_id_eq_value() { + let v = Bool::from(vec![true, true].as_slice()); + + assert_eq!(v.first_row_id_eq_value(true), Some(0)); + assert_eq!(v.first_row_id_eq_value(false), None); + } + + #[test] + fn value() { + let v = Bool::from(vec![Some(false), Some(true), Some(false), None].as_slice()); + assert_eq!(v.value(1), Some(true)); + assert_eq!(v.value(3), None); + } + + #[test] + fn count() { + let v = Bool::from(&[Some(true), None, Some(true)][..]); + assert_eq!(v.count(&[0, 1, 2]), 2); + assert_eq!(v.count(&[0, 2]), 2); + assert_eq!(v.count(&[0, 1]), 1); + assert_eq!(v.count(&[1]), 0); + } + + #[test] + fn first() { + let v = Bool::from(&[false, true, true][..]); + assert_eq!(v.first(&[0, 1, 2]), Some(false)); + assert_eq!(v.first(&[1, 2]), Some(true)); + } + + #[test] + fn last() { + let v = Bool::from(&[false, true, false][..]); + assert_eq!(v.last(&[0, 1, 2]), Some(false)); + assert_eq!(v.last(&[1, 2]), Some(false)); + assert_eq!(v.last(&[0, 2]), Some(false)); + } + + #[test] + fn min() { + let v = Bool::from(&[Some(true), Some(true), Some(false), None][..]); + assert_eq!(v.min(&[0, 1, 2, 3]), Some(false)); + assert_eq!(v.min(&[1, 2]), Some(false)); + assert_eq!(v.min(&[0, 1]), Some(true)); + assert_eq!(v.min(&[0, 3]), Some(true)); + assert_eq!(v.min(&[3]), None); + } + + #[test] + fn max() { + let v = Bool::from(&[Some(true), Some(true), Some(false), None][..]); + assert_eq!(v.max(&[0, 1, 2, 3]), Some(true)); + assert_eq!(v.max(&[1, 2]), Some(true)); + assert_eq!(v.max(&[0, 1]), Some(true)); + assert_eq!(v.max(&[0, 3]), Some(true)); + assert_eq!(v.max(&[3]), None); + } + + #[test] + fn row_ids_filter() { + let v = Bool::from(&[Some(true), Some(false), None, None, Some(true)][..]); + + // EQ + let row_ids = v.row_ids_filter(true, &Operator::Equal, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 4]); + + let row_ids = v.row_ids_filter(false, &Operator::Equal, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![1]); + + // NEQ + let row_ids = v.row_ids_filter(true, &Operator::NotEqual, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![1]); + + let row_ids = v.row_ids_filter(false, &Operator::NotEqual, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 4]); + + // GT + let row_ids = v.row_ids_filter(true, &Operator::GT, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), Vec::::new()); + + let row_ids = v.row_ids_filter(false, &Operator::GT, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 4]); + + // GTE + let row_ids = v.row_ids_filter(true, &Operator::GTE, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 4]); + + let row_ids = v.row_ids_filter(false, &Operator::GTE, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 1, 4]); + + // LT + let row_ids = v.row_ids_filter(true, &Operator::LT, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![1]); + + let row_ids = v.row_ids_filter(false, &Operator::LT, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), Vec::::new()); + + // LTE + let row_ids = v.row_ids_filter(true, &Operator::LTE, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![0, 1, 4]); + + let row_ids = v.row_ids_filter(false, &Operator::LTE, RowIDs::new_vector()); + assert_eq!(row_ids.to_vec(), vec![1]); + } +}