feat: an arrow-backed boolean encoding
parent
0dae29f9ea
commit
36d9541cbc
|
@ -1,3 +1,4 @@
|
|||
pub mod bool;
|
||||
pub mod cmp;
|
||||
pub mod dictionary;
|
||||
pub mod fixed;
|
||||
|
|
|
@ -0,0 +1,440 @@
|
|||
//! An encoding for nullable bools, backed by an Arrow array.
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt::Debug;
|
||||
|
||||
use arrow_deps::arrow::array::{Array, BooleanArray};
|
||||
|
||||
use crate::column::{cmp, RowIDs};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Bool {
|
||||
arr: BooleanArray,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Bool {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"[Bool] rows: {:?}, nulls: {:?}, size: {}",
|
||||
self.arr.len(),
|
||||
self.arr.null_count(),
|
||||
self.size()
|
||||
)
|
||||
}
|
||||
}
|
||||
impl Bool {
|
||||
pub fn num_rows(&self) -> u32 {
|
||||
self.arr.len() as u32
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.arr.is_empty()
|
||||
}
|
||||
|
||||
pub fn contains_null(&self) -> bool {
|
||||
self.arr.null_count() > 0
|
||||
}
|
||||
|
||||
/// Returns the total size in bytes of the encoded data. Note, this method
|
||||
/// is really an "accurate" estimation. It doesn't include for example the
|
||||
/// size of the `Plain` struct receiver.
|
||||
pub fn size(&self) -> u64 {
|
||||
unimplemented!("not yet implemented")
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// ---- Methods for getting row ids from values.
|
||||
//
|
||||
//
|
||||
|
||||
/// Returns the first logical row that contains a value `v`.
|
||||
pub fn first_row_id_eq_value(&self, v: bool) -> Option<usize> {
|
||||
for i in 0..self.arr.len() {
|
||||
if self.arr.is_null(i) {
|
||||
continue;
|
||||
} else if self.arr.value(i) == v {
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// ---- Methods for getting decoded values.
|
||||
//
|
||||
//
|
||||
|
||||
/// Return the logical value at the provided row ID. A NULL value
|
||||
/// is represented by None.
|
||||
pub fn value(&self, row_id: u32) -> Option<bool> {
|
||||
if self.arr.is_null(row_id as usize) {
|
||||
return None;
|
||||
}
|
||||
Some(self.arr.value(row_id as usize))
|
||||
}
|
||||
|
||||
/// Returns the logical values for the provided row IDs.
|
||||
///
|
||||
/// NULL values are represented by None.
|
||||
pub fn values(&self, row_ids: &[u32], mut dst: Vec<Option<bool>>) -> Vec<Option<bool>> {
|
||||
dst.clear();
|
||||
dst.reserve(row_ids.len());
|
||||
|
||||
for &row_id in row_ids {
|
||||
if self.arr.is_null(row_id as usize) {
|
||||
dst.push(None)
|
||||
} else {
|
||||
dst.push(Some(self.arr.value(row_id as usize)))
|
||||
}
|
||||
}
|
||||
assert_eq!(dst.len(), row_ids.len());
|
||||
dst
|
||||
}
|
||||
|
||||
/// Returns the logical values for all the rows in the column.
|
||||
///
|
||||
/// NULL values are represented by None.
|
||||
pub fn all_values(&self, mut dst: Vec<Option<bool>>) -> Vec<Option<bool>> {
|
||||
dst.clear();
|
||||
dst.reserve(self.arr.len());
|
||||
|
||||
for i in 0..self.num_rows() as usize {
|
||||
if self.arr.is_null(i) {
|
||||
dst.push(None)
|
||||
} else {
|
||||
dst.push(Some(self.arr.value(i)))
|
||||
}
|
||||
}
|
||||
assert_eq!(dst.len(), self.num_rows() as usize);
|
||||
dst
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// ---- Methods for aggregation.
|
||||
//
|
||||
//
|
||||
|
||||
/// Returns the count of the non-null values for the provided
|
||||
/// row IDs.
|
||||
pub fn count(&self, row_ids: &[u32]) -> u32 {
|
||||
if self.arr.null_count() == 0 {
|
||||
return row_ids.len() as u32;
|
||||
}
|
||||
|
||||
let mut count = 0;
|
||||
for &i in row_ids {
|
||||
if self.arr.is_null(i as usize) {
|
||||
continue;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Returns the first logical (decoded) value from the provided
|
||||
/// row IDs.
|
||||
pub fn first(&self, row_ids: &[u32]) -> Option<bool> {
|
||||
self.value(row_ids[0])
|
||||
}
|
||||
|
||||
/// Returns the last logical (decoded) value from the provided
|
||||
/// row IDs.
|
||||
pub fn last(&self, row_ids: &[u32]) -> Option<bool> {
|
||||
self.value(row_ids[row_ids.len() - 1])
|
||||
}
|
||||
|
||||
/// Returns the minimum logical (decoded) non-null value from the provided
|
||||
/// row IDs.
|
||||
pub fn min(&self, row_ids: &[u32]) -> Option<bool> {
|
||||
let mut min: Option<bool> = self.value(row_ids[0]);
|
||||
for &v in row_ids.iter().skip(1) {
|
||||
if self.arr.is_null(v as usize) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if self.value(v) < min {
|
||||
min = self.value(v);
|
||||
}
|
||||
}
|
||||
min
|
||||
}
|
||||
|
||||
/// Returns the maximum logical (decoded) non-null value from the provided
|
||||
/// row IDs.
|
||||
pub fn max(&self, row_ids: &[u32]) -> Option<bool> {
|
||||
let mut max: Option<bool> = self.value(row_ids[0]);
|
||||
for &v in row_ids.iter().skip(1) {
|
||||
if self.arr.is_null(v as usize) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if self.value(v) > max {
|
||||
max = self.value(v);
|
||||
}
|
||||
}
|
||||
max
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// ---- Methods for filtering via operators.
|
||||
//
|
||||
//
|
||||
|
||||
/// Returns the set of row ids that satisfy a binary operator on a logical
|
||||
/// value.
|
||||
///
|
||||
/// Essentially, this supports `value {=, !=, >, >=, <, <=} x`.
|
||||
///
|
||||
/// The equivalent of `IS NULL` is not currently supported via this method.
|
||||
pub fn row_ids_filter(&self, value: bool, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
|
||||
match op {
|
||||
cmp::Operator::GT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
|
||||
cmp::Operator::GTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
|
||||
cmp::Operator::LT => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
|
||||
cmp::Operator::LTE => self.row_ids_cmp_order(value, Self::ord_from_op(&op), dst),
|
||||
_ => self.row_ids_equal(value, op, dst),
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to convert comparison operators to cmp orderings.
|
||||
fn ord_from_op(op: &cmp::Operator) -> (Ordering, Ordering) {
|
||||
match op {
|
||||
cmp::Operator::GT => (Ordering::Greater, Ordering::Greater),
|
||||
cmp::Operator::GTE => (Ordering::Greater, Ordering::Equal),
|
||||
cmp::Operator::LT => (Ordering::Less, Ordering::Less),
|
||||
cmp::Operator::LTE => (Ordering::Less, Ordering::Equal),
|
||||
_ => panic!("cannot convert operator to ordering"),
|
||||
}
|
||||
}
|
||||
|
||||
// Handles finding all rows that match the provided operator on `value`.
|
||||
// For performance reasons ranges of matching values are collected up and
|
||||
// added in bulk to the bitmap.
|
||||
fn row_ids_equal(&self, value: bool, op: &cmp::Operator, mut dst: RowIDs) -> RowIDs {
|
||||
dst.clear();
|
||||
|
||||
let desired;
|
||||
if let cmp::Operator::Equal = op {
|
||||
desired = true; // == operator
|
||||
} else {
|
||||
desired = false; // != operator
|
||||
}
|
||||
|
||||
let mut found = false;
|
||||
let mut count = 0;
|
||||
for i in 0..self.num_rows() as usize {
|
||||
let mut cmp_result: bool;
|
||||
let cmp_result = self.arr.value(i) == value;
|
||||
|
||||
if (self.arr.is_null(i) || cmp_result != desired) && found {
|
||||
let (min, max) = (i as u32 - count, i as u32);
|
||||
dst.add_range(min, max);
|
||||
found = false;
|
||||
count = 0;
|
||||
continue;
|
||||
} else if self.arr.is_null(i) || cmp_result != desired {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !found {
|
||||
found = true;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
|
||||
// add any remaining range.
|
||||
if found {
|
||||
let (min, max) = (self.num_rows() - count, self.num_rows());
|
||||
dst.add_range(min, max);
|
||||
}
|
||||
dst
|
||||
}
|
||||
|
||||
// Handles finding all rows that match the provided operator on `value`.
|
||||
// For performance reasons ranges of matching values are collected up and
|
||||
// added in bulk to the bitmap.
|
||||
//
|
||||
// `op` is a tuple of comparisons where at least one of them must be
|
||||
// satisfied to satisfy the overall operator.
|
||||
fn row_ids_cmp_order(
|
||||
&self,
|
||||
value: bool,
|
||||
op: (std::cmp::Ordering, std::cmp::Ordering),
|
||||
mut dst: RowIDs,
|
||||
) -> RowIDs {
|
||||
dst.clear();
|
||||
|
||||
let mut found = false;
|
||||
let mut count = 0;
|
||||
for i in 0..self.num_rows() as usize {
|
||||
let cmp_result = self.arr.value(i).partial_cmp(&value);
|
||||
|
||||
if (self.arr.is_null(i) || (cmp_result != Some(op.0) && cmp_result != Some(op.1)))
|
||||
&& found
|
||||
{
|
||||
let (min, max) = (i as u32 - count, i as u32);
|
||||
dst.add_range(min, max);
|
||||
found = false;
|
||||
count = 0;
|
||||
continue;
|
||||
} else if self.arr.is_null(i) || (cmp_result != Some(op.0) && cmp_result != Some(op.1))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if !found {
|
||||
found = true;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
|
||||
// add any remaining range.
|
||||
if found {
|
||||
let (min, max) = (self.num_rows() - count, self.num_rows());
|
||||
dst.add_range(min, max);
|
||||
}
|
||||
dst
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&[bool]> for Bool {
|
||||
fn from(v: &[bool]) -> Self {
|
||||
Self {
|
||||
arr: BooleanArray::from(v.to_vec()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&[Option<bool>]> for Bool {
|
||||
fn from(v: &[Option<bool>]) -> Self {
|
||||
Self {
|
||||
arr: BooleanArray::from(v.to_vec()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BooleanArray> for Bool {
|
||||
fn from(arr: BooleanArray) -> Self {
|
||||
Self { arr }
|
||||
}
|
||||
}
|
||||
|
||||
// Unit tests for the `Bool` encoding.
#[cfg(test)]
mod test {
    use super::cmp::Operator;
    use super::*;

    // Wraps every element of `v` in `Some`.
    fn some_vec<T: Copy>(v: Vec<T>) -> Vec<Option<T>> {
        v.into_iter().map(Some).collect()
    }

    #[test]
    fn num_rows_and_nulls() {
        let v = Bool::from(&[Some(true), None, Some(false)][..]);
        assert_eq!(v.num_rows(), 3);
        assert!(!v.is_empty());
        assert!(v.contains_null());

        let v = Bool::from(&[true, false][..]);
        assert_eq!(v.num_rows(), 2);
        assert!(!v.is_empty());
        assert!(!v.contains_null());
    }

    #[test]
    fn first_row_id_eq_value() {
        let v = Bool::from(vec![true, true].as_slice());

        assert_eq!(v.first_row_id_eq_value(true), Some(0));
        assert_eq!(v.first_row_id_eq_value(false), None);

        // NULL rows are skipped when searching.
        let v = Bool::from(&[None, Some(false), Some(true)][..]);
        assert_eq!(v.first_row_id_eq_value(true), Some(2));
        assert_eq!(v.first_row_id_eq_value(false), Some(1));
    }

    #[test]
    fn value() {
        let v = Bool::from(vec![Some(false), Some(true), Some(false), None].as_slice());
        assert_eq!(v.value(1), Some(true));
        assert_eq!(v.value(3), None);
    }

    #[test]
    fn values() {
        let v = Bool::from(&[Some(true), None, Some(false)][..]);
        assert_eq!(
            v.values(&[0, 1, 2], vec![]),
            vec![Some(true), None, Some(false)]
        );
        assert_eq!(v.values(&[2, 0], vec![]), vec![Some(false), Some(true)]);

        // A non-empty destination buffer is cleared before being refilled.
        assert_eq!(v.values(&[1], vec![Some(true); 4]), vec![None]);
    }

    #[test]
    fn all_values() {
        let v = Bool::from(&[true, false, true][..]);
        assert_eq!(v.all_values(vec![]), some_vec(vec![true, false, true]));

        // A non-empty destination buffer is cleared before being refilled.
        let v = Bool::from(&[Some(false), None][..]);
        assert_eq!(v.all_values(vec![Some(true)]), vec![Some(false), None]);
    }

    #[test]
    fn count() {
        let v = Bool::from(&[Some(true), None, Some(true)][..]);
        assert_eq!(v.count(&[0, 1, 2]), 2);
        assert_eq!(v.count(&[0, 2]), 2);
        assert_eq!(v.count(&[0, 1]), 1);
        assert_eq!(v.count(&[1]), 0);
    }

    #[test]
    fn first() {
        let v = Bool::from(&[false, true, true][..]);
        assert_eq!(v.first(&[0, 1, 2]), Some(false));
        assert_eq!(v.first(&[1, 2]), Some(true));
    }

    #[test]
    fn last() {
        let v = Bool::from(&[false, true, false][..]);
        assert_eq!(v.last(&[0, 1, 2]), Some(false));
        assert_eq!(v.last(&[1, 2]), Some(false));
        assert_eq!(v.last(&[0, 2]), Some(false));
    }

    #[test]
    fn min() {
        let v = Bool::from(&[Some(true), Some(true), Some(false), None][..]);
        assert_eq!(v.min(&[0, 1, 2, 3]), Some(false));
        assert_eq!(v.min(&[1, 2]), Some(false));
        assert_eq!(v.min(&[0, 1]), Some(true));
        assert_eq!(v.min(&[0, 3]), Some(true));
        assert_eq!(v.min(&[3]), None);
    }

    #[test]
    fn max() {
        let v = Bool::from(&[Some(true), Some(true), Some(false), None][..]);
        assert_eq!(v.max(&[0, 1, 2, 3]), Some(true));
        assert_eq!(v.max(&[1, 2]), Some(true));
        assert_eq!(v.max(&[0, 1]), Some(true));
        assert_eq!(v.max(&[0, 3]), Some(true));
        assert_eq!(v.max(&[3]), None);

        // A NULL in the leading position must not hide later values.
        assert_eq!(v.max(&[3, 0]), Some(true));
        assert_eq!(v.max(&[3, 2]), Some(false));
    }

    #[test]
    fn row_ids_filter() {
        let v = Bool::from(&[Some(true), Some(false), None, None, Some(true)][..]);

        // EQ
        let row_ids = v.row_ids_filter(true, &Operator::Equal, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 4]);

        let row_ids = v.row_ids_filter(false, &Operator::Equal, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![1]);

        // NEQ
        let row_ids = v.row_ids_filter(true, &Operator::NotEqual, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![1]);

        let row_ids = v.row_ids_filter(false, &Operator::NotEqual, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 4]);

        // GT
        let row_ids = v.row_ids_filter(true, &Operator::GT, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), Vec::<u32>::new());

        let row_ids = v.row_ids_filter(false, &Operator::GT, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 4]);

        // GTE
        let row_ids = v.row_ids_filter(true, &Operator::GTE, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 4]);

        let row_ids = v.row_ids_filter(false, &Operator::GTE, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 1, 4]);

        // LT
        let row_ids = v.row_ids_filter(true, &Operator::LT, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![1]);

        let row_ids = v.row_ids_filter(false, &Operator::LT, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), Vec::<u32>::new());

        // LTE
        let row_ids = v.row_ids_filter(true, &Operator::LTE, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![0, 1, 4]);

        let row_ids = v.row_ids_filter(false, &Operator::LTE, RowIDs::new_vector());
        assert_eq!(row_ids.to_vec(), vec![1]);
    }
}
|
Loading…
Reference in New Issue