refactor: wire up distinct_values with iterator
parent
7d0248cc94
commit
fcc978bb75
|
@ -13,7 +13,7 @@ use either::Either;
|
|||
use arrow_deps::{arrow, arrow::array::Array};
|
||||
|
||||
use crate::schema::LogicalDataType;
|
||||
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, ValueSet, Values};
|
||||
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, Values};
|
||||
use boolean::BooleanEncoding;
|
||||
use encoding::{bool, dictionary, fixed_null};
|
||||
use float::FloatEncoding;
|
||||
|
@ -240,18 +240,24 @@ impl Column {
|
|||
}
|
||||
|
||||
// The distinct set of values found at the logical row ids.
|
||||
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
||||
assert!(
|
||||
row_ids.len() as u32 <= self.num_rows(),
|
||||
"too many row ids {:?} provided for column with {:?} rows",
|
||||
row_ids.len(),
|
||||
self.num_rows()
|
||||
);
|
||||
|
||||
pub fn distinct_values(&self, row_ids: impl Iterator<Item = u32>) -> BTreeSet<Option<&'_ str>> {
|
||||
match &self {
|
||||
Column::String(_, data) => data.distinct_values(row_ids),
|
||||
Column::ByteArray(_, _) => todo!(),
|
||||
_ => unimplemented!("distinct values is not implemented for this type"),
|
||||
Column::Float(_, _) => {
|
||||
unimplemented!("distinct values is not implemented for Float column")
|
||||
}
|
||||
Column::Integer(_, _) => {
|
||||
unimplemented!("distinct values is not implemented for Integer column")
|
||||
}
|
||||
Column::Unsigned(_, _) => {
|
||||
unimplemented!("distinct values is not implemented for Unsigned column")
|
||||
}
|
||||
Column::Bool(_, _) => {
|
||||
unimplemented!("distinct values is not implemented for Bool column")
|
||||
}
|
||||
Column::ByteArray(_, _) => {
|
||||
unimplemented!("distinct values is not implemented for ByteArray column")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -679,7 +685,7 @@ impl Column {
|
|||
|
||||
/// Determines if the column contains other values than those provided in
|
||||
/// `values`.
|
||||
pub fn contains_other_values(&self, values: &BTreeSet<Option<&String>>) -> bool {
|
||||
pub fn has_other_values(&self, values: &BTreeSet<String>) -> bool {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
@ -1107,10 +1113,11 @@ pub enum RowIDsOption {
|
|||
impl RowIDsOption {
|
||||
/// Returns the `Some` variant or panics.
|
||||
pub fn unwrap(&self) -> &RowIDs {
|
||||
if let Self::Some(ids) = self {
|
||||
return ids;
|
||||
match &self {
|
||||
RowIDsOption::None(_) => panic!("cannot unwrap RowIDsOption to RowIDs"),
|
||||
RowIDsOption::Some(ids) => ids,
|
||||
RowIDsOption::All(ids) => ids,
|
||||
}
|
||||
panic!("cannot unwrap RowIDsOption to RowIDs");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1159,6 +1166,15 @@ impl RowIDs {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over the contents of the RowIDs.
|
||||
pub fn iter(&self) -> RowIDsIterator<'_> {
|
||||
match self {
|
||||
RowIDs::Bitmap(bm) => RowIDsIterator::new(bm.iter()),
|
||||
// we want an iterator of u32 rather than &u32.
|
||||
RowIDs::Vector(vec) => RowIDsIterator::new(vec.iter().cloned()),
|
||||
}
|
||||
}
|
||||
|
||||
// Converts the RowIDs to a Vec<u32>. This is expensive and should only be
|
||||
// used for testing.
|
||||
pub fn to_vec(&self) -> Vec<u32> {
|
||||
|
@ -1241,6 +1257,24 @@ impl RowIDs {
|
|||
}
|
||||
}
|
||||
|
||||
/// An iterator over row ids that hides the concrete type of the iterator it
/// wraps behind a boxed trait object.
pub struct RowIDsIterator<'a> {
    // The wrapped source of row ids; may borrow data for `'a`.
    itr: Box<dyn Iterator<Item = u32> + 'a>,
}

impl<'a> RowIDsIterator<'a> {
    /// Wraps `itr`, boxing it so that differently typed iterators can be
    /// returned as the same `RowIDsIterator` type.
    fn new(itr: impl Iterator<Item = u32> + 'a) -> Self {
        Self { itr: Box::new(itr) }
    }
}

impl Iterator for RowIDsIterator<'_> {
    type Item = u32;

    /// Yields the next row id from the wrapped iterator.
    fn next(&mut self) -> Option<Self::Item> {
        self.itr.next()
    }

    /// Reports the wrapped iterator's bounds.
    ///
    /// Without this override the default `(0, None)` would be returned,
    /// which stops `collect`/`extend` from reserving capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.itr.size_hint()
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
@ -1484,15 +1518,21 @@ mod test {
|
|||
Some("world"),
|
||||
];
|
||||
|
||||
let hello = "hello".to_string();
|
||||
let world = "world".to_string();
|
||||
let mut exp = BTreeSet::new();
|
||||
exp.insert(Some(&hello));
|
||||
exp.insert(Some(&world));
|
||||
exp.insert(Some("hello"));
|
||||
exp.insert(Some("world"));
|
||||
exp.insert(None);
|
||||
|
||||
let col = Column::from(&input[..]);
|
||||
assert_eq!(col.distinct_values(&[0, 1, 2, 3, 4]), ValueSet::String(exp));
|
||||
assert_eq!(col.distinct_values(vec![0, 1, 2, 3, 4].into_iter()), exp);
|
||||
assert_eq!(
|
||||
col.distinct_values(RowIDs::Vector(vec![0, 1, 2, 3, 4]).iter()),
|
||||
exp
|
||||
);
|
||||
|
||||
let mut bm = Bitmap::create();
|
||||
bm.add_many(&[0, 1, 2, 3, 4]);
|
||||
assert_eq!(col.distinct_values(RowIDs::Bitmap(bm).iter()), exp);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -237,9 +237,9 @@ impl Encoding {
|
|||
/// increasing set.
|
||||
fn distinct_values<'a>(
|
||||
&'a self,
|
||||
row_ids: &[u32],
|
||||
dst: BTreeSet<Option<&'a String>>,
|
||||
) -> BTreeSet<Option<&'a String>> {
|
||||
row_ids: impl Iterator<Item = u32>,
|
||||
dst: BTreeSet<Option<&'a str>>,
|
||||
) -> BTreeSet<Option<&'a str>> {
|
||||
match self {
|
||||
Encoding::RLE(enc) => enc.distinct_values(row_ids, dst),
|
||||
Encoding::Plain(enc) => enc.distinct_values(row_ids, dst),
|
||||
|
@ -1046,12 +1046,10 @@ mod test {
|
|||
|
||||
enc.push_additional(Some("east".to_string()), 3);
|
||||
|
||||
let values = enc.distinct_values((0..3).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
||||
let values = enc.distinct_values((0..3).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||
assert_eq!(
|
||||
values,
|
||||
vec![Some(&"east".to_string())]
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
vec![Some("east")].into_iter().collect::<BTreeSet<_>>(),
|
||||
"{}",
|
||||
name,
|
||||
);
|
||||
|
@ -1061,35 +1059,30 @@ mod test {
|
|||
enc.push_additional(Some("south".to_string()), 2); // 9, 10
|
||||
enc.push_none(); // 11
|
||||
|
||||
let values = enc.distinct_values((0..12).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
||||
let values = enc.distinct_values((0..12).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||
assert_eq!(
|
||||
values,
|
||||
vec![
|
||||
None,
|
||||
Some(&"east".to_string()),
|
||||
Some(&"north".to_string()),
|
||||
Some(&"south".to_string()),
|
||||
]
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"{}",
|
||||
name,
|
||||
);
|
||||
|
||||
let values = enc.distinct_values((0..4).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
||||
assert_eq!(
|
||||
values,
|
||||
vec![Some(&"east".to_string()), Some(&"north".to_string()),]
|
||||
vec![None, Some("east"), Some("north"), Some("south"),]
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"{}",
|
||||
name,
|
||||
);
|
||||
|
||||
let values = enc.distinct_values(&[3, 10], BTreeSet::new());
|
||||
let values = enc.distinct_values((0..4).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||
assert_eq!(
|
||||
values,
|
||||
vec![Some(&"north".to_string()), Some(&"south".to_string()),]
|
||||
vec![Some("east"), Some("north"),]
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"{}",
|
||||
name,
|
||||
);
|
||||
|
||||
let values = enc.distinct_values(vec![3, 10].into_iter(), BTreeSet::new());
|
||||
assert_eq!(
|
||||
values,
|
||||
vec![Some("north"), Some("south"),]
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"{}",
|
||||
|
|
|
@ -618,19 +618,19 @@ impl Plain {
|
|||
/// increasing set.
|
||||
pub fn distinct_values<'a>(
|
||||
&'a self,
|
||||
row_ids: &[u32],
|
||||
mut dst: BTreeSet<Option<&'a String>>,
|
||||
) -> BTreeSet<Option<&'a String>> {
|
||||
row_ids: impl Iterator<Item = u32>,
|
||||
mut dst: BTreeSet<Option<&'a str>>,
|
||||
) -> BTreeSet<Option<&'a str>> {
|
||||
// TODO(edd): Perf... We can improve on this if we know the column is
|
||||
// totally ordered.
|
||||
dst.clear();
|
||||
|
||||
for &row_id in row_ids {
|
||||
for row_id in row_ids {
|
||||
let encoded_id = self.encoded_data[row_id as usize];
|
||||
let value = &self.entries[encoded_id as usize].as_ref();
|
||||
|
||||
if !dst.contains(value) {
|
||||
dst.insert(*value);
|
||||
let value = self.entries[encoded_id as usize].as_deref();
|
||||
if !dst.contains(&value) {
|
||||
dst.insert(value);
|
||||
}
|
||||
|
||||
if dst.len() as u32 == self.cardinality() {
|
||||
|
|
|
@ -664,9 +664,9 @@ impl RLE {
|
|||
/// increasing set.
|
||||
pub fn distinct_values<'a>(
|
||||
&'a self,
|
||||
row_ids: &[u32],
|
||||
mut dst: BTreeSet<Option<&'a String>>,
|
||||
) -> BTreeSet<Option<&'a String>> {
|
||||
row_ids: impl Iterator<Item = u32>,
|
||||
mut dst: BTreeSet<Option<&'a str>>,
|
||||
) -> BTreeSet<Option<&'a str>> {
|
||||
// TODO(edd): Perf... We can improve on this if we know the column is
|
||||
// totally ordered.
|
||||
dst.clear();
|
||||
|
@ -689,7 +689,7 @@ impl RLE {
|
|||
|
||||
let mut i = 1;
|
||||
'by_row: for row_id in row_ids {
|
||||
while curr_logical_row_id + curr_entry_rl <= *row_id {
|
||||
while curr_logical_row_id + curr_entry_rl <= row_id {
|
||||
// this encoded entry does not cover the row we need.
|
||||
// move on to next entry
|
||||
curr_logical_row_id += curr_entry_rl;
|
||||
|
|
|
@ -3,8 +3,8 @@ use std::collections::BTreeSet;
|
|||
use arrow_deps::arrow::{self, array::Array};
|
||||
use either::Either;
|
||||
|
||||
use super::cmp;
|
||||
use super::encoding::dictionary::{Encoding, Plain, RLE};
|
||||
use super::{cmp, ValueSet};
|
||||
use crate::column::{RowIDs, Value, Values};
|
||||
|
||||
// Edd's totally made up magic constant. This determines whether we would use
|
||||
|
@ -142,10 +142,10 @@ impl StringEncoding {
|
|||
/// Returns the distinct set of values found at the provided row ids.
|
||||
///
|
||||
/// TODO(edd): perf - pooling of destination sets.
|
||||
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
||||
pub fn distinct_values(&self, row_ids: impl Iterator<Item = u32>) -> BTreeSet<Option<&'_ str>> {
|
||||
match &self {
|
||||
Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
||||
Self::Dictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
||||
Self::RLEDictionary(c) => c.distinct_values(row_ids, BTreeSet::new()),
|
||||
Self::Dictionary(c) => c.distinct_values(row_ids, BTreeSet::new()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -992,6 +992,62 @@ impl RowGroup {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the distinct set of values for the selected columns, constrained
|
||||
/// by an optional predicate.
|
||||
pub fn column_values<'a>(
|
||||
&'a self,
|
||||
predicate: &Predicate,
|
||||
columns: &[ColumnName<'_>],
|
||||
mut dst: BTreeMap<String, BTreeSet<String>>,
|
||||
) -> BTreeMap<String, BTreeSet<String>> {
|
||||
// Build up candidate columns
|
||||
let candidate_columns = self
|
||||
.all_columns_by_name
|
||||
.iter()
|
||||
// Filter any columns that are not present in the `Selection`.
|
||||
.filter_map(|(name, &id)| {
|
||||
if columns.iter().any(|selection| name == selection) {
|
||||
Some((name, &self.columns[id]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
// Further filter candidate columns by removing any columns that we
|
||||
// can prove we already have all the distinct values for.
|
||||
.filter(|(name, column)| {
|
||||
match dst.get(*name) {
|
||||
// process the column if we haven't got all the distinct
|
||||
// values.
|
||||
Some(values) => column.has_other_values(values),
|
||||
// no existing values for this column - we will need to
|
||||
// process it.
|
||||
None => true,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let row_ids = self.row_ids_from_predicate(predicate);
|
||||
for (name, column) in candidate_columns {
|
||||
// If no rows match there is nothing to do, if some rows match then
|
||||
// extract an iterator of those IDs. If all rows match then create
|
||||
// an iterator of all rows without materialising them.
|
||||
let row_itr: Box<dyn Iterator<Item = u32>> = match &row_ids {
|
||||
RowIDsOption::None(_) => return dst,
|
||||
RowIDsOption::Some(row_ids) => Box::new(row_ids.iter()),
|
||||
RowIDsOption::All(_) => Box::new(0..self.rows()),
|
||||
};
|
||||
|
||||
let results = dst.entry(name.clone()).or_default();
|
||||
for value in column.distinct_values(row_itr).iter() {
|
||||
if value.is_some() && !results.contains(value.unwrap()) {
|
||||
results.insert(value.unwrap().to_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialise a `RowGroup` from an Arrow RecordBatch.
|
||||
|
@ -2946,4 +3002,66 @@ west,host-d,11,9
|
|||
vec!["temp".to_owned()],
|
||||
);
|
||||
}
|
||||
|
||||
// Test helper: turns `(column name, values)` pairs of borrowed strings into
// the owned map-of-sets shape that `column_values` returns.
fn to_map<'a>(arr: Vec<(&str, &[&'a str])>) -> BTreeMap<String, BTreeSet<String>> {
    let mut map = BTreeMap::new();
    for (column, values) in arr {
        let owned: BTreeSet<String> = values.iter().map(|&v| v.to_owned()).collect();
        map.insert(column.to_owned(), owned);
    }
    map
}
|
||||
|
||||
#[test]
fn column_values() {
    // Fixture: a three-row group with a time column and two tag columns.
    let mut columns = BTreeMap::new();
    columns.insert(
        "time".to_string(),
        ColumnType::Time(Column::from(&[1_i64, 2, 3][..])),
    );
    columns.insert(
        "region".to_string(),
        ColumnType::Tag(Column::from(&["west", "south", "north"][..])),
    );
    columns.insert(
        "env".to_string(),
        ColumnType::Tag(Column::from(&["prod", "stag", "stag"][..])),
    );
    let rg = RowGroup::new(3, columns);

    // Default predicate, a single selected column.
    assert_eq!(
        rg.column_values(&Predicate::default(), &["region"], BTreeMap::new()),
        to_map(vec![("region", &["north", "west", "south"])])
    );

    // Default predicate, two selected columns.
    assert_eq!(
        rg.column_values(&Predicate::default(), &["env", "region"], BTreeMap::new()),
        to_map(vec![
            ("env", &["prod", "stag"]),
            ("region", &["north", "west", "south"])
        ])
    );

    // Predicate `time > 1`: only values from the matching rows are expected.
    let some_rows = Predicate::new(vec![BinaryExpr::from(("time", ">", 1_i64))]);
    assert_eq!(
        rg.column_values(&some_rows, &["env", "region"], BTreeMap::new()),
        to_map(vec![("env", &["stag"]), ("region", &["north", "south"])])
    );

    // Predicate `time > 4`: an empty map is expected.
    let no_rows = Predicate::new(vec![BinaryExpr::from(("time", ">", 4_i64))]);
    assert_eq!(
        rg.column_values(&no_rows, &["env", "region"], BTreeMap::new()),
        to_map(vec![])
    );
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue