refactor: wire up distinct_values with iterator
parent
7d0248cc94
commit
fcc978bb75
|
@ -13,7 +13,7 @@ use either::Either;
|
||||||
use arrow_deps::{arrow, arrow::array::Array};
|
use arrow_deps::{arrow, arrow::array::Array};
|
||||||
|
|
||||||
use crate::schema::LogicalDataType;
|
use crate::schema::LogicalDataType;
|
||||||
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, ValueSet, Values};
|
use crate::value::{EncodedValues, OwnedValue, Scalar, Value, Values};
|
||||||
use boolean::BooleanEncoding;
|
use boolean::BooleanEncoding;
|
||||||
use encoding::{bool, dictionary, fixed_null};
|
use encoding::{bool, dictionary, fixed_null};
|
||||||
use float::FloatEncoding;
|
use float::FloatEncoding;
|
||||||
|
@ -240,18 +240,24 @@ impl Column {
|
||||||
}
|
}
|
||||||
|
|
||||||
// The distinct set of values found at the logical row ids.
|
// The distinct set of values found at the logical row ids.
|
||||||
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
pub fn distinct_values(&self, row_ids: impl Iterator<Item = u32>) -> BTreeSet<Option<&'_ str>> {
|
||||||
assert!(
|
|
||||||
row_ids.len() as u32 <= self.num_rows(),
|
|
||||||
"too many row ids {:?} provided for column with {:?} rows",
|
|
||||||
row_ids.len(),
|
|
||||||
self.num_rows()
|
|
||||||
);
|
|
||||||
|
|
||||||
match &self {
|
match &self {
|
||||||
Column::String(_, data) => data.distinct_values(row_ids),
|
Column::String(_, data) => data.distinct_values(row_ids),
|
||||||
Column::ByteArray(_, _) => todo!(),
|
Column::Float(_, _) => {
|
||||||
_ => unimplemented!("distinct values is not implemented for this type"),
|
unimplemented!("distinct values is not implemented for Float column")
|
||||||
|
}
|
||||||
|
Column::Integer(_, _) => {
|
||||||
|
unimplemented!("distinct values is not implemented for Integer column")
|
||||||
|
}
|
||||||
|
Column::Unsigned(_, _) => {
|
||||||
|
unimplemented!("distinct values is not implemented for Unsigned column")
|
||||||
|
}
|
||||||
|
Column::Bool(_, _) => {
|
||||||
|
unimplemented!("distinct values is not implemented for Bool column")
|
||||||
|
}
|
||||||
|
Column::ByteArray(_, _) => {
|
||||||
|
unimplemented!("distinct values is not implemented for ByteArray column")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -679,7 +685,7 @@ impl Column {
|
||||||
|
|
||||||
/// Determines if the column contains other values than those provided in
|
/// Determines if the column contains other values than those provided in
|
||||||
/// `values`.
|
/// `values`.
|
||||||
pub fn contains_other_values(&self, values: &BTreeSet<Option<&String>>) -> bool {
|
pub fn has_other_values(&self, values: &BTreeSet<String>) -> bool {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1107,10 +1113,11 @@ pub enum RowIDsOption {
|
||||||
impl RowIDsOption {
|
impl RowIDsOption {
|
||||||
/// Returns the `Some` variant or panics.
|
/// Returns the `Some` variant or panics.
|
||||||
pub fn unwrap(&self) -> &RowIDs {
|
pub fn unwrap(&self) -> &RowIDs {
|
||||||
if let Self::Some(ids) = self {
|
match &self {
|
||||||
return ids;
|
RowIDsOption::None(_) => panic!("cannot unwrap RowIDsOption to RowIDs"),
|
||||||
|
RowIDsOption::Some(ids) => ids,
|
||||||
|
RowIDsOption::All(ids) => ids,
|
||||||
}
|
}
|
||||||
panic!("cannot unwrap RowIDsOption to RowIDs");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1159,6 +1166,15 @@ impl RowIDs {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator over the contents of the RowIDs.
|
||||||
|
pub fn iter(&self) -> RowIDsIterator<'_> {
|
||||||
|
match self {
|
||||||
|
RowIDs::Bitmap(bm) => RowIDsIterator::new(bm.iter()),
|
||||||
|
// we want an iterator of u32 rather than &u32.
|
||||||
|
RowIDs::Vector(vec) => RowIDsIterator::new(vec.iter().cloned()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Converts the RowIDs to a Vec<u32>. This is expensive and should only be
|
// Converts the RowIDs to a Vec<u32>. This is expensive and should only be
|
||||||
// used for testing.
|
// used for testing.
|
||||||
pub fn to_vec(&self) -> Vec<u32> {
|
pub fn to_vec(&self) -> Vec<u32> {
|
||||||
|
@ -1241,6 +1257,24 @@ impl RowIDs {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// An iterator of `u32` row ids.
///
/// The wrapped iterator is boxed so that a single concrete type can be
/// returned whichever underlying iterator produced the row ids.
pub struct RowIDsIterator<'a> {
    itr: Box<dyn Iterator<Item = u32> + 'a>,
}

impl<'a> RowIDsIterator<'a> {
    /// Wraps any iterator of `u32` row ids that lives at least as long as `'a`.
    fn new(itr: impl Iterator<Item = u32> + 'a) -> Self {
        Self { itr: Box::new(itr) }
    }
}

impl Iterator for RowIDsIterator<'_> {
    type Item = u32;

    fn next(&mut self) -> Option<Self::Item> {
        self.itr.next()
    }

    // Forward the wrapped iterator's bounds; the default implementation would
    // report `(0, None)` and prevent consumers from pre-sizing collections.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.itr.size_hint()
    }
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -1484,15 +1518,21 @@ mod test {
|
||||||
Some("world"),
|
Some("world"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let hello = "hello".to_string();
|
|
||||||
let world = "world".to_string();
|
|
||||||
let mut exp = BTreeSet::new();
|
let mut exp = BTreeSet::new();
|
||||||
exp.insert(Some(&hello));
|
exp.insert(Some("hello"));
|
||||||
exp.insert(Some(&world));
|
exp.insert(Some("world"));
|
||||||
exp.insert(None);
|
exp.insert(None);
|
||||||
|
|
||||||
let col = Column::from(&input[..]);
|
let col = Column::from(&input[..]);
|
||||||
assert_eq!(col.distinct_values(&[0, 1, 2, 3, 4]), ValueSet::String(exp));
|
assert_eq!(col.distinct_values(vec![0, 1, 2, 3, 4].into_iter()), exp);
|
||||||
|
assert_eq!(
|
||||||
|
col.distinct_values(RowIDs::Vector(vec![0, 1, 2, 3, 4]).iter()),
|
||||||
|
exp
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut bm = Bitmap::create();
|
||||||
|
bm.add_many(&[0, 1, 2, 3, 4]);
|
||||||
|
assert_eq!(col.distinct_values(RowIDs::Bitmap(bm).iter()), exp);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
@ -237,9 +237,9 @@ impl Encoding {
|
||||||
/// increasing set.
|
/// increasing set.
|
||||||
fn distinct_values<'a>(
|
fn distinct_values<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
row_ids: &[u32],
|
row_ids: impl Iterator<Item = u32>,
|
||||||
dst: BTreeSet<Option<&'a String>>,
|
dst: BTreeSet<Option<&'a str>>,
|
||||||
) -> BTreeSet<Option<&'a String>> {
|
) -> BTreeSet<Option<&'a str>> {
|
||||||
match self {
|
match self {
|
||||||
Encoding::RLE(enc) => enc.distinct_values(row_ids, dst),
|
Encoding::RLE(enc) => enc.distinct_values(row_ids, dst),
|
||||||
Encoding::Plain(enc) => enc.distinct_values(row_ids, dst),
|
Encoding::Plain(enc) => enc.distinct_values(row_ids, dst),
|
||||||
|
@ -1046,12 +1046,10 @@ mod test {
|
||||||
|
|
||||||
enc.push_additional(Some("east".to_string()), 3);
|
enc.push_additional(Some("east".to_string()), 3);
|
||||||
|
|
||||||
let values = enc.distinct_values((0..3).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
let values = enc.distinct_values((0..3).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
values,
|
values,
|
||||||
vec![Some(&"east".to_string())]
|
vec![Some("east")].into_iter().collect::<BTreeSet<_>>(),
|
||||||
.into_iter()
|
|
||||||
.collect::<BTreeSet<_>>(),
|
|
||||||
"{}",
|
"{}",
|
||||||
name,
|
name,
|
||||||
);
|
);
|
||||||
|
@ -1061,35 +1059,30 @@ mod test {
|
||||||
enc.push_additional(Some("south".to_string()), 2); // 9, 10
|
enc.push_additional(Some("south".to_string()), 2); // 9, 10
|
||||||
enc.push_none(); // 11
|
enc.push_none(); // 11
|
||||||
|
|
||||||
let values = enc.distinct_values((0..12).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
let values = enc.distinct_values((0..12).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
values,
|
values,
|
||||||
vec![
|
vec![None, Some("east"), Some("north"), Some("south"),]
|
||||||
None,
|
|
||||||
Some(&"east".to_string()),
|
|
||||||
Some(&"north".to_string()),
|
|
||||||
Some(&"south".to_string()),
|
|
||||||
]
|
|
||||||
.into_iter()
|
|
||||||
.collect::<BTreeSet<_>>(),
|
|
||||||
"{}",
|
|
||||||
name,
|
|
||||||
);
|
|
||||||
|
|
||||||
let values = enc.distinct_values((0..4).collect::<Vec<_>>().as_slice(), BTreeSet::new());
|
|
||||||
assert_eq!(
|
|
||||||
values,
|
|
||||||
vec![Some(&"east".to_string()), Some(&"north".to_string()),]
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<BTreeSet<_>>(),
|
.collect::<BTreeSet<_>>(),
|
||||||
"{}",
|
"{}",
|
||||||
name,
|
name,
|
||||||
);
|
);
|
||||||
|
|
||||||
let values = enc.distinct_values(&[3, 10], BTreeSet::new());
|
let values = enc.distinct_values((0..4).collect::<Vec<_>>().into_iter(), BTreeSet::new());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
values,
|
values,
|
||||||
vec![Some(&"north".to_string()), Some(&"south".to_string()),]
|
vec![Some("east"), Some("north"),]
|
||||||
|
.into_iter()
|
||||||
|
.collect::<BTreeSet<_>>(),
|
||||||
|
"{}",
|
||||||
|
name,
|
||||||
|
);
|
||||||
|
|
||||||
|
let values = enc.distinct_values(vec![3, 10].into_iter(), BTreeSet::new());
|
||||||
|
assert_eq!(
|
||||||
|
values,
|
||||||
|
vec![Some("north"), Some("south"),]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<BTreeSet<_>>(),
|
.collect::<BTreeSet<_>>(),
|
||||||
"{}",
|
"{}",
|
||||||
|
|
|
@ -618,19 +618,19 @@ impl Plain {
|
||||||
/// increasing set.
|
/// increasing set.
|
||||||
pub fn distinct_values<'a>(
|
pub fn distinct_values<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
row_ids: &[u32],
|
row_ids: impl Iterator<Item = u32>,
|
||||||
mut dst: BTreeSet<Option<&'a String>>,
|
mut dst: BTreeSet<Option<&'a str>>,
|
||||||
) -> BTreeSet<Option<&'a String>> {
|
) -> BTreeSet<Option<&'a str>> {
|
||||||
// TODO(edd): Perf... We can improve on this if we know the column is
|
// TODO(edd): Perf... We can improve on this if we know the column is
|
||||||
// totally ordered.
|
// totally ordered.
|
||||||
dst.clear();
|
dst.clear();
|
||||||
|
|
||||||
for &row_id in row_ids {
|
for row_id in row_ids {
|
||||||
let encoded_id = self.encoded_data[row_id as usize];
|
let encoded_id = self.encoded_data[row_id as usize];
|
||||||
let value = &self.entries[encoded_id as usize].as_ref();
|
|
||||||
|
|
||||||
if !dst.contains(value) {
|
let value = self.entries[encoded_id as usize].as_deref();
|
||||||
dst.insert(*value);
|
if !dst.contains(&value) {
|
||||||
|
dst.insert(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
if dst.len() as u32 == self.cardinality() {
|
if dst.len() as u32 == self.cardinality() {
|
||||||
|
|
|
@ -664,9 +664,9 @@ impl RLE {
|
||||||
/// increasing set.
|
/// increasing set.
|
||||||
pub fn distinct_values<'a>(
|
pub fn distinct_values<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
row_ids: &[u32],
|
row_ids: impl Iterator<Item = u32>,
|
||||||
mut dst: BTreeSet<Option<&'a String>>,
|
mut dst: BTreeSet<Option<&'a str>>,
|
||||||
) -> BTreeSet<Option<&'a String>> {
|
) -> BTreeSet<Option<&'a str>> {
|
||||||
// TODO(edd): Perf... We can improve on this if we know the column is
|
// TODO(edd): Perf... We can improve on this if we know the column is
|
||||||
// totally ordered.
|
// totally ordered.
|
||||||
dst.clear();
|
dst.clear();
|
||||||
|
@ -689,7 +689,7 @@ impl RLE {
|
||||||
|
|
||||||
let mut i = 1;
|
let mut i = 1;
|
||||||
'by_row: for row_id in row_ids {
|
'by_row: for row_id in row_ids {
|
||||||
while curr_logical_row_id + curr_entry_rl <= *row_id {
|
while curr_logical_row_id + curr_entry_rl <= row_id {
|
||||||
// this encoded entry does not cover the row we need.
|
// this encoded entry does not cover the row we need.
|
||||||
// move on to next entry
|
// move on to next entry
|
||||||
curr_logical_row_id += curr_entry_rl;
|
curr_logical_row_id += curr_entry_rl;
|
||||||
|
|
|
@ -3,8 +3,8 @@ use std::collections::BTreeSet;
|
||||||
use arrow_deps::arrow::{self, array::Array};
|
use arrow_deps::arrow::{self, array::Array};
|
||||||
use either::Either;
|
use either::Either;
|
||||||
|
|
||||||
|
use super::cmp;
|
||||||
use super::encoding::dictionary::{Encoding, Plain, RLE};
|
use super::encoding::dictionary::{Encoding, Plain, RLE};
|
||||||
use super::{cmp, ValueSet};
|
|
||||||
use crate::column::{RowIDs, Value, Values};
|
use crate::column::{RowIDs, Value, Values};
|
||||||
|
|
||||||
// Edd's totally made up magic constant. This determines whether we would use
|
// Edd's totally made up magic constant. This determines whether we would use
|
||||||
|
@ -142,10 +142,10 @@ impl StringEncoding {
|
||||||
/// Returns the distinct set of values found at the provided row ids.
|
/// Returns the distinct set of values found at the provided row ids.
|
||||||
///
|
///
|
||||||
/// TODO(edd): perf - pooling of destination sets.
|
/// TODO(edd): perf - pooling of destination sets.
|
||||||
pub fn distinct_values(&self, row_ids: &[u32]) -> ValueSet<'_> {
|
pub fn distinct_values(&self, row_ids: impl Iterator<Item = u32>) -> BTreeSet<Option<&'_ str>> {
|
||||||
match &self {
|
match &self {
|
||||||
Self::RLEDictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
Self::RLEDictionary(c) => c.distinct_values(row_ids, BTreeSet::new()),
|
||||||
Self::Dictionary(c) => ValueSet::String(c.distinct_values(row_ids, BTreeSet::new())),
|
Self::Dictionary(c) => c.distinct_values(row_ids, BTreeSet::new()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -992,6 +992,62 @@ impl RowGroup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the distinct set of values for the selected columns, constrained
|
||||||
|
/// by an optional predicate.
|
||||||
|
pub fn column_values<'a>(
|
||||||
|
&'a self,
|
||||||
|
predicate: &Predicate,
|
||||||
|
columns: &[ColumnName<'_>],
|
||||||
|
mut dst: BTreeMap<String, BTreeSet<String>>,
|
||||||
|
) -> BTreeMap<String, BTreeSet<String>> {
|
||||||
|
// Build up candidate columns
|
||||||
|
let candidate_columns = self
|
||||||
|
.all_columns_by_name
|
||||||
|
.iter()
|
||||||
|
// Filter any columns that are not present in the `Selection`.
|
||||||
|
.filter_map(|(name, &id)| {
|
||||||
|
if columns.iter().any(|selection| name == selection) {
|
||||||
|
Some((name, &self.columns[id]))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
// Further filter candidate columns by removing any columns that we
|
||||||
|
// can prove we already have all the distinct values for.
|
||||||
|
.filter(|(name, column)| {
|
||||||
|
match dst.get(*name) {
|
||||||
|
// process the column if we haven't got all the distinct
|
||||||
|
// values.
|
||||||
|
Some(values) => column.has_other_values(values),
|
||||||
|
// no existing values for this column - we will need to
|
||||||
|
// process it.
|
||||||
|
None => true,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let row_ids = self.row_ids_from_predicate(predicate);
|
||||||
|
for (name, column) in candidate_columns {
|
||||||
|
// If no rows match there is nothing to do, if some rows match then
|
||||||
|
// extract an iterator of those IDs. If all rows match then create
|
||||||
|
// an iterator of all rows without materialising them.
|
||||||
|
let row_itr: Box<dyn Iterator<Item = u32>> = match &row_ids {
|
||||||
|
RowIDsOption::None(_) => return dst,
|
||||||
|
RowIDsOption::Some(row_ids) => Box::new(row_ids.iter()),
|
||||||
|
RowIDsOption::All(_) => Box::new(0..self.rows()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let results = dst.entry(name.clone()).or_default();
|
||||||
|
for value in column.distinct_values(row_itr).iter() {
|
||||||
|
if value.is_some() && !results.contains(value.unwrap()) {
|
||||||
|
results.insert(value.unwrap().to_owned());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dst
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialise a `RowGroup` from an Arrow RecordBatch.
|
/// Initialise a `RowGroup` from an Arrow RecordBatch.
|
||||||
|
@ -2946,4 +3002,66 @@ west,host-d,11,9
|
||||||
vec!["temp".to_owned()],
|
vec!["temp".to_owned()],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test helper: converts `(column name, values)` pairs into the owned
/// map-of-sets shape returned by `column_values`.
///
/// Duplicate values for a column collapse into a single set entry.
fn to_map(arr: Vec<(&str, &[&str])>) -> BTreeMap<String, BTreeSet<String>> {
    arr.into_iter()
        .map(|(name, values)| {
            let distinct: BTreeSet<String> = values.iter().map(|&v| v.to_owned()).collect();
            (name.to_owned(), distinct)
        })
        .collect()
}
|
||||||
|
|
||||||
|
#[test]
fn column_values() {
    // Row group under test: three rows made of a time column and two tag
    // columns.
    let mut columns = BTreeMap::new();
    columns.insert(
        "time".to_string(),
        ColumnType::Time(Column::from(&[1_i64, 2, 3][..])),
    );
    columns.insert(
        "region".to_string(),
        ColumnType::Tag(Column::from(&["west", "south", "north"][..])),
    );
    columns.insert(
        "env".to_string(),
        ColumnType::Tag(Column::from(&["prod", "stag", "stag"][..])),
    );
    let row_group = RowGroup::new(3, columns);

    // Default predicate, single column selected.
    let all_rows = Predicate::default();
    assert_eq!(
        row_group.column_values(&all_rows, &["region"], BTreeMap::new()),
        to_map(vec![("region", &["north", "west", "south"])])
    );

    // Default predicate, two columns selected.
    assert_eq!(
        row_group.column_values(&all_rows, &["env", "region"], BTreeMap::new()),
        to_map(vec![
            ("env", &["prod", "stag"]),
            ("region", &["north", "west", "south"])
        ])
    );

    // A time predicate that excludes the first row.
    let after_first = Predicate::new(vec![BinaryExpr::from(("time", ">", 1_i64))]);
    assert_eq!(
        row_group.column_values(&after_first, &["env", "region"], BTreeMap::new()),
        to_map(vec![("env", &["stag"]), ("region", &["north", "south"])])
    );

    // A time predicate beyond every stored timestamp: nothing is returned.
    let after_all = Predicate::new(vec![BinaryExpr::from(("time", ">", 4_i64))]);
    assert_eq!(
        row_group.column_values(&after_all, &["env", "region"], BTreeMap::new()),
        to_map(vec![])
    );
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue