perf: add ability to get all encoded values

pull/24376/head
Edd Robinson 2020-09-04 12:08:22 +01:00
parent cfa0ef9c23
commit cad5e45208
3 changed files with 89 additions and 20 deletions
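
The change is summed up by the difference between the existing per-row encoded_values path and the new bulk all_encoded_values path. A minimal, self-contained sketch of that difference (using a hypothetical PlainColumn type, not this crate's API):

struct PlainColumn {
    values: Vec<i64>,
}

impl PlainColumn {
    // Existing path: look up each requested row id individually.
    fn encoded_values(&self, row_ids: &[usize]) -> Vec<i64> {
        row_ids.iter().map(|&id| self.values[id]).collect()
    }

    // New path: when every row is wanted, materialise the whole column in
    // one pass instead of paying a per-row lookup.
    fn all_encoded_values(&self) -> Vec<i64> {
        self.values.clone()
    }
}

fn main() {
    let col = PlainColumn { values: vec![10, 20, 30] };
    assert_eq!(col.encoded_values(&[0, 1, 2]), col.all_encoded_values());
}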

@@ -537,6 +537,22 @@ impl Column {
}
}
/// Materialise all of the encoded values.
pub fn all_encoded_values(&self) -> Vector {
match self {
Column::String(c) => {
let now = std::time::Instant::now();
let v = c.all_encoded_values();
log::debug!("time getting all encoded values {:?}", now.elapsed());
log::debug!("dictionary {:?}", c.data.dictionary());
Vector::Integer(v)
}
Column::Float(c) => Vector::Float(c.all_encoded_values()),
Column::Integer(c) => Vector::Integer(c.all_encoded_values()),
}
}
/// Given an encoded value for a row, materialise and return the decoded
/// version.
///
@@ -986,6 +1002,10 @@ impl String {
self.data.encoded_values(row_ids)
}
pub fn all_encoded_values(&self) -> Vec<i64> {
self.data.all_encoded_values()
}
/// Return the decoded value for an encoded ID.
///
/// Panics if there is no decoded value for the provided id
@@ -1037,6 +1057,10 @@ impl Float {
self.data.encoded_values(row_ids)
}
pub fn all_encoded_values(&self) -> Vec<f64> {
self.data.all_encoded_values()
}
pub fn scan_from(&self, row_id: usize) -> &[f64] {
self.data.scan_from(row_id)
}
@@ -1106,6 +1130,10 @@ impl Integer {
self.data.encoded_values(row_ids)
}
pub fn all_encoded_values(&self) -> Vec<i64> {
self.data.all_encoded_values()
}
pub fn scan_from(&self, row_id: usize) -> &[i64] {
self.data.scan_from(row_id)
}

@@ -68,6 +68,12 @@ where
self.values(row_ids)
}
/// Return all encoded values. For this encoding this is simply the decoded
/// values.
pub fn all_encoded_values(&self) -> Vec<T> {
self.values.clone()
}
// TODO(edd): fix this when NULL support is added
pub fn scan_from_until_some(&self, _row_id: usize) -> Option<T> {
unreachable!("to remove");
@@ -485,6 +491,26 @@ impl DictionaryRLE {
out
}
// all_values materialises a vector of references to all logical values in the
// encoding.
pub fn all_values(&mut self) -> Vec<Option<&String>> {
let mut out: Vec<Option<&String>> = Vec::with_capacity(self.total as usize);
// build reverse mapping.
let mut idx_value = BTreeMap::new();
for (k, v) in &self.entry_index {
idx_value.insert(v, k);
}
assert_eq!(idx_value.len(), self.entry_index.len());
for (idx, rl) in &self.run_lengths {
// TODO(edd): fix unwrap - we know that the value exists in map...
let v = idx_value.get(&idx).unwrap().as_ref();
out.extend(iter::repeat(v).take(*rl as usize));
}
out
}
/// Return the decoded value for an encoded ID.
///
/// Panics if there is no decoded value for the provided id
@@ -528,22 +554,13 @@ impl DictionaryRLE {
out
}
// values materialises a vector of references to all logical values in the
// encoding.
pub fn all_values(&mut self) -> Vec<Option<&String>> {
let mut out: Vec<Option<&String>> = Vec::with_capacity(self.total as usize);
// build reverse mapping.
let mut idx_value = BTreeMap::new();
for (k, v) in &self.entry_index {
idx_value.insert(v, k);
}
assert_eq!(idx_value.len(), self.entry_index.len());
// all_encoded_values materialises a vector of all encoded values for the
// column.
pub fn all_encoded_values(&self) -> Vec<i64> {
let mut out: Vec<i64> = Vec::with_capacity(self.total as usize);
for (idx, rl) in &self.run_lengths {
// TODO(edd): fix unwrap - we know that the value exists in map...
let v = idx_value.get(&idx).unwrap().as_ref();
out.extend(iter::repeat(v).take(*rl as usize));
out.extend(iter::repeat(*idx as i64).take(*rl as usize));
}
out
}
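
For the dictionary RLE encoding above, all_encoded_values simply replays each run, emitting the dictionary index once per logical row. A standalone sketch of that expansion (simplified types, not the crate's DictionaryRLE):

use std::iter;

// Expand (dictionary index, run length) pairs into one encoded id per row.
fn expand_run_lengths(run_lengths: &[(u32, u64)], total: usize) -> Vec<i64> {
    let mut out = Vec::with_capacity(total);
    for (idx, rl) in run_lengths {
        out.extend(iter::repeat(*idx as i64).take(*rl as usize));
    }
    out
}

fn main() {
    // Dictionary ids 0 and 2 with runs of 3 and 2 rows respectively.
    assert_eq!(expand_run_lengths(&[(0, 3), (2, 2)], 5), vec![0, 0, 0, 2, 2]);
}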

@@ -228,7 +228,7 @@ impl Segment {
group_columns: &[String],
aggregates: &[(String, AggregateType)],
window: i64,
) -> BTreeMap<Vec<String>, Vec<(String, Option<column::Aggregate>)>> {
) -> BTreeMap<Vec<i64>, Vec<(&String, &AggregateType, Option<column::Aggregate>)>> {
// Build a hash table - essentially, scan columns for matching row ids,
// emit the encoded value for each column, and track those value
// combinations in a hashmap with running aggregates.
@@ -242,6 +242,10 @@ impl Segment {
assert_ne!(group_columns[group_columns.len() - 1], "time");
}
// TODO(edd): Perf - if there is no predicate and we want the entire segment
// then it will be a lot faster not to build filtered_row_ids and just
// get all encoded values for each grouping column...
// filter on predicates and time
let filtered_row_ids: croaring::Bitmap;
if let Some(row_ids) = self.filter_by_predicates_eq(time_range, predicates) {
@@ -263,7 +267,12 @@ impl Segment {
let mut group_column_encoded_values = Vec::with_capacity(group_columns.len());
for group_column in group_columns {
if let Some(column) = self.column(&group_column) {
let encoded_values = column.encoded_values(&filtered_row_ids_vec);
let encoded_values = if filtered_row_ids_vec.len() == self.meta.rows {
column.all_encoded_values()
} else {
column.encoded_values(&filtered_row_ids_vec)
};
assert_eq!(
filtered_row_ids.cardinality() as usize,
encoded_values.len()
@@ -325,10 +334,10 @@ impl Segment {
.collect::<Vec<_>>();
// HashMap is about 20% faster than BTreeMap in this case
let mut hash_table: HashMap<
let mut hash_table: BTreeMap<
Vec<i64>,
Vec<(&String, &AggregateType, Option<column::Aggregate>)>,
> = HashMap::new();
> = BTreeMap::new();
let mut aggregate_row: Vec<(&str, Option<column::Scalar>)> =
std::iter::repeat_with(|| ("", None))
@@ -406,8 +415,10 @@ impl Segment {
}
processed_rows += 1;
}
// println!("groups: {:?}", hash_table.len());
log::debug!("({:?} rows processed) {:?}", processed_rows, hash_table);
BTreeMap::new()
// hash_table
}
pub fn aggregate_by_group_using_sort(
@@ -451,7 +462,11 @@ impl Segment {
let mut group_column_encoded_values = Vec::with_capacity(group_columns.len());
for group_column in group_columns {
if let Some(column) = self.column(&group_column) {
let encoded_values = column.encoded_values(&filtered_row_ids_vec);
let encoded_values = if filtered_row_ids_vec.len() == self.meta.rows {
column.all_encoded_values()
} else {
column.encoded_values(&filtered_row_ids_vec)
};
assert_eq!(
filtered_row_ids.cardinality() as usize,
encoded_values.len()
@@ -557,6 +572,10 @@ impl Segment {
assert_ne!(group_columns[group_columns.len() - 1], "time");
}
// TODO(edd): Perf - if there is no predicate and we want the entire segment
// then it will be a lot faster not to build filtered_row_ids and just
// get all encoded values for each grouping column...
// filter on predicates and time
let filtered_row_ids: croaring::Bitmap;
if let Some(row_ids) = self.filter_by_predicates_eq(time_range, predicates) {
@@ -577,7 +596,11 @@ impl Segment {
let mut group_column_encoded_values = Vec::with_capacity(group_columns.len());
for group_column in group_columns {
if let Some(column) = self.column(&group_column) {
let encoded_values = column.encoded_values(&filtered_row_ids_vec);
let encoded_values = if filtered_row_ids_vec.len() == self.meta.rows {
column.all_encoded_values()
} else {
column.encoded_values(&filtered_row_ids_vec)
};
assert_eq!(
filtered_row_ids.cardinality() as usize,
encoded_values.len()
@@ -709,6 +732,7 @@ impl Segment {
aggregates: group_key_aggregates,
});
// println!("groups: {:?}", results.len());
log::debug!("({:?} rows processed) {:?}", processed_rows, results);
// results
vec![]
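
Each grouping call site above now selects between the two paths in the same way: when the filter matched every row in the segment, the row-id driven lookup is skipped in favour of materialising all encoded values. A sketch of that selection logic (the EncodedAccess trait and names are illustrative, not the crate's API):

// Assumed trait standing in for the column types above.
trait EncodedAccess {
    fn encoded_values(&self, row_ids: &[usize]) -> Vec<i64>;
    fn all_encoded_values(&self) -> Vec<i64>;
}

fn encoded_for_grouping<C: EncodedAccess>(
    column: &C,
    filtered_row_ids: &[usize],
    segment_rows: usize,
) -> Vec<i64> {
    if filtered_row_ids.len() == segment_rows {
        // The predicate matched the whole segment: take the bulk path.
        column.all_encoded_values()
    } else {
        column.encoded_values(filtered_row_ids)
    }
}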