influxdb/read_buffer/benches/dictionary.rs

156 lines
5.1 KiB
Rust

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use rand::distributions::Alphanumeric;
use rand::prelude::*;
use rand::Rng;
use read_buffer::benchmarks::{dictionary, Operator, RowIDs};
/// Total row counts of the generated columns; one set of benchmarks is run
/// for each size.
const ROWS: [usize; 3] = [100_000, 1_000_000, 10_000_000];
/// Regions of the column from which the value to search for is picked.
const LOCATIONS: [Location; 3] = [Location::Start, Location::Middle, Location::End];
/// Number of rows that share each distinct value in a generated column, and
/// therefore the number of rows each benchmarked lookup is expected to yield.
const ROWS_MATCHING_VALUE: [usize; 3] = [10, 100, 1000];
/// Region of the generated column from which the benchmark draws the value
/// it will search for.
///
/// The `Debug` representation is embedded in the benchmark id, so variant
/// names are part of the reported benchmark names.
#[derive(Debug)]
enum Location {
    /// A row within the first twentieth of the column.
    Start,
    /// A row within the middle fifth of the column.
    Middle,
    /// A row within the final ninth of the column.
    End,
}
/// Selects which dictionary encoding a benchmark run should construct.
///
/// Derives `Debug` for parity with `Location`, and `Clone`/`Copy` because the
/// type is a field-less selector that is cheap to pass by value.
#[derive(Debug, Clone, Copy)]
enum EncType {
    /// Build a `dictionary::RLE` encoding.
    RLEDictionary,
    /// Build a `dictionary::Plain` encoding.
    Dictionary,
}
fn select(c: &mut Criterion) {
let mut rng = rand::thread_rng();
benchmark_select(
c,
"encoding_rle_select",
EncType::RLEDictionary,
&ROWS,
&LOCATIONS,
&ROWS_MATCHING_VALUE,
&mut rng,
);
benchmark_select(
c,
"encoding_dict_select",
EncType::Dictionary,
&ROWS,
&LOCATIONS,
&ROWS_MATCHING_VALUE,
&mut rng,
);
}
/// Registers one Criterion benchmark for every combination of row count,
/// value location and rows-per-value, under the group `benchmark_group_name`.
///
/// For each combination a fresh column is generated, a value is picked at
/// random from the requested region of that column, and an encoding of the
/// requested `enc_type` is populated with the column. The measured closure
/// filters the encoding for rows equal to the picked value, materialises the
/// values for the resulting row ids and asserts that exactly `rows_select`
/// values come back.
///
/// * `row_size` - total row counts to benchmark.
/// * `locations` - regions of the column to draw the searched value from.
/// * `rows_selected` - rows per distinct value; iterated in reverse order.
/// * `rng` - source of randomness for both column generation and value choice.
fn benchmark_select(
    c: &mut Criterion,
    benchmark_group_name: &str,
    enc_type: EncType,
    row_size: &[usize],
    locations: &[Location],
    rows_selected: &[usize],
    rng: &mut ThreadRng,
) {
    let mut group = c.benchmark_group(benchmark_group_name);

    for &num_rows in row_size {
        for location in locations {
            for &rows_select in rows_selected.iter().rev() {
                let col_data = generate_column(num_rows, rows_select, rng);
                let cardinality = num_rows / rows_select;

                // The distinct values of the column seed the encoding's dictionary.
                let col_dict: std::collections::BTreeSet<String> =
                    col_data.iter().cloned().collect();

                // Choose the row whose value will be searched for.
                let total = col_data.len();
                let probe_idx = match location {
                    // something in first 5%
                    Location::Start => rng.gen_range(0..total / 20),
                    // something in middle fifth
                    Location::Middle => {
                        let fifth = total / 5;
                        rng.gen_range(2 * fifth..3 * fifth)
                    }
                    // something in the last ~10%
                    Location::End => rng.gen_range(total - (total / 9)..total),
                };
                let value = &col_data[probe_idx];

                group.throughput(Throughput::Elements(num_rows as u64));

                // Could be faster but it's just the bench setup...
                let encoding: dictionary::Encoding = match enc_type {
                    EncType::RLEDictionary => {
                        let mut rle = dictionary::RLE::with_dictionary(col_dict);
                        for row in &col_data {
                            rle.push(row.clone());
                        }
                        dictionary::Encoding::RLE(rle)
                    }
                    EncType::Dictionary => {
                        let mut plain = dictionary::Plain::with_dictionary(col_dict);
                        for row in &col_data {
                            plain.push(row.clone());
                        }
                        dictionary::Encoding::Plain(plain)
                    }
                };

                let input = (RowIDs::new_bitmap(), value);
                let bench_id = BenchmarkId::from_parameter(format!(
                    "enc_{:?}/rows_{:?}/loc_{:?}/card_{:?}",
                    encoding.debug_name(),
                    num_rows,
                    location,
                    cardinality
                ));

                // The closure ignores Criterion's input argument and uses the
                // captured `encoding` and `value` directly.
                group.bench_with_input(bench_id, &input, |bencher, _| {
                    bencher.iter(|| {
                        let matching = encoding.row_ids_filter(
                            value.as_str(),
                            &Operator::Equal,
                            RowIDs::new_bitmap(),
                        );
                        let matching_ids = matching.to_vec();
                        let values = encoding.values(matching_ids.as_slice(), vec![]);
                        assert_eq!(values.len(), rows_select);
                    });
                });
            }
        }
    }

    group.finish();
}
/// Builds a column of `rows` strings made of consecutive runs, each run being
/// `rows_per_value` copies of one randomly generated value.
///
/// Every value has the form `value-XXXXXXXX`, where the suffix is eight random
/// alphanumeric characters drawn from `rng`.
///
/// # Panics
///
/// Panics if `rows_per_value` is zero (division by zero) or if `rows` is not
/// an exact multiple of `rows_per_value` (the final length check fails).
fn generate_column(rows: usize, rows_per_value: usize, rng: &mut ThreadRng) -> Vec<String> {
    let distinct_values = rows / rows_per_value;
    let mut col = Vec::with_capacity(rows);

    for _ in 0..distinct_values {
        let suffix: String = rng
            .sample_iter(&Alphanumeric)
            .map(char::from)
            .take(8)
            .collect();
        let value = format!("value-{}", suffix);

        // Append one run of `rows_per_value` copies of this value.
        let run_end = col.len() + rows_per_value;
        col.resize(run_end, value);
    }

    assert_eq!(col.len(), rows);
    col
}
// Register `select` as the only benchmark target in the `benches` group and
// generate the benchmark binary's entry point from that group.
criterion_group!(benches, select,);
criterion_main!(benches);