influxdb/router/benches/schema_validator.rs

use std::{iter, sync::Arc};

use criterion::{
    criterion_group, criterion_main, measurement::WallTime, BatchSize, BenchmarkGroup, Criterion,
    Throughput,
};
use data_types::NamespaceName;
use hashbrown::HashMap;
use iox_catalog::{interface::Catalog, mem::MemCatalog};
use mutable_batch::MutableBatch;
use once_cell::sync::Lazy;
use router::{
    dml_handlers::{DmlHandler, SchemaValidator},
    namespace_cache::{MemoryNamespaceCache, NamespaceCache, ReadThroughCache, ShardedCache},
};
use schema::Projection;
use tokio::runtime::Runtime;

static NAMESPACE: Lazy<NamespaceName<'static>> = Lazy::new(|| "bananas".try_into().unwrap());

/// Builds a fresh current-thread Tokio runtime.
///
/// No `enable_*` options are set on the builder, so the runtime is created with
/// the builder's defaults only.
///
/// # Panics
///
/// Panics if the builder fails to construct the runtime.
fn runtime() -> Runtime {
    let mut builder = tokio::runtime::Builder::new_current_thread();
    builder.build().unwrap()
}

/// Registers every schema validator benchmark case in the `schema_validator` group.
///
/// The cases cover a single one-column table, one table with a growing number of
/// columns, and a growing number of one-column tables.
fn schema_validator_benchmarks(c: &mut Criterion) {
    // `(tables, columns_per_table)` pairs, listed in the order they are registered.
    const CASES: [(usize, usize); 5] = [
        (1, 1),
        // One table, increasingly wide.
        (1, 100),
        (1, 10_000),
        // Increasingly many tables, one column each.
        (100, 1),
        (10_000, 1),
    ];

    let mut group = c.benchmark_group("schema_validator");

    for (tables, columns_per_table) in CASES {
        bench(&mut group, tables, columns_per_table);
    }

    group.finish();
}

/// Registers one benchmark in `group`, named `"{tables}x{columns_per_table}"`, that
/// times `SchemaValidator::write` for a write generated by
/// `generate_lp(tables, columns_per_table)`.
///
/// Each call builds its own in-memory catalog, a read-through namespace cache
/// backed by 10 sharded in-memory caches, and a validator on top of them.
///
/// Before measuring, 65,000 single-line writes are pushed through the validator.
/// Each line is prefixed with a different number, so each one names a different
/// table. The outcome of these writes is deliberately ignored (presumably they
/// exist to pre-populate the namespace schema with many tables — confirm).
///
/// Reported throughput is the total number of schema columns across all tables
/// in the benchmarked write.
///
/// # Panics
///
/// Panics if a namespace schema lookup does not yield a schema, if the generated
/// line protocol cannot be parsed, or if a batch schema cannot be produced.
fn bench(group: &mut BenchmarkGroup<WallTime>, tables: usize, columns_per_table: usize) {
    let metrics = Arc::new(metric::Registry::default());

    let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
    let ns_cache = Arc::new(ReadThroughCache::new(
        Arc::new(ShardedCache::new(
            iter::repeat_with(|| Arc::new(MemoryNamespaceCache::default())).take(10),
        )),
        Arc::clone(&catalog),
    ));
    let validator = SchemaValidator::new(catalog, Arc::clone(&ns_cache), &metrics);

    // A single runtime drives all un-timed work (warm-up writes and per-batch
    // setup) instead of constructing a new runtime for every `block_on` call.
    let rt = runtime();

    // The line protocol suffix is identical for every warm-up write; render it once.
    let single_table_lp = generate_lp(1, 1);

    for i in 0..65_000 {
        // Prefixing the line with a unique number yields a unique table name.
        let warmup_write = lp_to_writes(&format!("{}{single_table_lp}", i + 10_000_000));
        let namespace_schema = rt.block_on(ns_cache.get_schema(&NAMESPACE)).unwrap();
        // Best-effort: the result of the warm-up write is intentionally discarded.
        let _ = rt.block_on(validator.write(&NAMESPACE, namespace_schema, warmup_write, None));
    }

    let batches = lp_to_writes(&generate_lp(tables, columns_per_table));
    let column_count = batches
        .values()
        .fold(0, |total, batch| total + batch.schema(Projection::All).unwrap().len());

    group.throughput(Throughput::Elements(column_count as _));
    group.bench_function(format!("{tables}x{columns_per_table}"), |b| {
        b.to_async(runtime()).iter_batched(
            // Setup: a fresh copy of the write plus the current namespace schema.
            //
            // NOTE(review): if criterion invokes this setup closure from inside the
            // async executor's own `block_on`, this nested `block_on` would panic
            // under tokio — confirm against the criterion version in use.
            || {
                (
                    batches.clone(),
                    rt.block_on(ns_cache.get_schema(&NAMESPACE)).unwrap(),
                )
            },
            // Routine: the validator write produced from the setup values.
            |(write, namespace_schema)| validator.write(&NAMESPACE, namespace_schema, write, None),
            BatchSize::SmallInput,
        );
    });
}

/// Generates line protocol text with one line per table.
///
/// Line `t` (0-based) has the form `table{t},tag=A val0=42i,val1=42i,...` with
/// `columns_per_table` integer fields named `val0`, `val1`, and so on. Lines are
/// joined with `\n` and there is no trailing newline.
///
/// Returns an empty string when `tables` is 0. When `columns_per_table` is 0,
/// each line ends right after the space that follows the tag set.
fn generate_lp(tables: usize, columns_per_table: usize) -> String {
    // The field set does not depend on the table, so render it once rather
    // than once per table.
    let fields = (0..columns_per_table)
        .map(|col| format!("val{col}=42i"))
        .collect::<Vec<_>>()
        .join(",");

    (0..tables)
        .map(|table| format!("table{table},tag=A {fields}"))
        .collect::<Vec<_>>()
        .join("\n")
}

/// Parses the line protocol text `lp` into a table-keyed map of [`MutableBatch`]es.
///
/// Only the first element of the tuple returned by
/// `mutable_batch_lp::lines_to_batches_stats` is kept; the second is discarded.
///
/// NOTE(review): the literal `42` is presumably the default timestamp applied to
/// lines that carry none — confirm against `mutable_batch_lp`.
///
/// # Panics
///
/// Panics if `lp` cannot be converted into batches.
fn lp_to_writes(lp: &str) -> HashMap<String, MutableBatch> {
    mutable_batch_lp::lines_to_batches_stats(lp, 42)
        .expect("failed to build test writes from LP")
        .0
}

// Define the `benches` group containing `schema_validator_benchmarks`, then
// generate the benchmark binary's entry point that runs that group.
criterion_group!(benches, schema_validator_benchmarks);
criterion_main!(benches);
style: format imports Re-order and re-format the imports so that they follow a consistent pattern. This helps eliminate conflicts due to imports. 2022-10-26 14:24:49 +00:00			`use std::{iter, sync::Arc};`

feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`use criterion::{`
			`criterion_group, criterion_main, measurement::WallTime, BatchSize, BenchmarkGroup, Criterion,`
			`Throughput,`
			`};`
fix: Pass the NamespaceSchema through the dml write traits 2023-05-08 18:25:19 +00:00			`use data_types::NamespaceName;`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`use hashbrown::HashMap;`
feat(router): Use read-through NamespaceCache with DML handlers This removes the look-aside cache from the retention_validation and schema_validation DML handlers, instead setting up the new NamespaceCache decorator and using that to handle cache misses. 2023-04-11 13:54:06 +00:00			`use iox_catalog::{interface::Catalog, mem::MemCatalog};`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`use mutable_batch::MutableBatch;`
refactor: Replace all uses of lazy_static with once_cell Went through and remove all lazy_static uses with once_cell (while waiting for the project to compile). There are still dependencies using lazy_static so it is still in the crate graph but at least there isn't an explicit dependency on it (and it is easier to update to `std::lazy::Lazy` once that is stable). 2022-06-29 11:27:43 +00:00			`use once_cell::sync::Lazy;`
fix: Rename router2 to router 2022-05-06 18:51:52 +00:00			`use router::{`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`dml_handlers::{DmlHandler, SchemaValidator},`
fix: Pass the NamespaceSchema through the dml write traits 2023-05-08 18:25:19 +00:00			`namespace_cache::{MemoryNamespaceCache, NamespaceCache, ReadThroughCache, ShardedCache},`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`};`
refactor: Rename `schema::selection::Selection` to `schema::projection::Projection` (#6037) * chore: Rename `schema::selection::Selection` to `schema::projection::Projection` * fix: docs Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> 2022-11-02 18:15:04 +00:00			`use schema::Projection;`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`use tokio::runtime::Runtime;`

chore: Rename DatabaseName to NamespaceName (#6100) * chore: Rename DatabaseName to NamespaceName * fix: fmt * chore: Updates some more references * chore: more cleanup * fix: adjust test Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> 2022-11-10 14:13:59 +00:00			`static NAMESPACE: Lazy<NamespaceName<'static>> = Lazy::new(\|\| "bananas".try_into().unwrap());`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00
			`fn runtime() -> Runtime {`
			`tokio::runtime::Builder::new_current_thread()`
			`.build()`
			`.unwrap()`
			`}`

test(bench): router partitioner Adds a benchmark that exercises the router's partitioning DmlHandler implementation against a set of three files (very small, small, medium) with 4 different partitioning schemes: * Single tag, which occurs in all rows * Single tag, which does not occur in any row * Default strftime formatter (YYYY-MM-DD) * Long and complicated strftime formatter This covers the entire partitioning overhead - building the formatters, evaluating each row, grouping the values into per-partition buckets, and returning to the caller, where it normally would be passed to the next handler in the pipeline. Note that only one template part is evaluated in each case - this measures the overhead of each type of formatter. In reality, we'd expect partitioning with custom schemes to utilise more than one part, increasing the cost of partitioning proportionally. This is a lower-bound measurement! 2023-06-02 14:04:09 +00:00			`fn schema_validator_benchmarks(c: &mut Criterion) {`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`let mut group = c.benchmark_group("schema_validator");`

			`bench(&mut group, 1, 1);`

			`bench(&mut group, 1, 100);`
			`bench(&mut group, 1, 10000);`

			`bench(&mut group, 100, 1);`
			`bench(&mut group, 10000, 1);`

			`group.finish();`
			`}`

			`fn bench(group: &mut BenchmarkGroup<WallTime>, tables: usize, columns_per_table: usize) {`
			`let metrics = Arc::new(metric::Registry::default());`

feat(router): Use read-through NamespaceCache with DML handlers This removes the look-aside cache from the retention_validation and schema_validation DML handlers, instead setting up the new NamespaceCache decorator and using that to handle cache misses. 2023-04-11 13:54:06 +00:00			`let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));`
			`let ns_cache = Arc::new(ReadThroughCache::new(`
			`Arc::new(ShardedCache::new(`
			`iter::repeat_with(\|\| Arc::new(MemoryNamespaceCache::default())).take(10),`
			`)),`
			`Arc::clone(&catalog),`
refactor: infallible JumpHash initialisation This doesn't really need to be fallible but forces propagation of a ton of error handling - no shards is always a sign of something being very wrong, and can be caught in the caller if it's for some reason an acceptable state / can be recovered from. 2022-08-24 11:16:04 +00:00			`));`
fix: Pass the NamespaceSchema through the dml write traits 2023-05-08 18:25:19 +00:00			`let validator = SchemaValidator::new(catalog, Arc::clone(&ns_cache), &metrics);`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00
			`for i in 0..65_000 {`
			`let write = lp_to_writes(format!("{}{}", i + 10_000_000, generate_lp(1, 1)).as_str());`
fix: Pass the NamespaceSchema through the dml write traits 2023-05-08 18:25:19 +00:00			`let namespace_schema = runtime().block_on(ns_cache.get_schema(&NAMESPACE)).unwrap();`
			`let _ = runtime().block_on(validator.write(&NAMESPACE, namespace_schema, write, None));`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`}`

			`let write = lp_to_writes(&generate_lp(tables, columns_per_table));`
			`let column_count = write`
			`.values()`
refactor: Rename `schema::selection::Selection` to `schema::projection::Projection` (#6037) * chore: Rename `schema::selection::Selection` to `schema::projection::Projection` * fix: docs Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> 2022-11-02 18:15:04 +00:00			`.fold(0, \|acc, b\| acc + b.schema(Projection::All).unwrap().len());`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00
			`group.throughput(Throughput::Elements(column_count as _));`
			`group.bench_function(format!("{tables}x{columns_per_table}"), \|b\| {`
			`b.to_async(runtime()).iter_batched(`
fix: Pass the NamespaceSchema through the dml write traits 2023-05-08 18:25:19 +00:00			`\|\| {`
			`(`
			`write.clone(),`
			`runtime().block_on(ns_cache.get_schema(&NAMESPACE)).unwrap(),`
			`)`
			`},`
			`\|(write, namespace_schema)\| validator.write(&NAMESPACE, namespace_schema, write, None),`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`BatchSize::SmallInput,`
			`);`
			`});`
			`}`

			`fn generate_lp(tables: usize, columns_per_table: usize) -> String {`
			`(0..tables)`
			`.map(\|i\| {`
			`let cols = (0..columns_per_table)`
fix: Move variables within format strings. Thanks clippy! Changes made automatically using `cargo clippy --fix`. 2023-01-30 22:20:05 +00:00			`.map(\|i\| format!("val{i}=42i"))`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`.collect::<Vec<_>>()`
			`.join(",");`

			`format!("table{i},tag=A {cols}")`
			`})`
			`.collect::<Vec<_>>()`
			`.join("\n")`
			`}`

			// Parse `lp` into a table-keyed MutableBatch map.
			`fn lp_to_writes(lp: &str) -> HashMap<String, MutableBatch> {`
			`let (writes, _) = mutable_batch_lp::lines_to_batches_stats(lp, 42)`
			`.expect("failed to build test writes from LP");`
			`writes`
			`}`

test(bench): router partitioner Adds a benchmark that exercises the router's partitioning DmlHandler implementation against a set of three files (very small, small, medium) with 4 different partitioning schemes: * Single tag, which occurs in all rows * Single tag, which does not occur in any row * Default strftime formatter (YYYY-MM-DD) * Long and complicated strftime formatter This covers the entire partitioning overhead - building the formatters, evaluating each row, grouping the values into per-partition buckets, and returning to the caller, where it normally would be passed to the next handler in the pipeline. Note that only one template part is evaluated in each case - this measures the overhead of each type of formatter. In reality, we'd expect partitioning with custom schemes to utilise more than one part, increasing the cost of partitioning proportionally. This is a lower-bound measurement! 2023-06-02 14:04:09 +00:00			`criterion_group!(benches, schema_validator_benchmarks);`
feat: schema validation benchmarks Useful for confirming the scalability of the schema check algorithm. 2022-03-04 00:28:02 +00:00			`criterion_main!(benches);`