diff --git a/Cargo.lock b/Cargo.lock index e5b29895a1..0ffd1160a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -271,7 +271,7 @@ dependencies = [ "md5", "oauth2", "paste", - "quick-error", + "quick-error 1.2.3", "reqwest", "serde", "serde-xml-rs", @@ -300,7 +300,7 @@ dependencies = [ "md5", "mime", "percent-encoding", - "quick-error", + "quick-error 1.2.3", "ring", "serde", "serde-xml-rs", @@ -398,13 +398,34 @@ dependencies = [ "constant_time_eq", ] +[[package]] +name = "block-buffer" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" +dependencies = [ + "block-padding", + "byte-tools", + "byteorder", + "generic-array 0.12.4", +] + [[package]] name = "block-buffer" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" dependencies = [ - "generic-array", + "generic-array 0.14.4", +] + +[[package]] +name = "block-padding" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" +dependencies = [ + "byte-tools", ] [[package]] @@ -446,6 +467,12 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" +[[package]] +name = "byte-tools" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" + [[package]] name = "bytemuck" version = "1.7.2" @@ -525,6 +552,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "chrono-english" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be5180df5f7c41fc2416bc038bc8d78d44db8136c415b94ccbc95f523dc38e9" +dependencies = [ + "chrono", + "scanlex", + "time 
0.1.43", +] + [[package]] name = "clang-sys" version = "1.2.0" @@ -784,7 +822,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714" dependencies = [ - "generic-array", + "generic-array 0.14.4", "subtle", ] @@ -917,13 +955,22 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = "digest" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" +dependencies = [ + "generic-array 0.12.4", +] + [[package]] name = "digest" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" dependencies = [ - "generic-array", + "generic-array 0.14.4", ] [[package]] @@ -1095,6 +1142,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "fake-simd" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" + [[package]] name = "fd-lock" version = "2.0.0" @@ -1320,6 +1373,15 @@ dependencies = [ "tonic-build 0.5.2", ] +[[package]] +name = "generic-array" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" +dependencies = [ + "typenum", +] + [[package]] name = "generic-array" version = "0.14.4" @@ -1433,6 +1495,20 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" +[[package]] +name = "handlebars" +version = "3.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4498fc115fa7d34de968184e473529abb40eeb6be8bc5f7faba3d08c316cb3e3" +dependencies = [ + "log", + "pest", + "pest_derive", + "quick-error 2.0.1", + "serde", + "serde_json", +] + [[package]] name = "hashbrown" version = "0.11.2" @@ -1488,7 +1564,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2a2320eb7ec0ebe8da8f744d7812d9fc4cb4d09344ac01898dbcb6a20ae69b" dependencies = [ "crypto-mac", - "digest", + "digest 0.9.0", ] [[package]] @@ -1816,6 +1892,36 @@ dependencies = [ "tokio", ] +[[package]] +name = "iox_data_generator" +version = "0.1.0" +dependencies = [ + "chrono", + "chrono-english", + "clap", + "criterion", + "data_types", + "futures", + "generated_types", + "handlebars", + "influxdb2_client", + "influxdb_iox_client", + "itertools 0.9.0", + "packers", + "rand 0.8.4", + "rand_core 0.6.3", + "rand_seeder", + "serde", + "snafu", + "test_helpers", + "tokio", + "toml", + "tracing", + "tracing-futures", + "tracing-subscriber", + "uuid", +] + [[package]] name = "iox_object_store" version = "0.1.0" @@ -2014,6 +2120,12 @@ dependencies = [ "libc", ] +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "matchers" version = "0.0.1" @@ -2035,9 +2147,9 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" dependencies = [ - "block-buffer", - "digest", - "opaque-debug", + "block-buffer 0.9.0", + "digest 0.9.0", + "opaque-debug 0.3.0", ] [[package]] @@ -2484,6 +2596,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "opaque-debug" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -2788,6 +2906,49 @@ dependencies = [ "test_helpers", ] +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pest_meta" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" +dependencies = [ + "maplit", + "pest", + "sha-1", +] + [[package]] name = "petgraph" version = "0.5.1" @@ -3188,6 +3349,12 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quick-xml" version = "0.20.0" @@ -3313,6 +3480,15 @@ dependencies = [ "rand_core 0.6.3", ] +[[package]] +name = "rand_seeder" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612dd698949d531335b4c29d1c64fb11942798decfc08abc218578942e66d7d0" +dependencies = [ + "rand_core 0.6.3", +] + [[package]] 
name = "rayon" version = "1.5.1" @@ -3605,7 +3781,7 @@ dependencies = [ "base64 0.13.0", "bytes", "chrono", - "digest", + "digest 0.9.0", "futures", "hex", "hmac", @@ -3728,6 +3904,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scanlex" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "088c5d71572124929ea7549a8ce98e1a6fd33d0a38367b09027b382e67c033db" + [[package]] name = "schannel" version = "0.1.19" @@ -3950,6 +4132,18 @@ dependencies = [ "tokio", ] +[[package]] +name = "sha-1" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" +dependencies = [ + "block-buffer 0.7.3", + "digest 0.8.1", + "fake-simd", + "opaque-debug 0.2.3", +] + [[package]] name = "sha1" version = "0.6.0" @@ -3962,11 +4156,11 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b362ae5752fd2137731f9fa25fd4d9058af34666ca1966fb969119cc35719f12" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", - "opaque-debug", + "digest 0.9.0", + "opaque-debug 0.3.0", ] [[package]] @@ -4892,6 +5086,12 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + [[package]] name = "unicode-bidi" version = "0.3.6" diff --git a/Cargo.toml b/Cargo.toml index 47688a809e..eaa94efcf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "influxdb_line_protocol", "influxdb_tsm", "internal_types", + "iox_data_generator", "iox_object_store", "logfmt", "lifecycle", diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml new file 
mode 100644 index 0000000000..ecf863a780 --- /dev/null +++ b/iox_data_generator/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "iox_data_generator" +version = "0.1.0" +authors = ["Paul Dix "] +edition = "2018" +default-run = "iox_data_generator" + +[dependencies] +chrono = "0.4.13" +chrono-english = "0.1.4" +clap = "2.33.1" +futures = "0.3.5" +handlebars = "3.3.0" +data_types = { path = "../data_types" } +generated_types = { path = "../generated_types" } +influxdb2_client = { path = "../influxdb2_client" } +influxdb_iox_client = { path = "../influxdb_iox_client" } +packers = { path = "../packers" } +itertools = "0.9.0" +rand = { version = "0.8.3", features = ["small_rng"] } +rand_core = "0.6.2" +rand_seeder = "0.2.1" +serde = { version = "1.0", features = ["derive"] } +snafu = "0.6.8" +tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] } +toml = "0.5.6" +tracing = "0.1" +tracing-futures = "0.2.4" +tracing-subscriber = "0.2.11" +uuid = { version = "0.8.1", default_features = false } + +[dev-dependencies] +criterion = "0.3.3" +test_helpers = { path = "../test_helpers" } + +[[bench]] +name = "point_generation" +harness = false \ No newline at end of file diff --git a/iox_data_generator/README.md b/iox_data_generator/README.md new file mode 100644 index 0000000000..a6b1f717fb --- /dev/null +++ b/iox_data_generator/README.md @@ -0,0 +1,106 @@ +# `iox_data_generator` + +The `iox_data_generator` tool creates random data points according to a specification and loads them +into an `iox` instance to simulate real data. + +To build and run, [first install Rust](https://www.rust-lang.org/tools/install). 
Then from root of the `influxdb_iox` repo run: + +``` +cargo build --release +``` + +And the built binary has command line help: + +``` +./target/release/iox_data_generator --help +``` + +For examples of specifications see the [schemas folder](schemas) + +## Use with two IOx servers and Kafka + +The data generator tool can be used to simulate data being written to IOx in various shapes. This +is how to set up a local experiment for profiling or debugging purposes using a database in two IOx +instances: one writing to Kafka and one reading from Kafka. + +If you're profiling IOx, be sure you've compiled and are running a release build using either: + +``` +cargo build --release +./target/release/influxdb_iox run --server-id 1 +``` + +or: + +``` +cargo run --release -- run --server-id 1 +``` + +Server ID is the only required attribute for running IOx; see `influxdb_iox run --help` for all the +other configuration options for the server you may want to set for your experiment. Note that the +default HTTP API address is `127.0.0.1:8080` unless you set something different with `--api-bind` +and the default gRPC address is `127.0.0.1:8082` unless you set something different using +`--grpc-bind`. + +For the Kafka setup, you'll need to start two IOx servers, so you'll need to set the bind addresses +for at least one of them. Here's an example of the two commands to run: + +``` +cargo run --release -- run --server-id 1 +cargo run --release -- run --server-id 2 --api-bind 127.0.0.1:8084 --grpc-bind 127.0.0.1:8086 +``` + +You'll also need to run a Kafka instance. There's a Docker compose script in the influxdb_iox +repo you can run with: + +``` +docker-compose -f docker/ci-kafka-docker-compose.yml up kafka +``` + +The Kafka instance will be accessible from `127.0.0.1:9093` if you run it with this script. + +Once you have the two IOx servers and one Kafka instance running, create a database with a name in +the format `[orgname]_[bucketname]`. 
For example, create a database in IOx named `mlb_pirates`, and +the org you'll use in the data generator will be `mlb` and the bucket will be `pirates`. The +`DatabaseRules` defined in `src/bin/create_database.rs` will set up a database in the "writer" IOx +instance to write to Kafka and the database in the "reader" IOx instance to read from Kafka if +you run it with: + +``` +cargo run -p iox_data_generator --bin create_database -- --writer 127.0.0.1:8082 --reader 127.0.0.1:8086 mlb_pirates +``` + +This script adds 3 rows to a `writer_test` table because [this issue with the Kafka Consumer +needing data before it can find partitions](https://github.com/influxdata/influxdb_iox/issues/2189). + +Once the database is created, decide what kind of data you would like to send it. You can use an +existing data generation schema in the `schemas` directory or create a new one, perhaps starting +from an existing schema as a guide. In this example, we're going to use +`iox_data_generator/schemas/cap-write.toml`. + +Next, run the data generation tool as follows: + +``` +cargo run -p iox_data_generator -- --spec iox_data_generator/schemas/cap-write.toml --continue --host 127.0.0.1:8080 --token arbitrary --org mlb --bucket pirates +``` + +- `--spec iox_data_generator/schemas/cap-write.toml` sets the schema you want to use to generate the data +- `--continue` means the data generation tool should generate data every `sampling_interval` (which + is set in the schema) until we stop it +- `--host 127.0.0.1:8080` means to write to the writer IOx server running at the default HTTP API address + of `127.0.0.1:8080` (note this is NOT the gRPC address used by the `create_database` command) +- `--token arbitrary` - the data generator requires a token value but IOx doesn't use it, so this + can be any value. 
+- `--org mlb` is the part of the database name you created before the `_` +- `--bucket pirates` is the part of the database name you created after the `_` + +You should be able to use `influxdb_iox sql -h http://127.0.0.1:8086` to connect to the gRPC of the reader +then `use database mlb_pirates;` and query the tables to see that the data is being inserted. That +is, + +``` +# in your influxdb_iox checkout +cargo run -- sql -h http://127.0.0.1:8086 +``` + +Connecting to the writer instance won't show any data. diff --git a/iox_data_generator/benches/point_generation.rs b/iox_data_generator/benches/point_generation.rs new file mode 100644 index 0000000000..44115bac82 --- /dev/null +++ b/iox_data_generator/benches/point_generation.rs @@ -0,0 +1,66 @@ +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use iox_data_generator::{ + specification::{AgentSpec, DataSpec, FieldSpec, FieldValueSpec, MeasurementSpec}, + write::PointsWriterBuilder, +}; + +pub fn single_agent(c: &mut Criterion) { + let spec = DataSpec { + base_seed: Some("faster faster faster".into()), + name: "benchmark".into(), + agents: vec![AgentSpec { + name: "agent-1".into(), + count: None, + sampling_interval: Some(1), + name_tag_key: None, + tags: vec![], + measurements: vec![MeasurementSpec { + name: "measurement-1".into(), + count: None, + tags: vec![], + fields: vec![FieldSpec { + name: "field-1".into(), + field_value_spec: FieldValueSpec::Bool(true), + count: None, + }], + }], + }], + }; + + let mut points_writer = PointsWriterBuilder::new_no_op(true); + + let start_datetime = Some(0); + let one_hour_s = 60 * 60; + let ns_per_second = 1_000_000_000; + let end_datetime = Some(one_hour_s * ns_per_second); + + let expected_points = 3601; + + let mut group = c.benchmark_group("single_agent"); + group.throughput(Throughput::Elements(expected_points)); + + group.bench_function("single agent with basic configuration", |b| { + b.iter(|| { + let r = block_on({ + 
iox_data_generator::generate::( + &spec, + &mut points_writer, + start_datetime, + end_datetime, + 0, + false, + ) + }); + let n_points = r.expect("Could not generate data"); + assert_eq!(n_points, expected_points as usize); + }) + }); +} + +#[tokio::main] +async fn block_on(f: F) -> F::Output { + f.await +} + +criterion_group!(benches, single_agent); +criterion_main!(benches); diff --git a/iox_data_generator/schemas/cap-write.toml b/iox_data_generator/schemas/cap-write.toml new file mode 100644 index 0000000000..e796682098 --- /dev/null +++ b/iox_data_generator/schemas/cap-write.toml @@ -0,0 +1,428 @@ +# This config file aims to replicate the data produced by the capwrite tool: +# https://github.com/influxdata/idpe/tree/e493a8e9b6b773e9374a8542ddcab7d8174d320d/performance/capacity/write +name = "cap_write" +base_seed = "correct horse battery staple" + +[[agents]] +name = "cap_write_{{agent_id}}" +count = 3 +sampling_interval = 10 + +[[agents.measurements]] +name = "system" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "n_cpus" + i64_range = [8, 8] + + [[agents.measurements.fields]] + name = "n_users" + i64_range = [2, 11] + + [[agents.measurements.fields]] + name = "uptime" + uptime = "i64" + + [[agents.measurements.fields]] + name = "uptime_format" + uptime = "telegraf" + + [[agents.measurements.fields]] + name = "load1" + f64_range = [0.0, 8.0] + + [[agents.measurements.fields]] + name = "load5" + f64_range = [0.0, 8.0] + + [[agents.measurements.fields]] + name = "load15" + f64_range = [0.0, 8.0] + + +[[agents.measurements]] +name = "mem" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "active" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "available" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "buffered" + i64_range = [0, 10000000] + + 
[[agents.measurements.fields]] + name = "cached" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "inactive" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "slab" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "avaiable_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "wired" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "commit_limit" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "committed_as" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "dirty" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "high_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "high_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_page_size" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_pages_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_pages_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "low_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "low_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "mapped" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "page_tables" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "shared" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "swap_cached" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "swap_free" + i64_range = 
[0, 10000000] + + [[agents.measurements.fields]] + name = "swap_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_chunk" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "write_back" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "write_back_tmp" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "disk" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "inodes_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "inodes_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "inodes_used" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "swap" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 1000000] # Note this is an order of magnitude less deliberately to match + # https://github.com/influxdata/idpe/blob/ffbceb04dd4b3aa0828d039135977a4f36f7b822/performance/capacity/write/swap.go#L17 + # not sure if that value was intentional, perhaps it is to ensure used < total? 
+ + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "out" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "cpu" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.tags]] + name = "cpu" + value = "cpu-total" + + [[agents.measurements.fields]] + name = "usage_user" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_nice" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_system" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_idle" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_irq" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_softirq" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_steal" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_guest" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_guest_nice" + f64_range = [0.0, 100.0] + +[[agents.measurements]] +name = "processes" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "blocked" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "running" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "sleeping" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "stopped" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "zombie" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "dead" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "wait" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "idle" + i64_range = [0, 
255] + + [[agents.measurements.fields]] + name = "paging" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "total_threads" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "unknown" + i64_range = [0, 255] + +[[agents.measurements]] +name = "net" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "bytes_recv" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "bytes_sent" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "packets_sent" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "packets_recv" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "err_in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "err_out" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "drop_in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "drop_out" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "diskio" + [[agents.measurements.tags]] + name = "host" + value = "host-{{agent_id}}" + + [[agents.measurements.fields]] + name = "reads" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "writes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "read_bytes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "write_bytes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "read_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "write_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "io_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "weighted_io_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "iops_in_progress" + i64_range = [0, 1000000] diff --git a/iox_data_generator/schemas/fully-supported.toml 
b/iox_data_generator/schemas/fully-supported.toml new file mode 100644 index 0000000000..c832a177f3 --- /dev/null +++ b/iox_data_generator/schemas/fully-supported.toml @@ -0,0 +1,39 @@ +# Every feature demonstrated in this schema is fully supported in the current implementation. +# Other schemas may demonstrate future features. + +# Every point generated by this schema will contain a tag `data_spec=[this_value]`. +name = "demo_schema" +# This seed can be any string and will be used to seed all random number generators. To change +# the randomness in the points generated by this schema, change this value to something else. +# To generate the same data in the same order as previous runs with this schema (except for any +# elements in this schema you have changed), keep this value the same. +base_seed = "this is a demo" + +[[agents]] +name = "basic" +sampling_interval = 10 # in seconds. TODO: parse nice durations like "12m" and "30s" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "temp" +f64_range = [0.0, 100.0] + +[[agents.measurements.fields]] +name = "location" +pattern = "{{city}}, {{country}}" +replacements = [ + {replace = "city", with = ["San Jose", "San Antonio", "Santa Maria"]}, + {replace = "country", with = ["United States", "Costa Rica", ["Argentina", 10]]}, +] + +[[agents.measurements.fields]] +name = "wave_height" +i64_range = [0, 10] +increment = true +reset_after = 20 + +[[agents.measurements.fields]] +name = "uptime" +uptime = "i64" diff --git a/iox_data_generator/schemas/telegraf.toml b/iox_data_generator/schemas/telegraf.toml new file mode 100644 index 0000000000..ff24680e56 --- /dev/null +++ b/iox_data_generator/schemas/telegraf.toml @@ -0,0 +1,141 @@ +name = "demo_schema" +base_seed = "correct horse battery staple" + +# the most basic spec with no auto generating of agents, measurements, tags or fields +[[agents]] +name = "demo" +sampling_interval = 10 + + [[agents.measurements]] + name = "some_measurement" + + 
[[agents.measurements.tags]] + name = "foo" + value = "bar" + + [[agents.measurements.fields]] + name = "field1" + # it's a boolean field, the true means to generate the boolean randomly with equal probability + bool = true + + [[agents.measurements.fields]] + name = "field2" + # it's an i64 field, values will be generated using a pseudo random number generator + # with a set seed and values in the range [3, 200). Setting it to [3, 3] or [3, 4] will + # make the value always be 3 + i64_range = [3, 200] + + [[agents.measurements.fields]] + name = "field3" + # it's an i64 field, values will be generated using a pseudo random number generator + # with a set seed and values in the range in the range [1000, 5000) + i64_range = [1000, 5000] + # The value after each same will be incremented by the next random amount. This is + # useful when simulating a counter. + increment = true + + [[agents.measurements.fields]] + name = "field4" + # it's an f64 field, values will be generated using a pseudo random number generator + # with a set seed with values in the range [0.0, 100.0). Setting both values to the same + # number will make every value that number. + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "field5" + # this is a string field. Parts of the string will be replaced. {{agent_name}} will be replaced + # with the name of the agent, {{random 200}} will be replaced with a random alphanumeric string + # of the length specified. {{format-time "%Y-%m-%d %H:%M"}} will be replaced with the time for + # this line in the simulation (that is, the same value that this line will have for its + # timestamp) formatted using a strftime specifier. Other patterns will be looked for based on + # the keys in replacements. + pattern = "{{agent_name}} foo {{level}} {{format-time \"%Y-%m-%d %H:%M\"}} {{random 200}}" + # each key in string replacements will be replaced in the pattern with a value randomly + # selected from the array of strings. 
Specify a weight as an integer greater than 1 to change + # the probability that a given string will be selected. + replacements = [ + {replace = "color", with = ["red", "blue", "green"]}, + {replace = "level", with = [ + ["info", 800], + ["warn", 195], + ["error", 5] + ]} + ] + +[[agents]] +name = "some-server-{{agent_id}}" +count = 10 +sampling_interval = 22 + +# Optional: every measurement (row) this agent produces will include a tag with the agent_id filled +# in: +# agent_name=some-server-{{agent_id}} +name_tag_key = "agent_name" + +# Optional: these values will be rotated through so that each agent that gets created will have one. +# e.g: the first agent will always inject region=us-west and secnod will be region=us-east, etc. +tags = [ + {key = "region", values = ["us-west", "us-east", "dublin", "frankfurt"]}, + {key = "foo", values = ["bar", "asdf"]}, +] + + [[agents.measurements]] + name = "few-tags-measurement-{{measurement_id}}" + count = 20 + [[agents.measurements.tags]] + # {{measurement_id}} will be replaced with the id of the measurement this tag is for + name = "tag-1-{{measurement_id}}" + value = "value-1" + + [[agents.measurements.tags]] + name = "tag-2" + # {{cardinality}} will be replaced with the cardinality counter + value = "value-{{cardinality}}" + # Optional: This means each collection on this agent will have 4 rows of this measurement with + # unique values for this tag. 
This could be for things like org_id as a tag or for + # something like cpu measurements in Telegraf where you have a separate line for each cpu: + # cpu,cpu=cpu-total,host=foo usage_user=23.2,usage_system=33.3 + # cpu,cpu=cpu-0,host=foo usage_user=22.2,usage_system=34.5 + # cpu,cpu=cpu-1,host=foo usage_user=11.2,usage_system=56.5 + cardinality = 4 + + [[agents.measurements.tags]] + name = "tag-3" + # {{counter}} will be replaced with the increment counter + value = "value-{{counter}}" + # Optional: This means that {{counter}} will increase by 1 after every 10 samples that are + # pulled. + # This option simulates temporal tag values like process IDs or container IDs in tags + increment_every = 10 + + [[agents.measurements.tags]] + name = "tag-4" + # {{counter}} will be replaced with the increment counter and {{cardinality}} will be replaced + # with the cardinality counter + value = "value-{{counter}}-{{cardinality}}" + # Optional: This means that {{counter}} will increment by 1 after every 100 samples that are + # pulled. 
+ # This option simulates temporal tag values like process IDs or container IDs in tags + increment_every = 100 + # when paired with cardinality, this can simulate having many containers running on a single + # host + cardinality = 10 + + [[agents.measurements.fields]] + name = "field-2" + bool = true + + # This example shows generating 10 different measurements that each have their own set of + # tags (10 of them) and each have their own set of fields (4 of them) + [[agents.measurements]] + name = "mid-tags-measurement-{{measurement_id}}" + count = 10 + [[agents.measurements.tags]] + name = "tag-{{tag_id}}-{{measurement_id}}" + count = 10 + value = "value-{{cardinality}}" + cardinality = 3 + + [[agents.measurements.fields]] + name = "field-1" + bool = true diff --git a/iox_data_generator/schemas/tracing-spec.toml b/iox_data_generator/schemas/tracing-spec.toml new file mode 100644 index 0000000000..8e23ccc0d1 --- /dev/null +++ b/iox_data_generator/schemas/tracing-spec.toml @@ -0,0 +1,52 @@ +name = "tracing_schema" +base_seed = "this is a demo" + +[[agents]] +name = "trace-sender" +sampling_interval = 10 # in seconds. 
TODO: parse nice durations like "12m" and "30s" + +[[agents.measurements]] +name = "traces" + + [[agents.measurements.tags]] + name = "trace_id" + value = "{{guid}}" + + [[agents.measurements.tags]] + name = "span_id" + value = "{{guid}}" + cardinality = 10 + + [[agents.measurements.tags]] + name = "host" + value = "{{host}}" + replacements = [ + {replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}, + ] + resample_every_line = true + + [[agents.measurements.tags]] + name = "region" + value = "{{region}}" + replacements = [ + {replace = "region", with = ["us-west", "us-east"]}, + ] + resample_every_line = false + + [[agents.measurements.tags]] + name = "service" + value = "{{service}}" + replacements = [ + {replace = "service", with = ["nginx", "istio", "storage", "gateway", "redis", "mysql", "s3"]}, + ] + resample_every_line = true + +[[agents.measurements.fields]] +name = "timing" +f64_range = [0.0, 500.0] + +[[agents.measurements.fields]] +name = "depth" +i64_range = [0, 3] +increment = true +reset_after = 10 \ No newline at end of file diff --git a/iox_data_generator/src/agent.rs b/iox_data_generator/src/agent.rs new file mode 100644 index 0000000000..796752d7f9 --- /dev/null +++ b/iox_data_generator/src/agent.rs @@ -0,0 +1,557 @@ +//! 
Agents responsible for generating points + +use crate::{ + measurement::MeasurementGeneratorSet, now_ns, specification, tag::Tag, write::PointsWriter, + DataGenRng, RandomNumberGenerator, +}; + +use influxdb2_client::models::DataPoint; +use snafu::{ResultExt, Snafu}; +use std::{fmt, time::Duration}; +use tracing::{debug, info}; + +/// Agent-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating points +#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when generating points from measurements + #[snafu(display("{}", source))] + CouldNotGeneratePoint { + /// Underlying `measurement` module error that caused this problem + source: crate::measurement::Error, + }, + + /// Error that may happen when creating measurement generator sets + #[snafu(display("Could not create measurement generator sets, caused by:\n{}", source))] + CouldNotCreateMeasurementGeneratorSets { + /// Underlying `measurement` module error that caused this problem + source: crate::measurement::Error, + }, + + /// Error that may happen when writing points + #[snafu(display("Could not write points, caused by:\n{}", source))] + CouldNotWritePoints { + /// Underlying `write` module error that caused this problem + source: crate::write::Error, + }, +} + +/// Each `AgentSpec` informs the instantiation of an `Agent`, which coordinates +/// the generation of the measurements in their specification. 
+#[derive(Debug)] +pub struct Agent { + agent_id: usize, + name: String, + #[allow(dead_code)] + rng: RandomNumberGenerator, + agent_tags: Vec, + measurement_generator_sets: Vec>, + sampling_interval: Option, + /// nanoseconds since the epoch, used as the timestamp for the next + /// generated point + current_datetime: i64, + /// nanoseconds since the epoch, when current_datetime exceeds this, stop + /// generating points + end_datetime: i64, + /// whether to continue generating points after reaching the current time + continue_on: bool, + /// whether this agent is done generating points or not + finished: bool, + /// Optional interval at which to re-run the agent if generating data in + /// "continue" mode + interval: Option, +} + +impl Agent { + /// Create a new agent that will generate data points according to these + /// specs. Substitutions in `name` and `agent_tags` should be made + /// before using them to instantiate an agent. + #[allow(clippy::too_many_arguments)] + pub fn new( + agent_spec: &specification::AgentSpec, + agent_name: impl Into, + agent_id: usize, + parent_seed: impl fmt::Display, + agent_tags: Vec, + start_datetime: Option, // in nanoseconds since the epoch, defaults to now + end_datetime: Option, // also in nanoseconds since the epoch, defaults to now + execution_start_time: i64, + continue_on: bool, // If true, run in "continue" mode after historical data is generated + ) -> Result { + let name = agent_name.into(); + // Will agents actually need rngs? Might just need seeds... 
+ let seed = format!("{}-{}", parent_seed, name); + let rng = RandomNumberGenerator::::new(&seed); + + let measurement_generator_sets = agent_spec + .measurements + .iter() + .map(|spec| { + MeasurementGeneratorSet::new( + &name, + agent_id, + spec, + &seed, + &agent_tags, + execution_start_time, + ) + }) + .collect::>() + .context(CouldNotCreateMeasurementGeneratorSets)?; + + let current_datetime = start_datetime.unwrap_or_else(now_ns); + let end_datetime = end_datetime.unwrap_or_else(now_ns); + + // Convert to nanoseconds + let sampling_interval = agent_spec + .sampling_interval + .map(|s| s as i64 * 1_000_000_000); + + Ok(Self { + agent_id, + name, + rng, + agent_tags, + measurement_generator_sets, + sampling_interval, + current_datetime, + end_datetime, + continue_on, + finished: false, + interval: None, + }) + } + + /// Generate and write points in batches until `generate` doesn't return any + /// points. Meant to be called in a `tokio::task`. + pub async fn generate_all(&mut self, mut points_writer: PointsWriter) -> Result { + let mut total_points = 0; + + let mut points = self.generate().await?; + while !points.is_empty() { + info!("[agent {}] sending {} points", self.name, points.len()); + total_points += points.len(); + points_writer + .write_points(points) + .await + .context(CouldNotWritePoints)?; + points = self.generate().await?; + } + Ok(total_points) + } + + /// Generate data points from the configuration in this agent, one point per + /// measurement contained in this agent's configuration. + pub async fn generate(&mut self) -> Result> { + let mut points = Vec::new(); + + debug!( + "[agent {}] generate more? {} current: {}, end: {}", + self.name, self.finished, self.current_datetime, self.end_datetime + ); + + if !self.finished { + // Save the current_datetime to use in the set of points that we're generating + // because we might increment current_datetime to see if we're done + // or not. 
+ let point_timestamp = self.current_datetime; + + if let Some(i) = &mut self.interval { + i.tick().await; + self.current_datetime = now_ns(); + } else if let Some(ns) = self.sampling_interval { + self.current_datetime += ns; + + if self.current_datetime > self.end_datetime { + if self.continue_on { + let mut i = tokio::time::interval(Duration::from_nanos(ns as u64)); + i.tick().await; // first tick completes immediately + self.current_datetime = now_ns(); + self.interval = Some(i); + } else { + self.finished = true; + } + } + } else { + self.finished = true; + } + + for mgs in &mut self.measurement_generator_sets { + for point in mgs + .generate(point_timestamp) + .context(CouldNotGeneratePoint)? + { + points.push(point); + } + } + } + + Ok(points) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{now_ns, specification::*, ZeroRng}; + use influxdb2_client::models::WriteDataPoint; + + type Error = Box; + type Result = std::result::Result; + + impl Agent { + /// Instantiate an agent only with the parameters we're interested in + /// testing, keeping everything else constant across different + /// tests. 
+ fn test_instance( + sampling_interval: Option, + continue_on: bool, + current_datetime: i64, + end_datetime: i64, + ) -> Self { + let measurement_spec = MeasurementSpec { + name: "measurement-{{agent_id}}-{{measurement_id}}".into(), + count: Some(2), + tags: vec![], + fields: vec![FieldSpec { + name: "field-{{agent_id}}-{{measurement_id}}-{{field_id}}".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60, + increment: false, + reset_after: None, + }, + count: Some(2), + }], + }; + + let measurement_generator_set = + MeasurementGeneratorSet::new("test", 42, &measurement_spec, "spec-test", &[], 0) + .unwrap(); + + Self { + agent_id: 0, + name: String::from("test"), + rng: RandomNumberGenerator::::new("spec-test"), + agent_tags: vec![], + measurement_generator_sets: vec![measurement_generator_set], + finished: false, + interval: None, + + sampling_interval, + current_datetime, + end_datetime, + continue_on, + } + } + } + + fn timestamps(points: &[influxdb2_client::models::DataPoint]) -> Result> { + points + .iter() + .map(|point| { + let mut v = Vec::new(); + point.write_data_point_to(&mut v)?; + let line = String::from_utf8(v)?; + + Ok(line.split(' ').last().unwrap().trim().parse()?) + }) + .collect() + } + + #[rustfmt::skip] + // # Summary: No Sampling Interval + // + // If there isn't a sampling interval, we don't know how often to run, so we can neither + // generate historical data nor can we continue into the future. The only thing we'll do is + // generate once then stop. 
+ // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | None | false | Less | gen 1x, stop | + // | None | false | Equal | gen 1x, stop | + // | None | false | Greater | gen 1x, stop | + // | None | true | Less | gen 1x, stop | + // | None | true | Equal | gen 1x, stop | + // | None | true | Greater | gen 1x, stop | + + mod without_sampling_interval { + use super::*; + + mod without_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, false, 0, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, false, 10, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, false, 11, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + } + + mod with_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, true, 0, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn 
current_time_equal_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, true, 10, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let mut agent = Agent::::test_instance(None, true, 11, 10); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + } + } + + mod with_sampling_interval { + use super::*; + + // The tests take about 5 ms to run on my computer, so set the sampling interval + // to 10 ms to be able to test that the delay is happening when + // `continue` is true without making the tests too artificially slow. + const TEST_SAMPLING_INTERVAL: i64 = 10_000_000; + + #[rustfmt::skip] + // # Summary: Not continuing + // + // If there is a sampling interval but we're not continuing, we should generate points at + // least once but if the current time is greater than the ending time (which might be set + // to `now`), we've generated everything we need to and should stop. 
+ // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | Some(_) | false | Less | gen & increment | + // | Some(_) | false | Equal | gen 1x, stop | + // | Some(_) | false | Greater | gen 1x, stop | + + mod without_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let current = 0; + let end = TEST_SAMPLING_INTERVAL; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let current = TEST_SAMPLING_INTERVAL; + let end = current; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let current = 2 * TEST_SAMPLING_INTERVAL; + let end = TEST_SAMPLING_INTERVAL; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let points = agent.generate().await?; + assert!(points.is_empty(), "expected no points, got {:?}", points); + + Ok(()) + } + } + + #[rustfmt::skip] + // # Summary: After generating historical data, continue sampling in "real time" + // + // If there is a sampling interval and we are continuing, generate points as fast as + // possible (but with 
timestamps separated by sampling_interval amounts) until we catch up + // to `now`. Then add pauses of the sampling_interval's duration, generating points with + // their timestamps set to the current time to simulate "real" point generation. + // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | Some(_) | true | Less | gen, no delay | + // | Some(_) | true | Equal | gen, delay | + // | Some(_) | true | Greater | gen, delay | + + mod with_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let end = now_ns(); + let current = end - TEST_SAMPLING_INTERVAL; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![current, current], times); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![end, end], times); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let end = now_ns(); + let current = end; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![end, end], times); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let real_now = now_ns(); + + let times = timestamps(&points).unwrap(); + for time in times { + assert!( + time <= real_now, + "expected timestamp {} to be generated before now ({}); \ + was {} nanoseconds greater", + time, + real_now, + time - real_now + ); + } + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let end = 
now_ns(); + let current = end + TEST_SAMPLING_INTERVAL; + + let mut agent = + Agent::::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![current, current], times); + + let points = agent.generate().await?; + assert_eq!(points.len(), 2); + + let real_now = now_ns(); + + let times = timestamps(&points).unwrap(); + for time in times { + assert!( + time <= real_now, + "expected timestamp {} to be generated before now ({}); \ + was {} nanoseconds greater", + time, + real_now, + time - real_now + ); + } + + Ok(()) + } + } + } +} diff --git a/iox_data_generator/src/bin/create_database.rs b/iox_data_generator/src/bin/create_database.rs new file mode 100644 index 0000000000..9ec747d8fe --- /dev/null +++ b/iox_data_generator/src/bin/create_database.rs @@ -0,0 +1,157 @@ +#![deny(rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self +)] + +use clap::{App, Arg}; +use generated_types::influxdata::iox::management::v1::{ + self as management, database_rules::*, lifecycle_rules::*, *, +}; + +#[tokio::main] +async fn main() { + let help = r#"IOx database creator + +Examples: + # Create a database named `foo_bar` with the IOx server listening at the default gRPC address: + create_database foo_bar + + # Create a database named `myorg_mybucket` with the IOx server listening at + # 127.0.0.1:9000: + create_database --grpc-bind 127.0.0.1:9000 myorg_mybucket +"#; + + let matches = App::new(help) + .about("IOx Database creation script") + .arg( + Arg::with_name("DATABASE_NAME") + .help("Name of the database to create") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("WRITER") + .long("writer") + .help("The gRPC host and port of the IOx server that should write to Kafka") + .takes_value(true) + .required(true), + ) + .arg( + 
Arg::with_name("READER") + .long("reader") + .help("The gRPC host and port of the IOx server that should read from Kafka") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("KAFKA") + .long("kafka") + .help("The connection address of the Kafka instance") + .takes_value(true) + .default_value("127.0.0.1:9093"), + ) + .get_matches(); + + let db_name = matches + .value_of("DATABASE_NAME") + .expect("DATABASE_NAME is required") + .to_string(); + let writer = matches.value_of("WRITER").expect("WRITER is required"); + let reader = matches.value_of("READER").expect("READER is required"); + let kafka = matches + .value_of("KAFKA") + .expect("KAFKA has a default value"); + + // Edit these to whatever DatabaseRules you want to use + let writer_database_rules = DatabaseRules { + name: db_name.clone(), + partition_template: Some(PartitionTemplate { + parts: vec![partition_template::Part { + part: Some(partition_template::part::Part::Time( + "%Y-%m-%d %H:00:00".into(), + )), + }], + }), + lifecycle_rules: Some(LifecycleRules { + immutable: true, + ..Default::default() + }), + worker_cleanup_avg_sleep: None, + routing_rules: Some(RoutingRules::RoutingConfig(RoutingConfig { + sink: Some(management::Sink { + sink: Some(management::sink::Sink::Kafka(KafkaProducer {})), + }), + })), + write_buffer_connection: Some(WriteBufferConnection::Writing(kafka.to_string())), + }; + let reader_database_rules = DatabaseRules { + name: db_name.clone(), + partition_template: Some(PartitionTemplate { + parts: vec![partition_template::Part { + part: Some(partition_template::part::Part::Time( + "%Y-%m-%d %H:00:00".into(), + )), + }], + }), + lifecycle_rules: Some(LifecycleRules { + buffer_size_soft: 1024 * 1024 * 1024, + buffer_size_hard: 1024 * 1024 * 1024 * 2, + worker_backoff_millis: 100, + max_active_compactions_cfg: Some(MaxActiveCompactionsCfg::MaxActiveCompactions(1)), + persist: true, + persist_row_threshold: 10 * 1000 * 1000, + ..Default::default() + }), + 
worker_cleanup_avg_sleep: None, + routing_rules: Some(RoutingRules::RoutingConfig(RoutingConfig { + sink: Some(management::Sink { + sink: Some(management::sink::Sink::Kafka(KafkaProducer {})), + }), + })), + write_buffer_connection: Some(WriteBufferConnection::Reading(kafka.to_string())), + }; + + // Create the writer db + let writer_grpc_bind_addr = format!("http://{}", writer); + let writer_grpc_channel = influxdb_iox_client::connection::Builder::default() + .build(writer_grpc_bind_addr) + .await + .unwrap(); + let mut writer_management_client = + influxdb_iox_client::management::Client::new(writer_grpc_channel.clone()); + writer_management_client + .create_database(writer_database_rules) + .await + .expect("create writer database failed"); + + // Write a few points + let mut write_client = influxdb_iox_client::write::Client::new(writer_grpc_channel); + let lp_lines = [ + "write_test,region=west user=23.2 100", + "write_test,region=west user=21.0 150", + "write_test,region=east bytes=99i 200", + ]; + let num_lines_written = write_client + .write(&db_name, lp_lines.join("\n")) + .await + .expect("cannot write"); + assert_eq!(num_lines_written, 3); + + // Create the reader db + let reader_grpc_bind_addr = format!("http://{}", reader); + let reader_grpc_channel = influxdb_iox_client::connection::Builder::default() + .build(reader_grpc_bind_addr) + .await + .unwrap(); + let mut reader_management_client = + influxdb_iox_client::management::Client::new(reader_grpc_channel.clone()); + reader_management_client + .create_database(reader_database_rules) + .await + .expect("create reader database failed"); + + println!("Created database {}", db_name); +} diff --git a/iox_data_generator/src/field.rs b/iox_data_generator/src/field.rs new file mode 100644 index 0000000000..dd857496d9 --- /dev/null +++ b/iox_data_generator/src/field.rs @@ -0,0 +1,777 @@ +//! 
Generating a set of field keys and values given a specification + +use crate::{ + now_ns, specification, + substitution::{pick_from_replacements, Substitute}, + DataGenRng, RandomNumberGenerator, +}; + +use influxdb2_client::models::FieldValue; +use rand::Rng; +use serde::Serialize; +use snafu::{ResultExt, Snafu}; +use std::{collections::BTreeMap, fmt, ops::Range, time::Duration}; + +/// Field-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating fields +#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when substituting placeholder values + #[snafu(display("Could not create field name, caused by:\n{}", source))] + CouldNotCreateFieldName { + /// Underlying `substitution` module error that caused this problem + source: crate::substitution::Error, + }, + + /// Error that may happen when substituting placeholder values + #[snafu(display("Could not compile field name template, caused by:\n{}", source))] + CouldNotCompileStringTemplate { + /// Underlying `substitution` module error that caused this problem + source: crate::substitution::Error, + }, +} + +/// A generated field value that will be used in a generated data point. +#[derive(Debug, PartialEq)] +pub struct Field { + /// The key for the field + pub key: String, + /// The value for the field + pub value: FieldValue, +} + +impl Field { + /// Create a new field with the given key and value. + pub fn new(key: impl Into, value: impl Into) -> Self { + Self { + key: key.into(), + value: value.into(), + } + } +} + +/// A set of `count` fields that have the same configuration but different +/// `field_id`s. 
+pub struct FieldGeneratorSet { + field_generators: Vec>, +} + +// field_generators doesn't implement Debug +impl fmt::Debug for FieldGeneratorSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FieldGeneratorSet") + .field("field_generators", &"(dynamic)") + .finish() + } +} + +impl FieldGeneratorSet { + /// Create a new set of field generators for a particular agent, + /// measurement, and field specification. + pub fn new( + agent_name: &str, + agent_id: usize, + measurement_id: usize, + spec: &specification::FieldSpec, + parent_seed: &str, + execution_start_time: i64, + ) -> Result { + let count = spec.count.unwrap_or(1); + + let field_generators = (0..count) + .map(|field_id| { + field_spec_to_generator::( + agent_name, + agent_id, + measurement_id, + field_id, + spec, + parent_seed, + execution_start_time, + ) + }) + .collect::>()?; + + Ok(Self { field_generators }) + } + + /// Create one set of fields + pub fn generate(&mut self, timestamp: i64) -> Vec { + self.field_generators + .iter_mut() + .map(|fg| fg.generate(timestamp)) + .collect() + } +} + +trait FieldGenerator { + fn generate(&mut self, timestamp: i64) -> Field; +} + +/// Generate boolean field names and values. +#[derive(Debug)] +pub struct BooleanFieldGenerator { + name: String, + rng: RandomNumberGenerator, +} + +impl BooleanFieldGenerator { + /// Create a new boolean field generator that will always use the specified + /// name. + pub fn new(name: &str, parent_seed: &str) -> Self { + let name = name.into(); + let seed = format!("{}-{}", parent_seed, name); + let rng = RandomNumberGenerator::::new(seed); + + Self { name, rng } + } +} + +impl FieldGenerator for BooleanFieldGenerator { + fn generate(&mut self, _timestamp: i64) -> Field { + let b: bool = self.rng.gen(); + Field::new(&self.name, b) + } +} + +/// Generate integer field names and values. 
+#[derive(Debug)] +pub struct I64FieldGenerator { + name: String, + range: Range, + increment: bool, + rng: RandomNumberGenerator, + previous_value: i64, + reset_after: Option, + current_tick: usize, +} + +impl I64FieldGenerator { + /// Create a new integer field generator that will always use the specified + /// name. + pub fn new( + name: impl Into, + range: &Range, + increment: bool, + reset_after: Option, + parent_seed: impl fmt::Display, + ) -> Self { + let name = name.into(); + let seed = format!("{}-{}", parent_seed, name); + let rng = RandomNumberGenerator::::new(seed); + + Self { + name, + range: range.to_owned(), + increment, + rng, + previous_value: 0, + reset_after, + current_tick: 0, + } + } +} + +impl FieldGenerator for I64FieldGenerator { + fn generate(&mut self, _timestamp: i64) -> Field { + let mut value = if self.range.start == self.range.end { + self.range.start + } else { + self.rng.gen_range(self.range.clone()) + }; + + if self.increment { + self.previous_value = self.previous_value.wrapping_add(value); + value = self.previous_value; + + if let Some(reset) = self.reset_after { + self.current_tick += 1; + if self.current_tick >= reset { + self.previous_value = 0; + self.current_tick = 0; + } + } + } + + Field::new(&self.name, value) + } +} + +/// Generate floating point field names and values. +#[derive(Debug)] +pub struct F64FieldGenerator { + name: String, + range: Range, + rng: RandomNumberGenerator, +} + +impl F64FieldGenerator { + /// Create a new floating point field generator that will always use the + /// specified name. 
+ pub fn new( + name: impl Into, + range: &Range, + parent_seed: impl fmt::Display, + ) -> Self { + let name = name.into(); + let seed = format!("{}-{}", parent_seed, name); + let rng = RandomNumberGenerator::::new(seed); + + Self { + name, + range: range.to_owned(), + rng, + } + } +} + +impl FieldGenerator for F64FieldGenerator { + fn generate(&mut self, _timestamp: i64) -> Field { + let value = if (self.range.start - self.range.end).abs() < f64::EPSILON { + self.range.start + } else { + self.rng.gen_range(self.range.clone()) + }; + + Field::new(&self.name, value) + } +} + +/// Generate string field names and values. +#[derive(Debug)] +pub struct StringFieldGenerator { + agent_name: String, + name: String, + substitute: Substitute, + rng: RandomNumberGenerator, + replacements: Vec, +} + +impl StringFieldGenerator { + /// Create a new string field generator + pub fn new( + agent_name: impl Into, + name: impl Into, + pattern: impl Into, + parent_seed: impl fmt::Display, + replacements: Vec, + ) -> Result { + let name = name.into(); + let seed = format!("{}-{}", parent_seed, name); + let rng = RandomNumberGenerator::::new(seed); + let substitute = Substitute::new(pattern, RandomNumberGenerator::::new(&rng.seed)) + .context(CouldNotCompileStringTemplate {})?; + + Ok(Self { + agent_name: agent_name.into(), + name, + substitute, + rng, + replacements, + }) + } +} + +impl FieldGenerator for StringFieldGenerator { + fn generate(&mut self, timestamp: i64) -> Field { + #[derive(Serialize)] + struct Values<'a> { + #[serde(flatten)] + replacements: BTreeMap<&'a str, &'a str>, + agent_name: &'a str, + timestamp: i64, + } + + let values = Values { + replacements: pick_from_replacements(&mut self.rng, &self.replacements), + agent_name: &self.agent_name, + timestamp, + }; + + let value = self + .substitute + .evaluate(&values) + .expect("Unable to substitute string field value"); + + Field::new(&self.name, value) + } +} + +/// Generate an i64 field that has the name `uptime` and 
the value of the number +/// of seconds since the data generator started running +#[derive(Debug)] +pub struct UptimeFieldGenerator { + name: String, + execution_start_time: i64, + kind: specification::UptimeKind, +} + +impl UptimeFieldGenerator { + fn new( + name: impl Into, + kind: &specification::UptimeKind, + execution_start_time: i64, + ) -> Self { + Self { + name: name.into(), + kind: *kind, + execution_start_time, + } + } +} + +impl FieldGenerator for UptimeFieldGenerator { + fn generate(&mut self, _timestamp: i64) -> Field { + use specification::UptimeKind::*; + + let elapsed = Duration::from_nanos((now_ns() - self.execution_start_time) as u64); + let elapsed_seconds = elapsed.as_secs(); + + match self.kind { + I64 => Field::new(&self.name, elapsed_seconds as i64), + Telegraf => { + let days = elapsed_seconds / (60 * 60 * 24); + let days_plural = if days == 1 { "" } else { "s" }; + + let mut minutes = elapsed_seconds / 60; + let mut hours = minutes / 60; + hours %= 24; + minutes %= 60; + + let duration_string = + format!("{} day{}, {:02}:{:02}", days, days_plural, hours, minutes); + Field::new(&self.name, duration_string) + } + } + } +} + +fn field_spec_to_generator( + agent_name: &str, + agent_id: usize, + measurement_id: usize, + field_id: usize, + spec: &specification::FieldSpec, + parent_seed: &str, + execution_start_time: i64, +) -> Result> { + use specification::FieldValueSpec::*; + + let spec_name = Substitute::once( + &spec.name, + &[ + ("agent_id", &agent_id.to_string()), + ("measurement_id", &measurement_id.to_string()), + ("field_id", &field_id.to_string()), + ], + ) + .context(CouldNotCreateFieldName)?; + + Ok(match &spec.field_value_spec { + Bool(true) => Box::new(BooleanFieldGenerator::::new(&spec_name, parent_seed)), + Bool(false) => unimplemented!("Not sure what false means for bool fields yet"), + I64 { + range, + increment, + reset_after, + } => Box::new(I64FieldGenerator::::new( + &spec_name, + range, + *increment, + *reset_after, + 
parent_seed, + )), + F64 { range } => Box::new(F64FieldGenerator::::new(&spec_name, range, parent_seed)), + String { + pattern, + replacements, + } => Box::new(StringFieldGenerator::::new( + agent_name, + &spec_name, + pattern, + parent_seed, + replacements.to_vec(), + )?), + Uptime { kind } => Box::new(UptimeFieldGenerator::new( + &spec_name, + kind, + execution_start_time, + )), + }) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{DynamicRng, ZeroRng, TEST_SEED}; + use test_helpers::approximately_equal; + + type Error = Box; + type Result = std::result::Result; + + // Shortcut functions that panic for getting values out of fields for test convenience + impl Field { + fn i64(&self) -> i64 { + match self.value { + FieldValue::I64(v) => v, + ref other => panic!("expected i64, got {:?}", other), + } + } + + fn f64(&self) -> f64 { + match self.value { + FieldValue::F64(v) => v, + ref other => panic!("expected f64, got {:?}", other), + } + } + + fn bool(&self) -> bool { + match self.value { + FieldValue::Bool(v) => v, + ref other => panic!("expected bool, got {:?}", other), + } + } + + fn string(&self) -> String { + match &self.value { + FieldValue::String(v) => v.clone(), + ref other => panic!("expected String, got {:?}", other), + } + } + } + + #[test] + fn generate_boolean_field() { + let mut bfg = BooleanFieldGenerator::::new("bfg", TEST_SEED); + + assert!(!bfg.generate(1234).bool()); + } + + #[test] + fn generate_i64_field_always_the_same() { + // If the specification has the same number for the start and end of the + // range... + let mut i64fg = + I64FieldGenerator::::new("i64fg", &(3..3), false, None, TEST_SEED); + + let i64_fields: Vec<_> = (0..10).map(|_| i64fg.generate(1234).i64()).collect(); + let expected = i64_fields[0]; + + // All the values generated will always be the same. + assert!( + i64_fields.iter().all(|f| *f == expected), + "{:?}", + i64_fields + ); + + // If the specification has n for the start and n+1 for the end of the range... 
+ let mut i64fg = + I64FieldGenerator::::new("i64fg", &(4..5), false, None, TEST_SEED); + + let i64_fields: Vec<_> = (0..10).map(|_| i64fg.generate(1234).i64()).collect(); + // We know what the value will be even though we're using a real random number generator + let expected = 4; + + // All the values generated will also always be the same, because the end of the + // range is exclusive. + assert!( + i64_fields.iter().all(|f| *f == expected), + "{:?}", + i64_fields + ); + } + + #[test] + fn generate_i64_field_within_a_range() { + let range = 3..1000; + + let mut i64fg = + I64FieldGenerator::::new("i64fg", &range, false, None, TEST_SEED); + + let val = i64fg.generate(1234).i64(); + + assert!(range.contains(&val), "`{}` was not in the range", val); + } + + #[test] + fn generate_incrementing_i64_field() { + let mut i64fg = + I64FieldGenerator::::new("i64fg", &(3..10), true, None, TEST_SEED); + + let val1 = i64fg.generate(1234).i64(); + let val2 = i64fg.generate(1234).i64(); + let val3 = i64fg.generate(1234).i64(); + let val4 = i64fg.generate(1234).i64(); + + assert!(val1 < val2, "`{}` < `{}` was false", val1, val2); + assert!(val2 < val3, "`{}` < `{}` was false", val2, val3); + assert!(val3 < val4, "`{}` < `{}` was false", val3, val4); + } + + #[test] + fn incrementing_i64_wraps() { + let rng = RandomNumberGenerator::::new(TEST_SEED); + let range = 3..10; + let previous_value = i64::MAX; + + // Construct by hand to set the previous value at the end of i64's range + let mut i64fg = I64FieldGenerator { + name: "i64fg".into(), + range: range.clone(), + increment: true, + reset_after: None, + rng, + previous_value, + current_tick: 0, + }; + + let resulting_range = + range.start.wrapping_add(previous_value)..range.end.wrapping_add(previous_value); + + let val = i64fg.generate(1234).i64(); + + assert!( + resulting_range.contains(&val), + "`{}` was not in the range", + val + ); + } + + #[test] + fn incrementing_i64_that_resets() { + let reset_after = Some(3); + let mut 
i64fg = + I64FieldGenerator::::new("i64fg", &(3..10), true, reset_after, TEST_SEED); + + let val1 = i64fg.generate(1234).i64(); + let val2 = i64fg.generate(1234).i64(); + let val3 = i64fg.generate(1234).i64(); + let val4 = i64fg.generate(1234).i64(); + + assert!(val1 < val2, "`{}` < `{}` was false", val1, val2); + assert!(val2 < val3, "`{}` < `{}` was false", val2, val3); + assert!(val4 < val3, "`{}` < `{}` was false", val4, val3); + } + + #[test] + fn generate_f64_field_always_the_same() { + // If the specification has the same number for the start and end of the + // range... + let start_and_end = 3.0; + let range = start_and_end..start_and_end; + let mut f64fg = F64FieldGenerator::::new("f64fg", &range, TEST_SEED); + + let f64_fields: Vec<_> = (0..10).map(|_| f64fg.generate(1234).f64()).collect(); + + // All the values generated will always be the same known value. + assert!( + f64_fields + .iter() + .all(|f| approximately_equal(*f, start_and_end)), + "{:?}", + f64_fields + ); + } + + #[test] + fn generate_f64_field_within_a_range() { + let range = 3.0..1000.0; + let mut f64fg = F64FieldGenerator::::new("f64fg", &range, TEST_SEED); + + let val = f64fg.generate(1234).f64(); + assert!(range.contains(&val), "`{}` was not in the range", val); + } + + #[test] + fn generate_string_field_without_replacements() { + let fake_now = 11111; + + let mut stringfg = StringFieldGenerator::::new( + "agent_name", + "stringfg", + "my value", + TEST_SEED, + vec![], + ) + .unwrap(); + + assert_eq!("my value", stringfg.generate(fake_now).string()); + } + + #[test] + fn generate_string_field_with_provided_replacements() { + let fake_now = 5555555555; + + let mut stringfg = StringFieldGenerator::::new( + "double-oh-seven", + "stringfg", + r#"{{agent_name}}---{{random 16}}---{{format-time "%s%f"}}"#, + TEST_SEED, + vec![], + ) + .unwrap(); + + let string_val1 = stringfg.generate(fake_now).string(); + let string_val2 = stringfg.generate(fake_now).string(); + + assert!( + 
string_val1.starts_with("double-oh-seven---"), + "`{}` did not start with `double-oh-seven---`", + string_val1 + ); + assert!( + string_val1.ends_with("---5555555555"), + "`{}` did not end with `---5555555555`", + string_val1 + ); + assert!( + string_val2.starts_with("double-oh-seven---"), + "`{}` did not start with `double-oh-seven---`", + string_val2 + ); + assert!( + string_val2.ends_with("---5555555555"), + "`{}` did not end with `---5555555555`", + string_val2 + ); + + assert_ne!(string_val1, string_val2, "random value should change"); + } + + #[test] + #[should_panic(expected = "Unable to substitute string field value")] + fn unknown_replacement_errors() { + let fake_now = 55555; + + let mut stringfg = StringFieldGenerator::::new( + "arbitrary", + "stringfg", + "static-{{unknown}}", + TEST_SEED, + vec![], + ) + .unwrap(); + + stringfg.generate(fake_now); + } + + #[test] + fn replacements_no_weights() -> Result<()> { + let fake_now = 55555; + + let toml: specification::FieldSpec = toml::from_str( + r#" + name = "sf" + pattern = "foo {{level}}" + replacements = [ + {replace = "level", with = ["info", "warn", "error"]} + ]"#, + ) + .unwrap(); + let mut stringfg = + field_spec_to_generator::("agent_name", 0, 0, 0, &toml, TEST_SEED, fake_now)?; + + assert_eq!("foo info", stringfg.generate(fake_now).string()); + Ok(()) + } + + #[test] + fn replacements_with_weights() -> Result<()> { + let fake_now = 55555; + + let toml: specification::FieldSpec = toml::from_str( + r#" + name = "sf" + pattern = "foo {{level}}" + replacements = [ + {replace = "level", with = [["info", 1000000], ["warn", 1], ["error", 0]]} + ]"#, + ) + .unwrap(); + let mut stringfg = + field_spec_to_generator::("agent_name", 0, 0, 0, &toml, TEST_SEED, fake_now)?; + + assert_eq!("foo info", stringfg.generate(fake_now).string()); + Ok(()) + } + + #[test] + fn uptime_i64() -> Result<()> { + let fake_now = 55555; + + // Pretend data generator started running 10 seconds ago + let seconds_ago = 10; + let 
fake_start_execution_time = now_ns() - seconds_ago * 1_000_000_000; + + let toml: specification::FieldSpec = toml::from_str( + r#" + name = "arbitrary" # field name doesn't have to be uptime + uptime = "i64""#, + ) + .unwrap(); + let mut uptimefg = field_spec_to_generator::( + "agent_name", + 0, + 0, + 0, + &toml, + TEST_SEED, + fake_start_execution_time, + )?; + + assert_eq!(seconds_ago, uptimefg.generate(fake_now).i64()); + Ok(()) + } + + #[test] + fn uptime_telegraf() -> Result<()> { + let fake_now = 55555; + + // Pretend data generator started running 10 days, 2 hours, and 33 minutes ago + let seconds_ago = 10 * 24 * 60 * 60 + 2 * 60 * 60 + 33 * 60; + let fake_start_execution_time = now_ns() - seconds_ago * 1_000_000_000; + + let toml: specification::FieldSpec = toml::from_str( + r#" + name = "arbitrary" # field name doesn't have to be uptime + uptime = "telegraf""#, + ) + .unwrap(); + let mut uptimefg = field_spec_to_generator::( + "agent_name", + 0, + 0, + 0, + &toml, + TEST_SEED, + fake_start_execution_time, + )?; + + assert_eq!("10 days, 02:33", uptimefg.generate(fake_now).string()); + + // Pretend data generator started running 1 day, 14 hours, and 5 minutes ago + // to exercise different formatting + let seconds_in_1_day = 24 * 60 * 60; + let seconds_in_14_hours = 14 * 60 * 60; + let seconds_in_5_minutes = 5 * 60; + + let seconds_ago = seconds_in_1_day + seconds_in_14_hours + seconds_in_5_minutes; + let fake_start_execution_time = now_ns() - seconds_ago * 1_000_000_000; + + let mut uptimefg = field_spec_to_generator::( + "agent_name", + 0, + 0, + 0, + &toml, + TEST_SEED, + fake_start_execution_time, + )?; + + assert_eq!("1 day, 14:05", uptimefg.generate(fake_now).string()); + + Ok(()) + } +} diff --git a/iox_data_generator/src/lib.rs b/iox_data_generator/src/lib.rs new file mode 100644 index 0000000000..80470b34f0 --- /dev/null +++ b/iox_data_generator/src/lib.rs @@ -0,0 +1,357 @@ +//! 
This crate contains structures and generators for specifying how to generate +//! historical and real-time test data for Delorean. The rules for how to +//! generate data and what shape it should take can be specified in a TOML file. +//! +//! Generators can output in line protocol, Parquet, or can be used to generate +//! real-time load on a server that implements the [InfluxDB 2.0 write +//! path][write-api]. +//! +//! [write-api]: https://v2.docs.influxdata.com/v2.0/api/#tag/Write +//! +//! While this generator could be compared to [the Go based one that creates TSM +//! data][go-gen], its purpose is meant to be more far reaching. In addition to +//! generating historical data, it should be useful for generating data in a +//! sequence as you would expect it to arrive in a production environment. That +//! means many agents sending data with their different tags and timestamps. +//! +//! [go-gen]: https://github.com/influxdata/influxdb/pull/12710 + +#![deny(rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + clippy::use_self +)] + +use crate::substitution::Substitute; +use rand::Rng; +use rand_seeder::Seeder; +use snafu::{ResultExt, Snafu}; +use std::{ + convert::TryFrom, + time::{SystemTime, UNIX_EPOCH}, +}; + +pub mod agent; +pub mod field; +pub mod measurement; +pub mod specification; +pub mod substitution; +pub mod tag; +pub mod write; + +/// Errors that may happen while generating points. 
+#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when waiting on a tokio task + #[snafu(display("Could not join tokio task: {}", source))] + TokioError { + /// Underlying tokio error that caused this problem + source: tokio::task::JoinError, + }, + + /// Error that may happen when constructing an agent name + #[snafu(display("Could not create agent name, caused by:\n{}", source))] + CouldNotCreateAgentName { + /// Underlying `substitution` module error that caused this problem + source: substitution::Error, + }, + + /// Error that may happen when an agent generates points + #[snafu(display("Agent could not generate points, caused by:\n{}", source))] + AgentCouldNotGeneratePoints { + /// Underlying `agent` module error that caused this problem + source: agent::Error, + }, + + /// Error that may happen when creating agents + #[snafu(display("Could not create agent `{}`, caused by:\n{}", name, source))] + CouldNotCreateAgent { + /// The name of the relevant agent + name: String, + /// Underlying `agent` module error that caused this problem + source: agent::Error, + }, +} + +type Result = std::result::Result; + +/// Generate data from the configuration in the spec. +/// +/// Provide a writer that the line protocol should be written to. +/// +/// If `start_datetime` or `end_datetime` are `None`, the current datetime will +/// be used. 
+pub async fn generate( + spec: &specification::DataSpec, + points_writer_builder: &mut write::PointsWriterBuilder, + start_datetime: Option, + end_datetime: Option, + execution_start_time: i64, + continue_on: bool, +) -> Result { + let seed = spec.base_seed.to_owned().unwrap_or_else(|| { + let mut rng = rand::thread_rng(); + format!("{:04}", rng.gen_range(0..10000)) + }); + + let mut handles = vec![]; + + // for each agent specification + for agent_spec in &spec.agents { + // create iterators to `cycle` through for `agent_spec.tags` + let tag_set_iterator = tag::AgentTagIterator::new(&agent_spec.tags); + + // create `count` number of agent instances, or 1 agent if no count is specified + let n_agents = agent_spec.count.unwrap_or(1); + + for (agent_id, mut agent_tags) in tag_set_iterator.take(n_agents).enumerate() { + let agent_name = + Substitute::once(&agent_spec.name, &[("agent_id", &agent_id.to_string())]) + .context(CouldNotCreateAgentName)?; + + agent_tags.push(tag::Tag::new("data_spec", &spec.name)); + + if let Some(name_tag_key) = &agent_spec.name_tag_key { + agent_tags.push(tag::Tag::new(name_tag_key, &agent_name)); + } + + let mut agent = agent::Agent::::new( + agent_spec, + &agent_name, + agent_id, + &seed, + agent_tags, + start_datetime, + end_datetime, + execution_start_time, + continue_on, + ) + .context(CouldNotCreateAgent { name: &agent_name })?; + + let agent_points_writer = points_writer_builder.build_for_agent(&agent_name); + + handles.push(tokio::task::spawn(async move { + agent.generate_all(agent_points_writer).await + })); + } + } + + let mut total_points = 0; + for handle in handles { + total_points += handle + .await + .context(TokioError)? 
+ .context(AgentCouldNotGeneratePoints)?; + } + + Ok(total_points) +} + +/// Shorthand trait for the functionality this crate needs a random number generator to have +pub trait DataGenRng: rand::Rng + rand::SeedableRng + Send + 'static {} + +impl DataGenRng for T {} + +/// Encapsulating the creation of an optionally-seedable random number generator +/// to make this easy to change. Uses a 4-digit number expressed as a `String` +/// as the seed type to enable easy creation of another instance using the same +/// seed. +#[derive(Debug)] +pub struct RandomNumberGenerator { + rng: T, + /// The seed used for this instance. + pub seed: String, +} + +impl Default for RandomNumberGenerator { + fn default() -> Self { + let mut rng = rand::thread_rng(); + let seed = format!("{:04}", rng.gen_range(0..10000)); + Self::new(seed) + } +} + +impl RandomNumberGenerator { + /// Create a new instance using the specified seed. + pub fn new(seed: impl Into) -> Self { + let seed = seed.into(); + Self { + rng: Seeder::from(&seed).make_rng(), + seed, + } + } + + /// Generate a random GUID + pub fn guid(&mut self) -> uuid::Uuid { + let mut bytes = [0u8; 16]; + self.rng.fill_bytes(&mut bytes); + uuid::Builder::from_bytes(bytes) + .set_variant(uuid::Variant::RFC4122) + .set_version(uuid::Version::Random) + .build() + } +} + +impl rand::RngCore for RandomNumberGenerator { + fn next_u32(&mut self) -> u32 { + self.rng.next_u32() + } + + fn next_u64(&mut self) -> u64 { + self.rng.next_u64() + } + + fn fill_bytes(&mut self, dest: &mut [u8]) { + self.rng.fill_bytes(dest); + } + + fn try_fill_bytes(&mut self, dest: &mut [u8]) -> std::result::Result<(), rand::Error> { + self.rng.try_fill_bytes(dest) + } +} + +/// Gets the current time in nanoseconds since the epoch +pub fn now_ns() -> i64 { + let since_the_epoch = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + i64::try_from(since_the_epoch.as_nanos()).expect("Time does not fit") +} + +// Always returns 0. 
+#[cfg(test)] +#[derive(Default)] +struct ZeroRng; + +#[cfg(test)] +impl rand::RngCore for ZeroRng { + fn next_u32(&mut self) -> u32 { + self.next_u64() as u32 + } + + fn next_u64(&mut self) -> u64 { + 0 + } + + fn fill_bytes(&mut self, dest: &mut [u8]) { + rand_core::impls::fill_bytes_via_next(self, dest) + } + + fn try_fill_bytes(&mut self, dest: &mut [u8]) -> std::result::Result<(), rand::Error> { + self.fill_bytes(dest); + Ok(()) + } +} + +#[cfg(test)] +impl rand::SeedableRng for ZeroRng { + type Seed = Vec; + + // Ignore the seed value + fn from_seed(_seed: Self::Seed) -> Self { + Self + } +} + +// The test rng ignores the seed anyway, so the seed specified doesn't matter. +#[cfg(test)] +const TEST_SEED: &str = ""; + +#[cfg(test)] +fn test_rng() -> RandomNumberGenerator { + RandomNumberGenerator::::new(TEST_SEED) +} + +// A random number type that does *not* have a predictable sequence of values for use in tests +// that assert on properties rather than exact values. Aliased for convenience in changing to +// a different Rng type. 
+#[cfg(test)] +type DynamicRng = rand::rngs::SmallRng; + +#[cfg(test)] +mod test { + use super::*; + use crate::specification::*; + use influxdb2_client::models::WriteDataPoint; + use std::str::FromStr; + + type Error = Box; + type Result = std::result::Result; + + #[tokio::test] + async fn historical_data_sampling_interval() -> Result<()> { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "basic" +sampling_interval = 10 # seconds + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "up" +bool = true"#; + let data_spec = DataSpec::from_str(toml).unwrap(); + let agent_id = 0; + let agent_spec = &data_spec.agents[0]; + // Take agent_tags out of the equation for the purposes of this test + let agent_tags = vec![]; + + let execution_start_time = now_ns(); + + // imagine we've specified at the command line that we want to generate metrics + // for 1970 + let start_datetime = Some(0); + // for the first 15 seconds of the year + let end_datetime = Some(15 * 1_000_000_000); + + let mut agent = agent::Agent::::new( + agent_spec, + &agent_spec.name, + agent_id, + TEST_SEED, + agent_tags, + start_datetime, + end_datetime, + execution_start_time, + false, + )?; + + let data_points = agent.generate().await?; + let mut v = Vec::new(); + for data_point in data_points { + data_point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = String::from_utf8(v).unwrap(); + + // Get a point for time 0 + let expected_line_protocol = "cpu up=f 0\n"; + assert_eq!(line_protocol, expected_line_protocol); + + let data_points = agent.generate().await?; + let mut v = Vec::new(); + for data_point in data_points { + data_point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = String::from_utf8(v).unwrap(); + + // Get a point for time 10s + let expected_line_protocol = "cpu up=f 10000000000\n"; + assert_eq!(line_protocol, expected_line_protocol); + + // Don't get any points anymore because we're past the ending datetime + let 
data_points = agent.generate().await?; + assert!( + data_points.is_empty(), + "expected no data points, got {:?}", + data_points + ); + + Ok(()) + } +} diff --git a/iox_data_generator/src/main.rs b/iox_data_generator/src/main.rs new file mode 100644 index 0000000000..ab8d549938 --- /dev/null +++ b/iox_data_generator/src/main.rs @@ -0,0 +1,265 @@ +#![deny(rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self +)] + +use chrono::prelude::*; +use chrono_english::{parse_date_string, Dialect}; +use clap::{crate_authors, crate_version, App, Arg}; +use iox_data_generator::{specification::DataSpec, write::PointsWriterBuilder}; +use tracing::info; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + + let help = r#"IOx data point generator + +Examples: + # Generate data points using the specification in `spec.toml` and save in the `lp` directory + iox_data_generator -s spec.toml -o lp + + # Generate data points and write to the server running at localhost:8080 with the provided org, + # bucket and authorization token, creating the bucket + iox_data_generator -s spec.toml -h localhost:8080 --org myorg --org_id 0000111100001111 \ + --bucket mybucket --token mytoken --create + + # Generate data points for the 24 hours between midnight 2020-01-01 and 2020-01-02 + iox_data_generator -s spec.toml -o lp --start 2020-01-01 --end 2020-01-02 + + # Generate data points starting from an hour ago until now, generating the historical data as + # fast as possible. Then generate data according to the sampling interval until terminated. + iox_data_generator -s spec.toml -o lp --start "1 hr ago" --continue + +Logging: + Use the RUST_LOG environment variable to configure the desired logging level. 
+ For example: + + # Enable INFO level logging for all of iox_data_generator + RUST_LOG=iox_data_generator=info iox_data_generator -s spec.toml -o lp + + +"#; + + let matches = App::new(help) + .version(crate_version!()) + .author(crate_authors!()) + .about("IOx data point generator") + .arg( + Arg::with_name("SPECIFICATION") + .short("s") + .long("spec") + .help("Path to the specification TOML file describing the data generation") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("OUTPUT") + .short("o") + .long("output") + .help("The filename to write line protocol") + .takes_value(true), + ) + .arg( + Arg::with_name("HOST") + .short("h") + .long("host") + .help("The host name part of the API endpoint to write to") + .takes_value(true), + ) + .arg( + Arg::with_name("ORG") + .long("org") + .help("The organization name to write to") + .takes_value(true), + ) + .arg( + Arg::with_name("ORG_ID") + .long("org_id") + .help("The 16-digit hex ID of the organization. Only needed if passing `--create`.") + .takes_value(true), + ) + .arg( + Arg::with_name("BUCKET") + .long("bucket") + .help("The bucket name to write to") + .takes_value(true), + ) + .arg( + Arg::with_name("TOKEN") + .long("token") + .help("The API authorization token used for all requests") + .takes_value(true), + ) + .arg( + Arg::with_name("START") + .long("start") + .help( + "The date and time at which to start the timestamps of the generated data. \ + Can be an exact datetime like `2020-01-01T01:23:45-05:00` or a fuzzy \ + specification like `1 hour ago`. If not specified, defaults to now.", + ) + .takes_value(true), + ) + .arg( + Arg::with_name("END") + .long("end") + .help( + "The date and time at which to stop the timestamps of the generated data. \ + Can be an exact datetime like `2020-01-01T01:23:45-05:00` or a fuzzy \ + specification like `1 hour ago`. 
If not specified, defaults to now.", + ) + .takes_value(true), + ) + .arg( + Arg::with_name("create") + .long("create") + .help("Create the bucket specified before sending points. Requires `--org_id`"), + ) + .arg(Arg::with_name("continue").long("continue").help( + "Generate live data using the intervals from the spec after generating historical \ + data. This option has no effect if you specify an end time.", + )) + .get_matches(); + + let spec_filename = matches + .value_of("SPECIFICATION") + // This should never fail if clap is working properly + .expect("SPECIFICATION is a required argument"); + + let execution_start_time = Local::now(); + + let start_datetime = datetime_nanoseconds(matches.value_of("START"), execution_start_time); + let end_datetime = datetime_nanoseconds(matches.value_of("END"), execution_start_time); + + let start_display = start_datetime.unwrap_or_else(|| execution_start_time.timestamp_nanos()); + let end_display = end_datetime.unwrap_or_else(|| execution_start_time.timestamp_nanos()); + + let continue_on = matches.is_present("continue"); + + info!( + "Starting at {}, ending at {} ({}){}", + start_display, + end_display, + (end_display - start_display) / 1_000_000_000, + if continue_on { " then continuing" } else { "" }, + ); + + let data_spec = DataSpec::from_file(spec_filename)?; + + // TODO: parquet output + + let mut points_writer_builder = if let Some(line_protocol_filename) = matches.value_of("OUTPUT") + { + PointsWriterBuilder::new_file(line_protocol_filename)? + } else if let Some(host) = matches.value_of("HOST") { + let (host, org, bucket, token, create_bucket, org_id) = validate_api_arguments( + host, + matches.value_of("ORG"), + matches.value_of("BUCKET"), + matches.value_of("TOKEN"), + matches.is_present("create"), + matches.value_of("ORG_ID"), + ); + + PointsWriterBuilder::new_api(host, org, bucket, token, create_bucket, org_id).await? 
+ } else { + panic!("One of --output or --host must be provided."); + }; + + let result = iox_data_generator::generate::( + &data_spec, + &mut points_writer_builder, + start_datetime, + end_datetime, + execution_start_time.timestamp_nanos(), + continue_on, + ) + .await; + + match result { + Ok(total_points) => eprintln!("Submitted {} total points", total_points), + Err(e) => panic!("Execution failed: \n{}", e), + } + + Ok(()) +} + +fn datetime_nanoseconds(arg: Option<&str>, now: DateTime) -> Option { + arg.map(|s| { + let datetime = parse_date_string(s, now, Dialect::Us).expect("Could not parse time"); + datetime.timestamp_nanos() + }) +} + +fn validate_api_arguments<'a>( + host: &'a str, + org: Option<&'a str>, + bucket: Option<&'a str>, + token: Option<&'a str>, + create_bucket: bool, + org_id: Option<&'a str>, +) -> (&'a str, &'a str, &'a str, &'a str, bool, Option<&'a str>) { + let mut errors = vec![]; + + if create_bucket && org_id.is_none() { + panic!("When `--create` is specified, `--org_id` is required, but it was missing."); + } + + if org.is_none() { + errors.push("`--org` is missing"); + } + if bucket.is_none() { + errors.push("`--bucket` is missing"); + } + if token.is_none() { + errors.push("`--token` is missing"); + } + + if errors.is_empty() { + // These `unwrap`s are safe because otherwise errors wouldn't be empty + ( + host, + org.unwrap(), + bucket.unwrap(), + token.unwrap(), + create_bucket, + org_id, + ) + } else { + panic!( + "When `--host` is specified, `--org`, `--bucket`, and `--token` are required, \ + but {}", + errors.join(", ") + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn none_datetime_is_none_nanoseconds() { + let ns = datetime_nanoseconds(None, Local::now()); + assert!(ns.is_none()); + } + + #[test] + #[ignore] // TODO: I think chrono-english isn't handling timezones the way I'd expect + fn rfc3339() { + let ns = datetime_nanoseconds(Some("2020-01-01T01:23:45-05:00"), Local::now()); + assert_eq!(ns, 
Some(1577859825000000000)); + } + + #[test] + fn relative() { + let fixed_now = Local::now(); + let ns = datetime_nanoseconds(Some("1hr ago"), fixed_now); + let expected = (fixed_now - chrono::Duration::hours(1)).timestamp_nanos(); + assert_eq!(ns, Some(expected)); + } +} diff --git a/iox_data_generator/src/measurement.rs b/iox_data_generator/src/measurement.rs new file mode 100644 index 0000000000..1a245dd98a --- /dev/null +++ b/iox_data_generator/src/measurement.rs @@ -0,0 +1,960 @@ +//! Generating a set of points for one measurement configuration + +use crate::{ + field::FieldGeneratorSet, + specification, + substitution::Substitute, + tag::{Tag, TagGeneratorSet}, + DataGenRng, RandomNumberGenerator, +}; + +use influxdb2_client::models::DataPoint; +use itertools::Itertools; +use snafu::{ResultExt, Snafu}; +use std::fmt; + +/// Measurement-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating measurements +#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when building a data point with the Influx DB + /// client + #[snafu(display( + "Could not build data point for measurement `{}` with Influx Client, caused by:\n{}", + name, + source + ))] + InfluxDataPointError { + /// The name of the relevant measurement + name: String, + /// Underlying Influx Client error that caused this problem + source: influxdb2_client::models::data_point::DataPointError, + }, + + /// Error that may happen when substituting placeholder values + #[snafu(display("Could not create measurement name, caused by:\n{}", source))] + CouldNotCreateMeasurementName { + /// Underlying `substitution` module error that caused this problem + source: crate::substitution::Error, + }, + + /// Error that may happen when creating tag generator sets + #[snafu(display( + "Could not create tag generator sets for measurement `{}`, caused by:\n{}", + name, + source + ))] + CouldNotCreateTagGeneratorSets { + /// The name of the relevant measurement 
+ name: String, + /// Underlying `tag` module error that caused this problem + source: crate::tag::Error, + }, + + /// Error that may happen when creating field generator sets + #[snafu(display( + "Could not create field generator sets for measurement `{}`, caused by:\n{}", + name, + source + ))] + CouldNotCreateFieldGeneratorSets { + /// The name of the relevant measurement + name: String, + /// Underlying `field` module error that caused this problem + source: crate::field::Error, + }, + + /// Error that may happen when generating a particular set of tags + #[snafu(display( + "Could not generate tags for measurement `{}`, caused by:\n{}", + name, + source + ))] + CouldNotGenerateTags { + /// The name of the relevant measurement + name: String, + /// Underlying `tag` module error that caused this problem + source: crate::tag::Error, + }, +} + +/// A set of `count` measurements that have the same configuration but different +/// `measurement_id`s. The `generate` method on a `MeasurementGeneratorSet` will +/// always return `count` points. +#[derive(Debug)] +pub struct MeasurementGeneratorSet { + measurement_generators: Vec>, +} + +impl MeasurementGeneratorSet { + /// Create a new set of measurement generators for a particular agent and + /// measurement specification. 
+ pub fn new( + agent_name: &str, + agent_id: usize, + spec: &specification::MeasurementSpec, + parent_seed: impl fmt::Display, + static_tags: &[Tag], + execution_start_time: i64, + ) -> Result { + let count = spec.count.unwrap_or(1); + + let measurement_generators = (0..count) + .map(|measurement_id| { + MeasurementGenerator::new( + agent_name, + agent_id, + measurement_id, + spec, + &parent_seed, + static_tags, + execution_start_time, + ) + }) + .collect::>()?; + + Ok(Self { + measurement_generators, + }) + } + + /// Create one set of points + pub fn generate(&mut self, timestamp: i64) -> Result> { + let generate_results = self + .measurement_generators + .iter_mut() + .map(|mg| mg.generate(timestamp)); + + itertools::process_results(generate_results, |points| points.flatten().collect()) + } +} + +/// Generate measurements +#[derive(Debug)] +pub struct MeasurementGenerator { + #[allow(dead_code)] + rng: RandomNumberGenerator, + name: String, + static_tags: Vec, + tag_generator_sets: Vec>, + total_tag_cardinality: usize, + field_generator_sets: Vec, + count: usize, +} + +impl MeasurementGenerator { + /// Create a new way to generate measurements from a specification + pub fn new( + agent_name: impl Into, + agent_id: usize, + measurement_id: usize, + spec: &specification::MeasurementSpec, + parent_seed: impl fmt::Display, + static_tags: &[Tag], + execution_start_time: i64, + ) -> Result { + let agent_name = agent_name.into(); + let spec_name = Substitute::once( + &spec.name, + &[ + ("agent_id", &agent_id.to_string()), + ("agent_name", &agent_name), + ("measurement_id", &measurement_id.to_string()), + ], + ) + .context(CouldNotCreateMeasurementName)?; + + let seed = format!("{}-{}", parent_seed, spec_name); + let rng = RandomNumberGenerator::::new(seed); + + let tag_generator_sets: Vec> = spec + .tags + .iter() + .map(|tag_spec| TagGeneratorSet::new(agent_id, measurement_id, tag_spec, &rng.seed)) + .collect::>() + .context(CouldNotCreateTagGeneratorSets { name: 
&spec_name })?; + + let total_tag_cardinality = tag_generator_sets + .iter() + .map(|tgs| tgs.tag_cardinality()) + .product(); + + let field_generator_sets = spec + .fields + .iter() + .map(|field_spec| { + FieldGeneratorSet::new::( + &agent_name, + agent_id, + measurement_id, + field_spec, + &rng.seed, + execution_start_time, + ) + }) + .collect::>() + .context(CouldNotCreateFieldGeneratorSets { name: &spec_name })?; + + Ok(Self { + rng, + name: spec_name, + static_tags: static_tags.to_vec(), + tag_generator_sets, + total_tag_cardinality, + field_generator_sets, + count: spec.count.unwrap_or(1), + }) + } +} + +impl MeasurementGenerator { + fn generate(&mut self, timestamp: i64) -> Result> { + // Split out the tags that we want all combinations of. Perhaps these should be + // a different type? + let mut tags_with_cardinality: Vec<_> = itertools::process_results( + self.tag_generator_sets + .iter_mut() + .filter(|tgs| tgs.tag_cardinality() > 1) + .map(TagGeneratorSet::generate), + |tags| { + tags.multi_cartesian_product() + .map(|tag_set| tag_set.into_iter().flatten().collect()) + .collect() + }, + ) + .context(CouldNotGenerateTags { name: &self.name })?; + + // Ensure we generate something even when there are no tags. + if tags_with_cardinality.is_empty() { + tags_with_cardinality.push(Vec::new()); + } + + let total_tag_cardinality = self.total_tag_cardinality; + assert_eq!(tags_with_cardinality.len(), total_tag_cardinality); + + // Split out the tags that we don't want to include when we're generating all + // possible combinations above. Perhaps these should be a different + // type? Leaving the type annotation here because it's terrible and + // confusing otherwise. + // + // This type is made up of: + // + // - `Vec` comes from one call to `TagGenerator::generate`. Tag + // configurations with a `count` value > 1 generate multiple tags with + // different keys but the same value for each generation. 
The length of this + // vector is the tag configuration's `count`. + // - `Vec>` comes from one call to `TagGenerator::generate_to_zip` and + // is a list of either cloned or resampled tags from this TagGenerator. The + // length of this vector is `total_tag_cardinality`. + // - `Vec>>` comes from collecting all these lists from each + // `TagGeneratorSet` that has a cardinality of 1 (the default). Each + // `TagGeneratorSet` corresponds to one tag configuration. + let tags_without_cardinality_columns = self + .tag_generator_sets + .iter_mut() + .filter(|tgs| tgs.tag_cardinality() == 1) + .map(|tgs| tgs.generate_to_zip(total_tag_cardinality).unwrap()); + + // This is doing a zip over an arbitrary number of iterators... itertools has + // something that produces tuples but I want it to produce Vectors + let mut tags_without_cardinality_column_iters: Vec<_> = tags_without_cardinality_columns + .map(|column| column.into_iter()) + .collect(); + + // For each group of tags that will become one row, + for v in &mut tags_with_cardinality { + // Get the rest of the tags that belong with this row that were either cloned or + // resampled according to their configuration + let tag_row: Vec> = tags_without_cardinality_column_iters + .iter_mut() + .map(|column_iter| { + column_iter.next().expect( + "Should have generated `total_tag_cardinality` items, \ + which should match the length of `tags_with_cardinality`", + ) + }) + .collect(); + // If count can't be combined with replacements, this `for` loop wouldn't be + // needed + for mut tags in tag_row { + v.append(&mut tags); + } + } + + tags_with_cardinality + .iter() + .map(|tags| self.one(&tags[..], timestamp)) + .collect() + } + + fn one(&mut self, tags: &[Tag], timestamp: i64) -> Result { + let mut point = DataPoint::builder(&self.name); + + point = self + .static_tags + .iter() + .fold(point, |point, tag| point.tag(&tag.key, &tag.value)); + + point = tags + .iter() + .fold(point, |point, tag| point.tag(&tag.key, 
&tag.value)); + + for fgs in &mut self.field_generator_sets { + for field in fgs.generate(timestamp) { + point = point.field(&field.key, field.value); + } + } + + point = point.timestamp(timestamp); + + point + .build() + .context(InfluxDataPointError { name: &self.name }) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{specification::*, DynamicRng, ZeroRng, TEST_SEED}; + use influxdb2_client::models::WriteDataPoint; + use std::str; + + type Error = Box; + type Result = std::result::Result; + + impl MeasurementGenerator { + fn generate_string(&mut self, timestamp: i64) -> Result { + self.generate_strings(timestamp) + .map(|mut strings| strings.swap_remove(0)) + } + + fn generate_strings(&mut self, timestamp: i64) -> Result> { + let points = self.generate(timestamp)?; + points + .into_iter() + .map(|point| { + let mut v = Vec::new(); + point.write_data_point_to(&mut v)?; + Ok(String::from_utf8(v)?) + }) + .collect() + } + } + + #[test] + fn generate_measurement() -> Result { + let fake_now = 1234; + + let measurement_spec = MeasurementSpec { + name: "cpu".into(), + count: None, + tags: vec![], + fields: vec![FieldSpec { + name: "response_time".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60, + increment: false, + reset_after: None, + }, + count: None, + }], + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol, + format!("cpu response_time=0i {}\n", fake_now) + ); + + Ok(()) + } + + #[test] + fn generate_measurement_stable_rngs() -> Result { + let fake_now = 5678; + + // This is the same as the previous test but with an additional field. 
+ let measurement_spec = MeasurementSpec { + name: "cpu".into(), + count: Some(2), + tags: vec![], + fields: vec![ + FieldSpec { + name: "load".into(), + field_value_spec: FieldValueSpec::F64 { range: 0.0..100.0 }, + count: None, + }, + FieldSpec { + name: "response_time".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60_000, + increment: false, + reset_after: None, + }, + count: None, + }, + ], + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = vec![measurement_generator.generate_string(fake_now)?]; + let response_times = extract_field_values("response_time", &line_protocol); + + let next_line_protocol = vec![measurement_generator.generate_string(fake_now + 1)?]; + let next_response_times = extract_field_values("response_time", &next_line_protocol); + + // Each line should have a different response time unless we get really, really unlucky + assert_ne!(response_times, next_response_times); + + Ok(()) + } + + #[test] + fn generate_measurement_always_including_some_tags() -> Result { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "cpu".into(), + count: None, + tags: vec![], + fields: vec![FieldSpec { + name: "response_time".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60, + increment: false, + reset_after: None, + }, + count: None, + }], + }; + + let always_tags = vec![Tag::new("my_tag", "my_val")]; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &always_tags, + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol, + format!("cpu,my_tag=my_val response_time=0i {}\n", fake_now), + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_basic_tags() -> Result { + let fake_now = 678; + + let measurement_spec = 
MeasurementSpec { + name: "measurement".into(), + tags: vec![ + TagSpec { + name: "tag_name".into(), + value: "tag_value".into(), + ..Default::default() + }, + TagSpec { + name: "some_name".into(), + value: "some_value".into(), + ..Default::default() + }, + ], + fields: vec![FieldSpec { + name: "field_name".into(), + ..FieldSpec::default() + }], + ..Default::default() + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol, + format!( + "measurement,some_name=some_value,tag_name=tag_value field_name=f {}\n", + fake_now + ) + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_tags_with_count() -> Result { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".into(), + tags: vec![TagSpec { + name: "{{agent_id}}--{{measurement_id}}--tag_name--{{tag_id}}".into(), + value: "tag_value".into(), + count: Some(2), + ..Default::default() + }], + fields: vec![FieldSpec { + name: "field_name".into(), + ..FieldSpec::default() + }], + ..Default::default() + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 42, + 99, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol, + format!("measurement,42--99--tag_name--0=tag_value,42--99--tag_name--1=tag_value field_name=f {}\n", fake_now), + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_tags_with_cardinality() -> Result { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".into(), + tags: vec![TagSpec { + name: "tag_name".into(), + value: "tag_value--{{cardinality}}".into(), + cardinality: Some(2), + ..Default::default() + }], + fields: vec![FieldSpec { + name: 
"field_name".into(), + ..FieldSpec::default() + }], + ..Default::default() + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_strings(fake_now)?; + + assert_eq!( + line_protocol[0], + format!( + "measurement,tag_name=tag_value--0 field_name=f {}\n", + fake_now + ) + ); + assert_eq!( + line_protocol[1], + format!( + "measurement,tag_name=tag_value--1 field_name=f {}\n", + fake_now + ) + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_tags_with_multiple_cardinality() -> Result { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".into(), + tags: vec![ + TagSpec { + name: "alpha".into(), + value: "alpha--{{cardinality}}".into(), + cardinality: Some(2), + ..Default::default() + }, + TagSpec { + name: "beta".into(), + value: "beta--{{cardinality}}".into(), + cardinality: Some(2), + ..Default::default() + }, + ], + fields: vec![FieldSpec { + name: "field_name".into(), + ..FieldSpec::default() + }], + ..Default::default() + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol = measurement_generator.generate_strings(fake_now)?; + + assert_eq!( + line_protocol[0], + format!( + "measurement,alpha=alpha--0,beta=beta--0 field_name=f {}\n", + fake_now + ) + ); + assert_eq!( + line_protocol[1], + format!( + "measurement,alpha=alpha--0,beta=beta--1 field_name=f {}\n", + fake_now + ) + ); + assert_eq!( + line_protocol[2], + format!( + "measurement,alpha=alpha--1,beta=beta--0 field_name=f {}\n", + fake_now + ) + ); + assert_eq!( + line_protocol[3], + format!( + "measurement,alpha=alpha--1,beta=beta--1 field_name=f {}\n", + fake_now + ) + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_tags_with_increment_every() -> Result 
{ + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".into(), + tags: vec![TagSpec { + name: "tag_name".into(), + value: "tag_value--{{counter}}".into(), + increment_every: Some(2), + ..Default::default() + }], + fields: vec![FieldSpec { + name: "field_name".into(), + ..FieldSpec::default() + }], + ..Default::default() + }; + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let line_protocol_1 = measurement_generator.generate_string(fake_now)?; + let line_protocol_2 = measurement_generator.generate_string(fake_now)?; + let line_protocol_3 = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol_1, + format!( + "measurement,tag_name=tag_value--0 field_name=f {}\n", + fake_now, + ), + ); + assert_eq!( + line_protocol_2, + format!( + "measurement,tag_name=tag_value--0 field_name=f {}\n", + fake_now, + ), + ); + assert_eq!( + line_protocol_3, + format!( + "measurement,tag_name=tag_value--1 field_name=f {}\n", + fake_now, + ), + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_replacement() -> Result { + let fake_now = 91011; + + let measurement_spec = MeasurementSpec { + name: "measurement-{{agent_id}}-{{measurement_id}}".into(), + count: Some(2), + tags: vec![], + fields: vec![FieldSpec { + name: "field-{{agent_id}}-{{measurement_id}}-{{field_id}}".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60, + increment: false, + reset_after: None, + }, + count: Some(2), + }], + }; + + let mut measurement_generator_set = MeasurementGeneratorSet::::new( + "agent_name", + 42, + &measurement_spec, + TEST_SEED, + &[], + fake_now, + ) + .unwrap(); + + let points = measurement_generator_set.generate(fake_now).unwrap(); + let mut v = Vec::new(); + for point in points { + point.write_data_point_to(&mut v)?; + } + let line_protocol = str::from_utf8(&v)?; + + assert_eq!( + 
line_protocol, + format!( + "measurement-42-0 field-42-0-0=0i,field-42-0-1=0i {} +measurement-42-1 field-42-1-0=0i,field-42-1-1=0i {} +", + fake_now, fake_now + ) + ); + + Ok(()) + } + + #[test] + fn guid_and_guid_with_cardinality() -> Result<()> { + let fake_now = 678; + + let spec: specification::MeasurementSpec = toml::from_str( + r#" + name = "traces" + + [[tags]] + name = "trace_id" + value = "value-{{guid}}" + + [[tags]] + name = "span_id" + value = "value-{{guid}}" + cardinality = 2 + + [[fields]] + name = "timing" + i64_range = [5, 100]"#, + ) + .unwrap(); + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &spec, + TEST_SEED, + &[], + fake_now, + )?; + + let line_protocol = measurement_generator.generate_strings(fake_now)?; + + let mut trace_ids = extract_tag_values("trace_id", &line_protocol); + trace_ids.sort_unstable(); + trace_ids.dedup(); + // Both lines should have the same trace ID + assert_eq!(trace_ids.len(), 1); + + let mut span_ids = extract_tag_values("span_id", &line_protocol); + span_ids.sort_unstable(); + span_ids.dedup(); + // Each line should have a different span ID + assert_eq!(span_ids.len(), 2); + + let next_line_protocol = measurement_generator.generate_strings(fake_now)?; + + let mut next_trace_ids = extract_tag_values("trace_id", &next_line_protocol); + next_trace_ids.sort_unstable(); + next_trace_ids.dedup(); + // Both lines should have the same trace ID + assert_eq!(next_trace_ids.len(), 1); + + // On each generation, there should be a new trace id + assert_ne!(trace_ids, next_trace_ids); + + let mut next_span_ids = extract_tag_values("span_id", &next_line_protocol); + next_span_ids.sort_unstable(); + next_span_ids.dedup(); + // Each line should have a different span ID + assert_eq!(next_span_ids.len(), 2); + + // On each generation, there should be new span IDs too + assert_ne!(span_ids, next_span_ids); + + Ok(()) + } + + #[test] + fn tag_replacements_with_resampling_true() -> Result<()> { 
+ resampling_test("resample_every_line = true", true) + } + + #[test] + fn tag_replacements_with_resampling_false() -> Result<()> { + resampling_test("resample_every_line = false", false) + } + + #[test] + fn tag_replacements_with_default_resampling_false() -> Result<()> { + resampling_test("", false) + } + + fn resampling_test(resampling_toml: &str, expect_different: bool) -> Result<()> { + let fake_now = 678; + + let spec: specification::MeasurementSpec = toml::from_str(&format!( + r#" + name = "resampling" + + [[tags]] + name = "tag-1" + value = "value-{{{{cardinality}}}}" + cardinality = 10 + + [[tags]] + name = "host" + value = "{{{{host}}}}" + replacements = [ + {{replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}}, + ] + {} + + [[fields]] + name = "timing" + i64_range = [5, 100]"#, + resampling_toml + )) + .unwrap(); + + let mut measurement_generator = MeasurementGenerator::::new( + "agent_name", + 0, + 0, + &spec, + TEST_SEED, + &[], + fake_now, + )?; + + let lines = measurement_generator.generate_strings(fake_now)?; + let mut host_values = extract_tag_values("host", &lines); + host_values.sort_unstable(); + host_values.dedup(); + + if expect_different { + assert!(host_values.len() > 1); + } else { + assert_eq!(host_values.len(), 1); + } + + Ok(()) + } + + // Hacktacular extracting of values from line protocol without pulling in another crate + fn extract_tag_values<'a>(tag_name: &str, lines: &'a [String]) -> Vec<&'a str> { + lines + .iter() + .map(|line| { + let before_space = line.splitn(2, ' ').next().unwrap(); + let prefix = format!(",{}=", tag_name); + let after = before_space.rsplitn(2, &prefix).next().unwrap(); + after.splitn(2, ',').next().unwrap() + }) + .collect() + } + + fn extract_field_values<'a>(field_name: &str, lines: &'a [String]) -> Vec<&'a str> { + lines + .iter() + .map(|line| { + let mut split = line.splitn(2, ' '); + split.next(); + let after_space = split.next().unwrap(); + let prefix = format!(",{}=", field_name); 
+ let after = after_space.rsplitn(2, &prefix).next().unwrap(); + after.splitn(2, ',').next().unwrap() + }) + .collect() + } +} diff --git a/iox_data_generator/src/specification.rs b/iox_data_generator/src/specification.rs new file mode 100644 index 0000000000..cdace76b6f --- /dev/null +++ b/iox_data_generator/src/specification.rs @@ -0,0 +1,616 @@ +//! Reading and interpreting data generation specifications. + +use serde::Deserialize; +use snafu::{ResultExt, Snafu}; +use std::{fs, ops::Range, str::FromStr}; + +/// Errors that may happen while reading a TOML specification. +#[derive(Snafu, Debug)] +pub enum Error { + /// File-related error that may happen while reading a specification + #[snafu(display(r#"Error reading data spec from TOML file: {}"#, source))] + ReadFile { + /// Underlying I/O error that caused this problem + source: std::io::Error, + }, + + /// TOML parsing error that may happen while interpreting a specification + #[snafu(display(r#"Error parsing data spec from TOML: {}"#, source))] + Parse { + /// Underlying TOML error that caused this problem + source: toml::de::Error, + }, +} + +type Result = std::result::Result; + +/// The full specification for the generation of a data set. +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct DataSpec { + /// Every point generated from this configuration will contain a tag + /// `data_spec=[this value]` to identify what generated that data. This + /// name can also be used in string replacements by using the + /// placeholder `{{data_spec}}`. + pub name: String, + /// A string to be used as the seed to the random number generators. + /// + /// When specified, this is used as a base seed propagated through all + /// measurements, tags, and fields, which will each have their own + /// random number generator seeded by this seed plus their name. 
This + /// has the effect of keeping each value sequence generated per measurement, + /// tag, or field stable even if the configurations in other parts of the + /// schema are changed. That is, if you have a field named `temp` and on + /// the first run with base seed `foo` generates the values `[10, 50, + /// 72, 3]`, and then you add another field named `weight` to the schema + /// and run with base seed `foo` again, the values generated for `temp` + /// should again be `[10, 50, 72, 3]`. This enables incremental + /// development of a schema without churn, if that is undesired. + /// + /// When this is not specified, the base seed will be randomly generated. It + /// will be printed to stdout so that the value used can be specified in + /// future configurations if reproducing a particular set of sequences + /// is desired. + pub base_seed: Option, + /// The specification for the data-generating agents in this data set. + pub agents: Vec, +} + +impl DataSpec { + /// Given a filename, read the file and parse the specification. + pub fn from_file(file_name: &str) -> Result { + let spec_toml = fs::read_to_string(file_name).context(ReadFile)?; + Self::from_str(&spec_toml) + } +} + +impl FromStr for DataSpec { + type Err = Error; + + fn from_str(spec_toml: &str) -> std::result::Result::Err> { + let spec: Self = toml::from_str(spec_toml).context(Parse)?; + Ok(spec) + } +} + +/// The specification of the behavior of an agent, the entity responsible for +/// generating a number of data points according to its configuration. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct AgentSpec { + /// Used as the value for the `name` tag if `name_tag_key` is `Some`; has no + /// effect if `name_tag_key` is not specified. 
+ /// + /// Can be a plain string or a string with placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + pub name: String, + /// Specifies the number of agents that should be created with this spec. + /// Default value is 1. + pub count: Option, + /// How often this agent should generate samples, in number of seconds. If + /// not specified, this agent will only generate one sample. + pub sampling_interval: Option, + /// If specified, every measurement generated by this agent will include a + /// tag with this `String` as its key, and with the `AgentSpec`'s `name` + /// as the value (with any substitutions in the `name` performed) + pub name_tag_key: Option, + /// If specified, the values of the tags will be cycled through per `Agent` + /// instance such that all measurements generated by that agent will + /// contain tags with the specified name and that agent's `name` field + /// (with replacements made) as the value. + #[serde(default)] + pub tags: Vec, + /// The specifications for the measurements for the agent to generate. + pub measurements: Vec, +} + +/// Tags that are associated to all measurements that a particular agent +/// generates. The values are rotated through so that each agent gets one of the +/// specified values for this key. +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct AgentTag { + /// The tag key to use when adding this tag to all measurements for an agent + pub key: String, + /// The values to cycle through for each agent for this tag key + pub values: Vec, +} + +/// The specification of how to generate data points for a particular +/// measurement. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct MeasurementSpec { + /// Name of the measurement. 
Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement's ID, which must be used if + /// `count` > 1 so that unique measurement names are created + pub name: String, + /// The number of measurements with this configuration that should be + /// created. Default value is 1. If specified, use `{{measurement_id}}` + /// in this measurement's `name` to create unique measurements. + pub count: Option, + /// Specification of the tags for this measurement + #[serde(default)] + pub tags: Vec, + /// Specification of the fields for this measurement. At least one field is + /// required. + pub fields: Vec, +} + +/// The specification of how to generate tag keys and values for a particular +/// measurement. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct TagSpec { + /// Key/name for this tag. Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement ID + /// - `{{tag_id}}` - the tag ID, which must be used if `count` > 1 so that + /// unique tag names are created + pub name: String, + /// Value for this tag. Can be a plain string or a string with placeholders + /// for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement ID + /// - `{{cardinality}}` - the cardinality counter value. Must use this or + /// `{{guid}}` if `cardinality` > 1 so that unique tag values are created + /// - `{{counter}}` - the increment counter value. Only useful if + /// `increment_every` is set. + /// - `{{guid}}` - a randomly generated unique string. If `cardinality` > 1, + /// each tag will have a different GUID. + pub value: String, + /// The number of tags with this configuration that should be created. + /// Default value is 1. If specified, use `{{tag_id}}` in this tag's + /// `name` to create unique tags. 
+ pub count: Option, + /// A number that controls how many values are generated, which impacts how + /// many rows are created for each agent generation. Default value is 1. + /// If specified, use `{{cardinality}}` or `{{guid}}` in this tag's + /// `value` to create unique values. + pub cardinality: Option, + /// How often to increment the `{{counter}}` value. For example, if + /// `increment_every` is set to 10, `{{counter}}` will increase by 1 + /// after every 10 agent generations. This simulates temporal tag values + /// like process IDs or container IDs in tags. If not specified, the value + /// of `{{counter}}` will always be 0. + pub increment_every: Option, + /// A list of replacement placeholders and the values to replace them with. + /// The values can optionally have weights associated with them to + /// change the probabilities that its value will be used. + #[serde(default)] + pub replacements: Vec, + /// When there are replacements specified and other tags in this measurement + /// with cardinality greater than 1, this option controls whether this + /// tag will get a new replacement value on every line in a generation + /// (`true`) or whether it will be sampled once and have the same value + /// on every line in a generation (`false`). If this tag has no replacements, + /// or if no other tag in this measurement has a cardinality greater than + /// one, this setting has no effect. + #[serde(default)] + pub resample_every_line: bool, +} + + /// The specification of how to generate field keys and values for a particular + /// measurement. + #[derive(Deserialize, Debug)] + #[cfg_attr(test, derive(Default))] + #[serde(from = "FieldSpecIntermediate")] + pub struct FieldSpec { + /// Key/name for this field. 
Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement ID + /// - `{{field_id}}` - the field ID, which must be used if `count` > 1 so + /// that unique field names are created + pub name: String, + /// Specification for the value for this field. + pub field_value_spec: FieldValueSpec, + /// How many fields with this configuration should be created + pub count: Option, +} + +impl From for FieldSpec { + fn from(value: FieldSpecIntermediate) -> Self { + let field_value_spec = if let Some(b) = value.bool { + FieldValueSpec::Bool(b) + } else if let Some((start, end)) = value.i64_range { + FieldValueSpec::I64 { + range: (start..end), + increment: value.increment.unwrap_or(false), + reset_after: value.reset_after, + } + } else if let Some((start, end)) = value.f64_range { + FieldValueSpec::F64 { + range: (start..end), + } + } else if let Some(pattern) = value.pattern { + FieldValueSpec::String { + pattern, + replacements: value.replacements, + } + } else if let Some(kind) = value.uptime { + FieldValueSpec::Uptime { kind } + } else { + panic!( + "Can't tell what type of field value you're trying to specify with this \ + configuration: `{:?}", + value + ); + }; + + Self { + name: value.name, + field_value_spec, + count: value.count, + } + } +} + +/// The specification of a field value of a particular type. Instances should be +/// created by converting a `FieldSpecIntermediate`, which more closely matches +/// the TOML structure. +#[derive(Debug, PartialEq)] +pub enum FieldValueSpec { + /// Configuration of a boolean field. + Bool(bool), + /// Configuration of an integer field. + I64 { + /// The `Range` in which random integer values will be generated. If the + /// range only contains one value, all instances of this field + /// will have the same value. 
+ range: Range, + /// When set to true, after an initial random value in the range is + /// generated, a random increment in the range will be generated + /// and added to the initial value. That means the + /// value for this field will always be increasing. When the value + /// reaches the max value of i64, the value will wrap around to + /// the min value of i64 and increment again. + increment: bool, + /// If `increment` is true, after this many samples, reset the value to + /// start the increasing value over. If this is `None`, the + /// value won't restart until reaching the max value of i64. If + /// `increment` is false, this has no effect. + reset_after: Option, + }, + /// Configuration of a floating point field. + F64 { + /// The `Range` in which random floating point values will be generated. + /// If start == end, all instances of this field will have the + /// same value. + range: Range, + }, + /// Configuration of a string field. + String { + /// Pattern containing placeholders that specifies how to generate the + /// string values. + /// + /// Valid placeholders include: + /// + /// - `{{agent_name}}` - the agent spec's name, with any replacements + /// done + /// - `{{time}}` - the current time in nanoseconds since the epoch. + /// TODO: support specifying a strftime + /// - any other placeholders as specified in `replacements`. If a + /// placeholder has no value specified in `replacements`, it will end + /// up as-is in the field value. + pattern: String, + /// A list of replacement placeholders and the values to replace them + /// with. The values can optionally have weights associated with + /// them to change the probabilities that its value + /// will be used. + replacements: Vec, + }, + /// Configuration of a field with the value of the number of seconds the + /// data generation tool has been running. 
+ Uptime { + /// Format of the uptime value in this field + kind: UptimeKind, + }, +} + +/// The kind of field value to create using the data generation tool's uptime +#[derive(Debug, PartialEq, Copy, Clone, Deserialize)] +pub enum UptimeKind { + /// Number of seconds since the tool started running as an i64 field + #[serde(rename = "i64")] + I64, + /// Number of seconds since the tool started running, formatted as a string + /// field containing the value in the format "x day(s), HH:MM" + #[serde(rename = "telegraf")] + Telegraf, +} + +#[cfg(test)] +impl Default for FieldValueSpec { + fn default() -> Self { + Self::Bool(true) + } +} + +/// An intermediate representation of the field specification that more directly +/// corresponds to the way field configurations are expressed in TOML. This +/// structure is transformed into the `FieldValueSpec` enum that ensures the +/// options for the different field value types are mutually exclusive. +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +struct FieldSpecIntermediate { + /// Key/name for this field. Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement ID + /// - `{{field_id}}` - the field ID, which must be used if `count` > 1 so + /// that unique field names are created + name: String, + /// The number of fields with this configuration that should be created. + /// Default value is 1. If specified, use `{{field_id}}` in this field's + /// `name` to create unique fields. + count: Option, + /// Specify `bool` to make a field that has the Boolean type. `true` means + /// to generate the boolean randomly with equal probability. `false` + /// means...? Specifying any other optional fields along with this one + /// is invalid. + bool: Option, + /// Specify `i64_range` to make an integer field. The values will be + /// randomly generated within the specified range with equal + /// probability. 
If the range only contains one element, all occurrences + /// of this field will have the same value. Can be combined with + /// `increment`; specifying any other optional fields is invalid. + i64_range: Option<(i64, i64)>, + /// Specify `f64_range` to make a floating point field. The values will be + /// randomly generated within the specified range. If start == end, all + /// occurrences of this field will have that value. + /// Can this be combined with `increment`? + f64_range: Option<(f64, f64)>, + /// When set to true with an `i64_range` (is this valid with any other + /// type?), after an initial random value is generated, a random + /// increment will be generated and added to the initial value. That + /// means the value for this field will always be increasing. When the value + /// reaches the end of the range...? The end of the range will be repeated + /// forever? The series will restart at the start of the range? + /// Something else? Setting this to `Some(false)` has the same effect as + /// `None`. + increment: Option, + /// If `increment` is true, after this many samples, reset the value to + /// start the increasing value over. If this is `None`, the value won't + /// restart until reaching the max value of i64. If `increment` is + /// false, this has no effect. + reset_after: Option, + /// Set `pattern` to make a field with the string type. If this doesn't + /// include any placeholders, all occurrences of this field will have + /// this value. + /// + /// Valid placeholders include: + /// + /// - `{{agent_name}}` - the agent spec's name, with any replacements done + /// - `{{time}}` - the current time in nanoseconds since the epoch. TODO: + /// support specifying a strftime + /// - any other placeholders as specified in `replacements`. If a + /// placeholder has no value specified in `replacements`, it will end up + /// as-is in the field value. + pattern: Option, + /// A list of replacement placeholders and the values to replace them with. 
+ /// If a placeholder specified here is not used in `pattern`, it will + /// have no effect. The values may optionally have a probability weight + /// specified with them; if not specified, the value will have weight 1. + /// If no weights are specified, the values will be generated with equal + /// probability. + #[serde(default)] + replacements: Vec, + /// The kind of uptime that should be used for this field. If specified, no + /// other options are valid. If not specified, this is not an uptime + /// field. + uptime: Option, +} + +/// The specification of what values to substitute in for placeholders specified +/// in `String` field values. +#[derive(Deserialize, Debug, PartialEq, Clone)] +#[serde(deny_unknown_fields)] +pub struct Replacement { + /// A placeholder key that can be used in field `pattern`s. + pub replace: String, + /// The possible values to use instead of the placeholder key in `pattern`. + /// Values may optionally have a weight specified. If no weights are + /// specified, the values will be randomly generated with equal + /// probability. The weights are passed to [`rand`'s `choose_weighted` + /// method][choose_weighted] and are a relative likelihood such that the + /// probability of each item being selected is its weight divided by the sum + /// of all weights in this group. + /// + /// [choose_weighted]: https://docs.rs/rand/0.7.3/rand/seq/trait.SliceRandom.html#tymethod.choose_weighted + pub with: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Clone)] +#[serde(untagged, deny_unknown_fields)] +/// A possible value to use instead of a placeholder key, optionally with an +/// associated weight. If no weight is specified, the weight used will be 1. +pub enum ReplacementValue { + /// Just a value without a weight + String(String), + /// A value with a specified relative likelihood weight that gets passed on + /// to [`rand`'s `choose_weighted` method][choose_weighted]. 
The + /// probability of each item being selected is its weight divided by the + /// sum of all weights in the `Replacement` group. + /// + /// [choose_weighted]: https://docs.rs/rand/0.7.3/rand/seq/trait.SliceRandom.html#tymethod.choose_weighted + Weighted(String, u32), +} + +impl ReplacementValue { + /// The associated replacement value + pub fn value(&self) -> &str { + use ReplacementValue::*; + match self { + String(s) => s, + Weighted(s, ..) => s, + } + } + + /// The associated weight value specified; defaults to 1. + pub fn weight(&self) -> u32 { + use ReplacementValue::*; + match self { + String(..) => 1, + Weighted(.., w) => *w, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + type Error = Box; + type Result = std::result::Result; + + static TELEGRAF_TOML: &str = include_str!("../schemas/telegraf.toml"); + + #[test] + fn parse_spec() -> Result { + let spec = DataSpec::from_str(TELEGRAF_TOML)?; + + assert_eq!(spec.name, "demo_schema"); + assert_eq!(spec.agents.len(), 2); + + let agent0 = &spec.agents[0]; + assert_eq!(agent0.name, "demo"); + + let agent0_measurements = &agent0.measurements; + assert_eq!(agent0_measurements.len(), 1); + + let a0m0 = &agent0_measurements[0]; + assert_eq!(a0m0.name, "some_measurement"); + + let a0m0_fields = &a0m0.fields; + assert_eq!(a0m0_fields.len(), 5); + + let a0m0f0 = &a0m0_fields[0]; + assert_eq!(a0m0f0.name, "field1"); + assert_eq!(a0m0f0.field_value_spec, FieldValueSpec::Bool(true)); + + let a0m0f1 = &a0m0_fields[1]; + assert_eq!(a0m0f1.name, "field2"); + assert_eq!( + a0m0f1.field_value_spec, + FieldValueSpec::I64 { + range: 3..200, + increment: false, + reset_after: None, + } + ); + + let a0m0f2 = &a0m0_fields[2]; + assert_eq!(a0m0f2.name, "field3"); + assert_eq!( + a0m0f2.field_value_spec, + FieldValueSpec::I64 { + range: 1000..5000, + increment: true, + reset_after: None, + } + ); + + let a0m0f3 = &a0m0_fields[3]; + assert_eq!(a0m0f3.name, "field4"); + assert_eq!( + a0m0f3.field_value_spec, + 
FieldValueSpec::F64 { range: 0.0..100.0 } + ); + + let a0m0f4 = &a0m0_fields[4]; + assert_eq!(a0m0f4.name, "field5"); + assert_eq!( + a0m0f4.field_value_spec, + FieldValueSpec::String { + pattern: + "{{agent_name}} foo {{level}} {{format-time \"%Y-%m-%d %H:%M\"}} {{random 200}}" + .into(), + replacements: vec![ + Replacement { + replace: "color".into(), + with: vec![ + ReplacementValue::String("red".into()), + ReplacementValue::String("blue".into()), + ReplacementValue::String("green".into()) + ], + }, + Replacement { + replace: "level".into(), + with: vec![ + ReplacementValue::Weighted("info".into(), 800), + ReplacementValue::Weighted("warn".into(), 195), + ReplacementValue::Weighted("error".into(), 5) + ], + } + ], + } + ); + + Ok(()) + } + + #[test] + fn parse_fully_supported_spec() -> Result<()> { + // The fully supported spec is mostly for manual testing, but we should make + // sure while developing that it's valid as well so that when we go to + // do manual testing it isn't broken + + // Also read it from the file to test `DataSpec::from_file` rather than + // include_str + + let data_spec = DataSpec::from_file("schemas/fully-supported.toml")?; + + assert_eq!(data_spec.name, "demo_schema"); + + Ok(()) + } + + #[test] + fn not_specifying_vectors_gets_default_empty_vector() { + let toml = r#" +name = "demo_schema" +base_seed = "this is a demo" + +[[agents]] +name = "basic" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "host" +pattern = "server" +"#; + let spec = DataSpec::from_str(toml).unwrap(); + + let agent0 = &spec.agents[0]; + assert!(agent0.tags.is_empty()); + + let agent0_measurements = &agent0.measurements; + let a0m0 = &agent0_measurements[0]; + assert!(a0m0.tags.is_empty()); + + let a0m0_fields = &a0m0.fields; + let a0m0f0 = &a0m0_fields[0]; + let field_spec = &a0m0f0.field_value_spec; + + assert!( + matches!(field_spec, FieldValueSpec::String { replacements, .. 
} if replacements.is_empty()), + "expected a String field with empty replacements; was {:?}", + field_spec + ); + } +} diff --git a/iox_data_generator/src/substitution.rs b/iox_data_generator/src/substitution.rs new file mode 100644 index 0000000000..59aa1a3f33 --- /dev/null +++ b/iox_data_generator/src/substitution.rs @@ -0,0 +1,268 @@ +//! Substituting dynamic values into a template as specified in various places +//! in the schema. + +use crate::{specification, DataGenRng, RandomNumberGenerator}; +use chrono::prelude::*; +use handlebars::{ + Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderError, +}; +use rand::{distributions::Alphanumeric, seq::SliceRandom, Rng}; +use serde::Serialize; +use snafu::{ResultExt, Snafu}; +use std::{collections::BTreeMap, convert::TryInto, sync::Mutex}; + +/// Substitution-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while substituting values into templates. +#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when substituting placeholder values + #[snafu(display( + "Could not perform text substitution in `{}`, caused by:\n{}", + template, + source + ))] + CantCompileTemplate { + /// Underlying Handlebars error that caused this problem + source: handlebars::TemplateError, + /// Template that caused this problem + template: String, + }, + + /// Error that may happen when substituting placeholder values + #[snafu(display( + "Could not perform text substitution in `{}`, caused by:\n{}", + template, + source + ))] + CantPerformSubstitution { + /// Underlying Handlebars error that caused this problem + source: handlebars::RenderError, + /// Template that caused this problem + template: String, + }, +} + +#[derive(Debug)] +struct RandomHelper(Mutex>); + +impl HelperDef for RandomHelper { + fn call<'reg: 'rc, 'rc>( + &self, + h: &Helper<'_, '_>, + _: &Handlebars<'_>, + _: &Context, + _: &mut RenderContext<'_, '_>, + out: &mut dyn Output, + ) -> 
HelperResult { + let param = h + .param(0) + .ok_or_else(|| RenderError::new("`random` requires a parameter"))? + .value() + .as_u64() + .ok_or_else(|| RenderError::new("`random`'s parameter must be an unsigned integer"))? + .try_into() + .map_err(|_| RenderError::new("`random`'s parameter must fit in a usize"))?; + + let rng = &mut *self.0.lock().expect("mutex poisoned"); + + let random: String = std::iter::repeat(()) + .map(|()| rng.sample(Alphanumeric)) + .map(char::from) + .take(param) + .collect(); + + out.write(&random)?; + + Ok(()) + } +} + +#[derive(Debug)] +struct FormatNowHelper; + +impl HelperDef for FormatNowHelper { + fn call<'reg: 'rc, 'rc>( + &self, + h: &Helper<'_, '_>, + _: &Handlebars<'_>, + c: &Context, + _: &mut RenderContext<'_, '_>, + out: &mut dyn Output, + ) -> HelperResult { + let format = h + .param(0) + .ok_or_else(|| RenderError::new("`format-time` requires a parameter"))? + .render(); + + let timestamp = c + .data() + .get("timestamp") + .and_then(|t| t.as_i64()) + .expect("Caller of `render` should have set `timestamp` to an `i64` value"); + + let datetime = Utc.timestamp_nanos(timestamp); + + out.write(&datetime.format(&format).to_string())?; + + Ok(()) + } +} + +/// Given a handlebars template containing placeholders within double curly +/// brackets like `{{placeholder}}` and a list of `(placeholder, substitution +/// value)` pairs, place the values in the template where the relevant +/// placeholder is. +#[derive(Debug)] +pub struct Substitute { + handlebars: Handlebars<'static>, + template: String, +} + +impl Substitute { + /// Compile and evaluate a template once. If you need to evaluate + /// it multiple times, construct an instance via [`new`]. + /// + /// If a placeholder appears in a template but not in the list of + /// substitution values, this will return an error. 
+ pub fn once(template: &str, values: &[(&str, &str)]) -> Result { + let values = values + .iter() + .map(|&(k, v)| (k, v)) + .collect::>(); + let me = Self::new_minimal(template)?; + me.evaluate(&values) + } + + /// Compiles the handlebars template once, then allows reusing the + /// template multiple times via [`evaluate`]. If you don't need to + /// reuse the template, you can use [`once`]. + pub fn new( + template: impl Into, + rng: RandomNumberGenerator, + ) -> Result { + let mut me = Self::new_minimal(template)?; + me.set_random_number_generator(rng); + Ok(me) + } + + fn new_minimal(template: impl Into) -> Result { + let template = template.into(); + + let mut handlebars = Handlebars::new(); + handlebars.set_strict_mode(true); + + handlebars.register_helper("format-time", Box::new(FormatNowHelper)); + + handlebars + .register_template_string("template", &template) + .context(CantCompileTemplate { + template: &template, + })?; + + Ok(Self { + handlebars, + template, + }) + } + + fn set_random_number_generator(&mut self, rng: RandomNumberGenerator) { + self.handlebars + .register_helper("random", Box::new(RandomHelper(Mutex::new(rng)))); + } + + /// Interpolates the values into the compiled template. + /// + /// If a placeholder appears in a template but not in the list of + /// substitution values, this will return an error. + pub fn evaluate(&self, values: &impl Serialize) -> Result { + self.handlebars + .render("template", &values) + .context(CantPerformSubstitution { + template: &self.template, + }) + } +} + +/// Given a random number generator and replacement specification, choose a +/// particular value from the list of possible values according to any specified +/// weights (or with equal probability if there are no weights). 
+pub fn pick_from_replacements<'a, T: DataGenRng>( + rng: &mut RandomNumberGenerator, + replacements: &'a [specification::Replacement], +) -> BTreeMap<&'a str, &'a str> { + replacements + .iter() + .map(|replacement| { + let chosen = replacement + .with + .choose_weighted(rng, |value| value.weight()) + .expect("`Replacement` `with` should have items") + .value(); + + (replacement.replace.as_str(), chosen) + }) + .collect() +} + +#[cfg(test)] +mod test { + use super::*; + use crate::test_rng; + + type Error = Box; + type Result = std::result::Result; + + #[derive(Serialize)] + struct TimestampArgs { + timestamp: i64, + } + + #[test] + fn format_now_valid_strftime() -> Result { + let rng = test_rng(); + let args = TimestampArgs { + timestamp: 1599154445000000000, + }; + + let substitute = + Substitute::new(r#"the date is {{format-time "%Y-%m-%d"}}."#, rng).unwrap(); + + let value = substitute.evaluate(&args)?; + + assert_eq!(value, "the date is 2020-09-03."); + + Ok(()) + } + + #[test] + #[should_panic(expected = "a Display implementation returned an error unexpectedly: Error")] + fn format_now_invalid_strftime_panics() { + let rng = test_rng(); + let args = TimestampArgs { + timestamp: 1599154445000000000, + }; + + let substitute = Substitute::new(r#"the date is {{format-time "%-B"}}."#, rng).unwrap(); + + substitute.evaluate(&args).expect("This is unreachable"); + } + + #[test] + fn format_now_missing_strftime() -> Result { + let rng = test_rng(); + let args = TimestampArgs { + timestamp: 1599154445000000000, + }; + + let substitute = Substitute::new(r#"the date is {{format-time}}."#, rng).unwrap(); + + let result = substitute.evaluate(&args); + + // TODO: better matching on the error + assert!(result.is_err()); + + Ok(()) + } +} diff --git a/iox_data_generator/src/tag.rs b/iox_data_generator/src/tag.rs new file mode 100644 index 0000000000..33d02c86af --- /dev/null +++ b/iox_data_generator/src/tag.rs @@ -0,0 +1,495 @@ +//! 
Generating a set of tag keys and values given a specification + +use crate::{ + specification, + substitution::{pick_from_replacements, Substitute}, + DataGenRng, RandomNumberGenerator, +}; +use snafu::{ResultExt, Snafu}; +use std::fmt; + +/// Tag-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating tags +#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when substituting placeholder values in tag keys + #[snafu(display("Could not create tag key, caused by:\n{}", source))] + CouldNotCreateTagKey { + /// Underlying `substitution` module error that caused this problem + source: crate::substitution::Error, + }, + + /// Error that may happen when substituting placeholder values in tag values + #[snafu(display( + "Could not generate tag value for tag `{}`, caused by:\n{}", + key, + source + ))] + CouldNotGenerateTagValue { + /// The key of the tag we couldn't create a value for + key: String, + /// Underlying `substitution` module error that caused this problem + source: crate::substitution::Error, + }, +} + +/// A generated tag value that will be used in a generated data point. +#[derive(Debug, Clone, PartialEq)] +pub struct Tag { + /// The key for the tag + pub key: String, + /// The value for the tag + pub value: String, +} + +impl Tag { + /// Create a new tag with the given key and value. + pub fn new(key: impl Into, value: impl Into) -> Self { + Self { + key: key.into(), + value: value.into(), + } + } +} + +/// A set of `count` tags that have the same configuration but different +/// `tag_id`s. +#[derive(Debug)] +pub struct TagGeneratorSet { + tags: Vec>, +} + +impl TagGeneratorSet { + /// Create a new set of tag generators for a particular agent, measurement, + /// and tag specification. 
+ pub fn new( + agent_id: usize, + measurement_id: usize, + spec: &specification::TagSpec, + parent_seed: impl fmt::Display, + ) -> Result { + let cardinality = spec.cardinality.unwrap_or(1); + + let seed = format!("{}-{}", parent_seed, spec.name); + + let tags = (0..cardinality) + .map(|cardinality| { + TagGenerator::new(agent_id, measurement_id, spec, cardinality, &seed) + }) + .collect::>()?; + + Ok(Self { tags }) + } + + /// Generate one set of tags + pub fn generate(&mut self) -> Result>> { + self.tags.iter_mut().map(TagGenerator::generate).collect() + } + + /// For tags that shouldn't be included in the multi cartesian product + /// because they have cardinality 1, this method takes the number of + /// lines needed, looks at whether this tag should be resampled or not, + /// and generates the number of lines worth of tags requested. + pub fn generate_to_zip(&mut self, num_lines: usize) -> Result>> { + // This is a hack. A better way would be to have a different type for tags with + // cardinality = 1, and only that type has this method. + if self.tags.len() != 1 { + panic!("generate_to_zip is only for use with cardinality 1") + } + (&mut self.tags[0]).generate_to_zip(num_lines) + } + + /// The cardinality of this tag configuration, used to figure out how many + /// rows each generation will create in total. 
+ pub fn tag_cardinality(&self) -> usize { + self.tags.len() + } +} + +#[derive(Debug)] +struct TagGenerator { + agent_id: String, + measurement_id: String, + tags: Vec, + cardinality: u32, + counter: usize, + current_tick: usize, + increment_every: Option, + rng: RandomNumberGenerator, + replacements: Vec, + resample_every_line: bool, +} + +impl TagGenerator { + fn new( + agent_id: usize, + measurement_id: usize, + spec: &specification::TagSpec, + cardinality: u32, + parent_seed: impl fmt::Display, + ) -> Result { + let count = spec.count.unwrap_or(1); + let increment_every = spec.increment_every; + let agent_id = agent_id.to_string(); + let measurement_id = measurement_id.to_string(); + + let seed = format!("{}-{}-{}", parent_seed, spec.name, cardinality); + let rng = RandomNumberGenerator::::new(seed); + + let tags = (0..count) + .map(|tag_id| { + let key = Substitute::once( + &spec.name, + &[ + ("agent_id", &agent_id), + ("measurement_id", &measurement_id), + ("tag_id", &tag_id.to_string()), + ], + ) + .context(CouldNotCreateTagKey)?; + + Ok(Tag { + key, + value: spec.value.clone(), + }) + }) + .collect::>()?; + + Ok(Self { + agent_id, + measurement_id, + tags, + cardinality, + counter: 0, + current_tick: 0, + increment_every, + rng, + replacements: spec.replacements.clone(), + resample_every_line: spec.resample_every_line, + }) + } + + fn generate(&mut self) -> Result> { + let counter = self.increment().to_string(); + let cardinality_string = self.cardinality.to_string(); + let guid = self.rng.guid().to_string(); + + let mut substitutions = pick_from_replacements(&mut self.rng, &self.replacements); + substitutions.insert("agent_id", &self.agent_id); + substitutions.insert("measurement_id", &self.measurement_id); + substitutions.insert("counter", &counter); + substitutions.insert("cardinality", &cardinality_string); + substitutions.insert("guid", &guid); + let substitutions: Vec<_> = substitutions.into_iter().collect(); + + self.tags + .iter() + .map(|tag| { + 
let key = tag.key.clone(); + let value = Substitute::once(&tag.value, &substitutions) + .context(CouldNotGenerateTagValue { key: &key })?; + + Ok(Tag { key, value }) + }) + .collect() + } + + // if count and replacements/resampling could never be used on the same tag + // configuration, then this could return `Result>` I think. This + // could also possibly return an iterator rather than a Vec; the measurement + // immediately iterates over it + fn generate_to_zip(&mut self, num_lines: usize) -> Result>> { + if self.resample_every_line { + Ok((0..num_lines) + .map(|_| self.generate()) + .collect::>()?) + } else { + let tags = self.generate()?; + Ok(std::iter::repeat(tags).take(num_lines).collect()) + } + } + + /// Returns the current value and potentially increments the counter for + /// next time. + fn increment(&mut self) -> usize { + let counter = self.counter; + + if let Some(increment) = self.increment_every { + self.current_tick += 1; + if self.current_tick >= increment { + self.counter += 1; + self.current_tick = 0; + } + } + + counter + } +} + +/// Cycles through each value for each agent tag +pub struct AgentTagIterator { + iters: Vec>>, +} + +impl fmt::Debug for AgentTagIterator { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AgentTagIterator") + .field("iters", &"(dynamic)") + .finish() + } +} + +impl AgentTagIterator { + /// Create a new iterator to manage the cycling + pub fn new(agent_tags: &[specification::AgentTag]) -> Self { + Self { + iters: agent_tags + .iter() + .map(|agent_tag| { + boxed_cycling_iter(agent_tag.key.clone(), agent_tag.values.clone()) + }) + .collect(), + } + } +} + +fn boxed_cycling_iter(key: String, values: Vec) -> Box> { + Box::new(values.into_iter().cycle().map(move |v| Tag::new(&key, &v))) +} + +impl Iterator for AgentTagIterator { + type Item = Vec; + + fn next(&mut self) -> Option { + Some(self.iters.iter_mut().flat_map(|i| i.next()).collect()) + } +} + +#[cfg(test)] +mod test { + use 
super::*; + use crate::{specification::*, ZeroRng, TEST_SEED}; + + #[test] + fn empty_agent_spec_tag_set_always_returns_empty_vec() { + let agent = AgentSpec { + tags: vec![], + ..AgentSpec::default() + }; + + let mut iter = AgentTagIterator::new(&agent.tags); + + assert_eq!(iter.next().unwrap(), vec![]); + } + + #[test] + fn agent_spec_tag_set() { + let tag_alpha = toml::from_str( + r#"key = "alpha" + values = ["1", "2", "3"]"#, + ) + .unwrap(); + let tag_omega = toml::from_str( + r#"key = "omega" + values = ["apple", "grape"]"#, + ) + .unwrap(); + + let agent = AgentSpec { + tags: vec![tag_alpha, tag_omega], + ..AgentSpec::default() + }; + + let mut iter = AgentTagIterator::new(&agent.tags); + + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "1"), Tag::new("omega", "apple"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "2"), Tag::new("omega", "grape"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "3"), Tag::new("omega", "apple"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "1"), Tag::new("omega", "grape"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "2"), Tag::new("omega", "apple"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "3"), Tag::new("omega", "grape"),] + ); + assert_eq!( + iter.next().unwrap(), + vec![Tag::new("alpha", "1"), Tag::new("omega", "apple"),] + ); + } + + #[test] + fn all_the_tag_substitutions_everywhere() -> Result<()> { + let spec = TagSpec { + name: "{{agent_id}}x{{measurement_id}}x{{tag_id}}".into(), + value: "{{agent_id}}v{{measurement_id}}v{{cardinality}}v{{counter}}".into(), + count: Some(2), + cardinality: Some(3), + increment_every: Some(1), + ..Default::default() + }; + + let mut tg = TagGeneratorSet::::new(22, 33, &spec, TEST_SEED)?; + + let tags = tg.generate()?; + assert_eq!( + vec![ + vec![ + Tag::new("22x33x0", "22v33v0v0"), + Tag::new("22x33x1", "22v33v0v0"), + ], + vec![ + Tag::new("22x33x0", 
"22v33v1v0"), + Tag::new("22x33x1", "22v33v1v0"), + ], + vec![ + Tag::new("22x33x0", "22v33v2v0"), + Tag::new("22x33x1", "22v33v2v0"), + ], + ], + tags + ); + + let tags = tg.generate()?; + assert_eq!( + vec![ + vec![ + Tag::new("22x33x0", "22v33v0v1"), + Tag::new("22x33x1", "22v33v0v1"), + ], + vec![ + Tag::new("22x33x0", "22v33v1v1"), + Tag::new("22x33x1", "22v33v1v1"), + ], + vec![ + Tag::new("22x33x0", "22v33v2v1"), + Tag::new("22x33x1", "22v33v2v1"), + ], + ], + tags + ); + + Ok(()) + } + + #[test] + fn string_replacements() -> Result<()> { + let host_tag_spec: specification::TagSpec = toml::from_str( + r#"name = "host" + value = "{{host}}" + replacements = [ + {replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}, + ]"#, + ) + .unwrap(); + + let mut tg = TagGeneratorSet::::new(22, 33, &host_tag_spec, TEST_SEED)?; + + let tags = tg.generate()?; + + assert_eq!(vec![vec![Tag::new("host", "serverA")]], tags); + + Ok(()) + } + + #[test] + fn generate_to_zip_with_resample() -> Result<()> { + let host_tag_spec: specification::TagSpec = toml::from_str( + r#"name = "host" + value = "{{host}}" + replacements = [ + {replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}, + ] + resample_every_line = true + "#, + ) + .unwrap(); + + let mut tg = TagGeneratorSet::::new(22, 33, &host_tag_spec, TEST_SEED)?; + + let tags = tg.generate_to_zip(3)?; + + assert_eq!( + vec![ + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")], + ], + tags + ); + + Ok(()) + } + + #[test] + fn generate_to_zip_without_resample() -> Result<()> { + let host_tag_spec: specification::TagSpec = toml::from_str( + r#"name = "host" + value = "{{host}}" + replacements = [ + {replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}, + ] + resample_every_line = false + "#, + ) + .unwrap(); + + let mut tg = TagGeneratorSet::::new(22, 33, &host_tag_spec, TEST_SEED)?; + + let tags = tg.generate_to_zip(3)?; + + 
assert_eq!( + vec![ + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")], + ], + tags + ); + + Ok(()) + } + + #[test] + fn generate_to_zip_with_default_no_resample() -> Result<()> { + let host_tag_spec: specification::TagSpec = toml::from_str( + r#"name = "host" + value = "{{host}}" + replacements = [ + {replace = "host", with = ["serverA", "serverB", "serverC", "serverD"]}, + ]"#, + ) + .unwrap(); + + let mut tg = TagGeneratorSet::::new(22, 33, &host_tag_spec, TEST_SEED)?; + + let tags = tg.generate_to_zip(3)?; + + assert_eq!( + vec![ + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")], + vec![Tag::new("host", "serverA")] + ], + tags + ); + + Ok(()) + } +} diff --git a/iox_data_generator/src/write.rs b/iox_data_generator/src/write.rs new file mode 100644 index 0000000000..9e03effc2c --- /dev/null +++ b/iox_data_generator/src/write.rs @@ -0,0 +1,361 @@ +//! Writing generated points + +use futures::stream; +use influxdb2_client::models::{DataPoint, PostBucketRequest, WriteDataPoint}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +#[cfg(test)] +use std::{ + collections::BTreeMap, + sync::{Arc, Mutex}, +}; +use std::{ + fs, + fs::OpenOptions, + path::{Path, PathBuf}, +}; +use tracing::info; + +/// Errors that may happen while writing points. 
+#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when writing line protocol to a no-op sink + #[snafu(display("Could not generate line protocol: {}", source))] + CantWriteToNoOp { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing line protocol to a file + #[snafu(display("Could not write line protocol to file: {}", source))] + CantWriteToLineProtocolFile { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when creating a directory to store files to write + /// to + #[snafu(display("Could not create directory: {}", source))] + CantCreateDirectory { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when checking a path's metadata to see if it's a + /// directory + #[snafu(display("Could not get metadata: {}", source))] + CantGetMetadata { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen if the path given to the file-based writer isn't a + /// directory + #[snafu(display("Expected to get a directory"))] + MustBeDirectory, + + /// Error that may happen while writing points to the API + #[snafu(display("Could not write points to API: {}", source))] + CantWriteToApi { + /// Underlying Influx client request error that caused this problem + source: influxdb2_client::RequestError, + }, + + /// Error that may happen while trying to create a bucket via the API + #[snafu(display("Could not create bucket: {}", source))] + CantCreateBucket { + /// Underlying Influx client request error that caused this problem + source: influxdb2_client::RequestError, + }, + + /// Error that may happen if attempting to create a bucket without + /// specifying the org ID + #[snafu(display("Could not create a bucket without an `org_id`"))] + OrgIdRequiredToCreateBucket, +} + +type Result = std::result::Result; + +/// 
Responsible for holding shared configuration needed to construct per-agent +/// points writers +#[derive(Debug)] +pub struct PointsWriterBuilder { + config: PointsWriterConfig, +} + +#[derive(Debug)] +enum PointsWriterConfig { + Api { + client: influxdb2_client::Client, + org: String, + bucket: String, + }, + Directory(PathBuf), + NoOp { + perform_write: bool, + }, + #[cfg(test)] + Vector(BTreeMap>>>), +} + +impl PointsWriterBuilder { + /// Write points to the API at the specified host and put them in the + /// specified org and bucket. + pub async fn new_api( + host: impl Into, + org: impl Into, + bucket: impl Into, + token: impl Into, + create_bucket: bool, + org_id: Option<&str>, + ) -> Result { + let host = host.into(); + + // Be somewhat lenient on what we accept as far as host; the client expects the + // protocol to be included. We could pull in the url crate and do more + // verification here. + let host = if host.starts_with("http") { + host + } else { + format!("http://{}", host) + }; + + let client = influxdb2_client::Client::new(host, token.into()); + let org = org.into(); + let bucket = bucket.into(); + + if create_bucket { + let org_id = org_id.context(OrgIdRequiredToCreateBucket)?.to_string(); + let bucket = PostBucketRequest { + org_id, + name: bucket.clone(), + ..Default::default() + }; + + client + .create_bucket(Some(bucket)) + .await + .context(CantCreateBucket)?; + } + + Ok(Self { + config: PointsWriterConfig::Api { + client, + org, + bucket, + }, + }) + } + + /// Write points to a file in the directory specified. 
+ pub fn new_file>(path: P) -> Result { + fs::create_dir_all(&path).context(CantCreateDirectory)?; + let metadata = fs::metadata(&path).context(CantGetMetadata)?; + ensure!(metadata.is_dir(), MustBeDirectory); + + Ok(Self { + config: PointsWriterConfig::Directory(PathBuf::from(path.as_ref())), + }) + } + + /// Generate points but do not write them anywhere + pub fn new_no_op(perform_write: bool) -> Self { + Self { + config: PointsWriterConfig::NoOp { perform_write }, + } + } + + /// Create a writer out of this writer's configuration for a particular + /// agent that runs in a separate thread/task. + pub fn build_for_agent(&mut self, agent_name: &str) -> PointsWriter { + let inner_writer = match &mut self.config { + PointsWriterConfig::Api { + client, + org, + bucket, + } => InnerPointsWriter::Api { + client: client.clone(), + org: org.clone(), + bucket: bucket.clone(), + }, + PointsWriterConfig::Directory(dir_path) => { + let mut filename = dir_path.clone(); + filename.push(agent_name); + filename.set_extension("txt"); + InnerPointsWriter::File(filename) + } + PointsWriterConfig::NoOp { perform_write } => InnerPointsWriter::NoOp { + perform_write: *perform_write, + }, + #[cfg(test)] + PointsWriterConfig::Vector(ref mut agents_by_name) => { + let v = agents_by_name + .entry(agent_name.to_string()) + .or_insert_with(|| Arc::new(Mutex::new(Vec::new()))); + InnerPointsWriter::Vec(Arc::clone(v)) + } + }; + + PointsWriter { inner_writer } + } +} + +/// Responsible for writing points to the location it's been configured for. 
+#[derive(Debug)] +pub struct PointsWriter { + inner_writer: InnerPointsWriter, +} + +impl PointsWriter { + /// Write these points + pub async fn write_points(&mut self, points: Vec) -> Result<()> { + self.inner_writer.write_points(points).await + } +} + +#[derive(Debug)] +enum InnerPointsWriter { + Api { + client: influxdb2_client::Client, + org: String, + bucket: String, + }, + File(PathBuf), + NoOp { + perform_write: bool, + }, + #[cfg(test)] + Vec(Arc>>), +} + +impl InnerPointsWriter { + async fn write_points(&mut self, points: Vec) -> Result<()> { + match self { + Self::Api { + client, + org, + bucket, + } => { + client + .write(org, bucket, stream::iter(points)) + .await + .context(CantWriteToApi)?; + } + Self::File(filename) => { + info!("Opening file {:?}", filename); + let num_points = points.len(); + let file = OpenOptions::new() + .append(true) + .create(true) + .open(&filename) + .context(CantWriteToLineProtocolFile)?; + + let mut file = std::io::BufWriter::new(file); + for point in points { + point + .write_data_point_to(&mut file) + .context(CantWriteToLineProtocolFile)?; + } + info!("Wrote {} points to {:?}", num_points, filename); + } + Self::NoOp { perform_write } => { + if *perform_write { + let mut sink = std::io::sink(); + + for point in points { + point + .write_data_point_to(&mut sink) + .context(CantWriteToNoOp)?; + } + } + } + #[cfg(test)] + Self::Vec(ref mut vec) => { + let vec_ref = Arc::clone(vec); + let mut vec = vec_ref.lock().expect("Should be able to get lock"); + for point in points { + point + .write_data_point_to(&mut *vec) + .expect("Should be able to write to vec"); + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{generate, now_ns, specification::*, ZeroRng}; + use std::str::FromStr; + + type Error = Box; + type Result = std::result::Result; + + impl PointsWriterBuilder { + fn new_vec() -> Self { + Self { + config: PointsWriterConfig::Vector(BTreeMap::new()), + } + } + + fn written_data(self, 
agent_name: &str) -> String { + match self.config { + PointsWriterConfig::Vector(agents_by_name) => { + let bytes_ref = agents_by_name + .get(agent_name) + .expect("Should have written some data, did not find any for this agent") + .clone(); + let bytes = bytes_ref + .lock() + .expect("Should have been able to get a lock"); + String::from_utf8(bytes.to_vec()).expect("we should be generating valid UTF-8") + } + _ => unreachable!("this method is only valid when writing to a vector for testing"), + } + } + } + + #[tokio::test] + async fn test_generate() -> Result<()> { + let toml = r#" +name = "demo_schema" +base_seed = "this is a demo" + +[[agents]] +name = "basic" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "up" +bool = true"#; + + let data_spec = DataSpec::from_str(toml).unwrap(); + let mut points_writer_builder = PointsWriterBuilder::new_vec(); + + let now = now_ns(); + + generate::( + &data_spec, + &mut points_writer_builder, + Some(now), + Some(now), + now, + false, + ) + .await?; + + let line_protocol = points_writer_builder.written_data("basic"); + + let expected_line_protocol = format!( + r#"cpu,data_spec=demo_schema up=f {} +"#, + now + ); + assert_eq!(line_protocol, expected_line_protocol); + + Ok(()) + } +} diff --git a/scripts/genlp.py b/scripts/genlp.py index d8f8b01768..d1c4a6fd55 100755 --- a/scripts/genlp.py +++ b/scripts/genlp.py @@ -6,8 +6,7 @@ # ./scripts/genlp.py | head -n 2000 # ``` # -# Please use https://github.com/influxdata/iox_data_generator for anything -# more complicated. +# Please use iox_data_generator for anything more complicated. # from signal import signal, SIGPIPE, SIG_DFL