[package]
name = "read_buffer"
version = "0.1.0"
authors = ["Edd Robinson <me@edd.io>"]
edition = "2018"

# Note this crate is designed to be standalone, and should not depend
# on the IOx Query Engine. The rationale is:
#
# 1. Keep the API clean (and concerns separated) and allow for potential future reuse outside of IOx
# 2. Keep change/compile/link time down during development when working on just this crate

[dependencies] # In alphabetical order
arrow = { version = "5.0", features = ["prettyprint"] }
arrow_util = { path = "../arrow_util" }
chrono = "0.4"
croaring = "0.5"
data_types = { path = "../data_types" }
datafusion = { path = "../datafusion" }
either = "1.6.1"
hashbrown = "0.11"
internal_types = { path = "../internal_types" }
itertools = "0.10.1"
metrics = { path = "../metrics" }
observability_deps = { path = "../observability_deps" }
packers = { path = "../packers" }
parking_lot = "0.11"
permutation = "0.2.5"
snafu = "0.6"

[dev-dependencies] # In alphabetical order
criterion = "0.3.3"
rand = "0.8.3"
rand_distr = "0.4.0"
test_helpers = { path = "../test_helpers" }

# Criterion benchmarks; `harness = false` hands `main` to Criterion
# instead of the default libtest bench harness.
[[bench]]
name = "database"
harness = false

[[bench]]
name = "sum_fixed"
harness = false

[[bench]]
name = "string"
harness = false

[[bench]]
name = "read"
harness = false