Merge branch 'main' into dom/sharded-cache

pull/24376/head
kodiakhq[bot] 2022-02-08 16:09:48 +00:00 committed by GitHub
commit ace76cef14
97 changed files with 6563 additions and 2214 deletions

View File

@ -104,7 +104,7 @@ jobs:
command: find scripts -type f ! \( -iname '*.py' -or -iname '*.supp' \) -exec shellcheck {} +
- run:
name: Yamllint
command: yamllint --strict .
command: yamllint --config-file .circleci/yamllint.yml --strict .
- cache_save
cargo_audit:
docker:

3
.circleci/yamllint.yml Normal file
View File

@ -0,0 +1,3 @@
rules:
truthy:
check-keys: false

67
.github/workflows/semantic_check.sh vendored Executable file
View File

@ -0,0 +1,67 @@
#!/usr/bin/env bash
shopt -s nocasematch
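# A message passes if it contains one of the conventional-commit types below,
# optionally followed by a scope in parentheses, then a colon, at least one
# space, and a non-empty subject. With nocasematch set above, matching is
# case-insensitive (so "CHORE: foo" is accepted).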
semantic_pattern='(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\([^)]+\))?: +[^ ]'
if [[ $1 == "test" ]]; then
exit_code=0
echo checking strings that should be OK
expect_ok="chore: foo
chore(hello): foo
CHORE: foo"
while read -r s; do
if [[ ! $s =~ $semantic_pattern ]]; then
echo got FAIL, expected OK: "$s"
exit_code=1
fi
done <<< "$expect_ok"
echo checking strings that should FAIL
expect_fail="more: foo
chore(: foo
chore : foo
chore:
chore:
chore:foo
"
while read -r s; do
if [[ $s =~ $semantic_pattern ]]; then
echo got OK, expected FAIL: "$s"
exit_code=1
fi
done <<< "$expect_fail"
exit $exit_code
fi
# nb: quotes are often not required around env var names between [[ and ]]
if [[ -z $PR_TITLE || -z $COMMITS_URL ]]; then
echo ::error::required env vars: PR_TITLE, COMMITS_URL
exit 1
fi
exit_code=0
if [[ ! $PR_TITLE =~ $semantic_pattern ]]; then
echo ::error::PR title not semantic: "$PR_TITLE"
exit_code=1
else
echo PR title OK: "$PR_TITLE"
fi
json=$(curl --silent "$COMMITS_URL")
commits=$(echo "$json" | jq --raw-output '.[] | [.sha, .commit.message] | join(" ") | split("\n") | first')
while read -r commit; do
commit_title=$(echo "$commit" | cut -c 42-999)
if [[ ! $commit_title =~ $semantic_pattern ]]; then
echo ::error::Commit title not semantic: "$commit"
exit_code=1
else
echo Commit title OK: "$commit"
fi
done <<< "$commits"
exit $exit_code

20
.github/workflows/semantic_check.yml vendored Normal file
View File

@ -0,0 +1,20 @@
---
name: "Semantic PR and Commit Messages"
on:
pull_request:
types: [opened, reopened, synchronize, edited]
env:
PR_TITLE: ${{ github.event.pull_request.title }}
COMMITS_URL: ${{ github.event.pull_request.commits_url }}
jobs:
main:
name: Semantic PR and commit messages
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
ref: ${{ github.event.pull_request.head.sha }}
- run: bash .github/workflows/semantic_check.sh

395
Cargo.lock generated
View File

@ -678,9 +678,9 @@ dependencies = [
[[package]]
name = "crc32fast"
version = "1.3.1"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2209c310e29876f7f0b2721e7e26b84aff178aa3da5d091f9bfbf47669e60e3"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [
"cfg-if",
]
@ -841,6 +841,51 @@ dependencies = [
"syn",
]
[[package]]
name = "darling"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim 0.10.0",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "dashmap"
version = "4.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c"
dependencies = [
"cfg-if",
"num_cpus",
]
[[package]]
name = "data_types"
version = "0.1.0"
@ -962,6 +1007,17 @@ dependencies = [
"uuid",
]
[[package]]
name = "derivative"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "diff"
version = "0.1.12"
@ -1269,9 +1325,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3dda0b6588335f360afc675d0564c17a77a2bda81ca178a4b6081bd86c7f0b"
checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010"
dependencies = [
"futures-core",
"futures-sink",
@ -1279,15 +1335,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0c8ff0461b82559810cdccfde3215c3f373807f5e5232b71479bff7bb2583d7"
checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3"
[[package]]
name = "futures-executor"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29d6d2ff5bb10fb95c85b8ce46538a2e5f5e7fdc755623a7d4529ab8a4ed9d2a"
checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6"
dependencies = [
"futures-core",
"futures-task",
@ -1307,15 +1363,15 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f9d34af5a1aac6fb380f735fe510746c38067c5bf16c7fd250280503c971b2"
checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b"
[[package]]
name = "futures-macro"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbd947adfffb0efc70599b3ddcf7b5597bb5fa9e245eb99f62b3a5f7bb8bd3c"
checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512"
dependencies = [
"proc-macro2",
"quote",
@ -1324,21 +1380,21 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3055baccb68d74ff6480350f8d6eb8fcfa3aa11bdc1a1ae3afdd0514617d508"
checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868"
[[package]]
name = "futures-task"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ee7c6485c30167ce4dfb83ac568a849fe53274c831081476ee13e0dce1aad72"
checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a"
[[package]]
name = "futures-test"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e741bc851e1e90ad08901b329389ae77e02d5e9a0ec61955b80834630fbdc2f"
checksum = "8c3e9379dbbfb35dd6df79e895d73c0f75558827fe68eb853b858ff417a8ee98"
dependencies = [
"futures-core",
"futures-executor",
@ -1353,9 +1409,9 @@ dependencies = [
[[package]]
name = "futures-util"
version = "0.3.19"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5cf40b47a271f77a8b1bec03ca09044d99d2372c0de244e66430761127164"
checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a"
dependencies = [
"futures-channel",
"futures-core",
@ -1428,6 +1484,36 @@ version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
[[package]]
name = "gitops_adapter"
version = "0.1.0"
dependencies = [
"assert_matches",
"async-trait",
"chrono",
"clap 3.0.13",
"dotenv",
"futures",
"glob",
"k8s-openapi",
"kube",
"kube-derive",
"kube-runtime",
"parking_lot 0.11.2",
"pbjson-build",
"prost",
"schemars",
"serde",
"serde_json",
"thiserror",
"tokio",
"tonic",
"tonic-build",
"tracing",
"trogging",
"workspace-hack",
]
[[package]]
name = "glob"
version = "0.3.0"
@ -1690,6 +1776,12 @@ dependencies = [
"tokio-native-tls",
]
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.2.3"
@ -1929,6 +2021,7 @@ version = "0.1.0"
dependencies = [
"arrow",
"arrow_util",
"async-trait",
"base64 0.13.0",
"bytes",
"chrono",
@ -2021,6 +2114,7 @@ dependencies = [
"schema",
"snafu",
"sqlx",
"test_helpers",
"tokio",
"uuid",
"workspace-hack",
@ -2129,6 +2223,28 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "json-patch"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f995a3c8f2bc3dd52a18a583e90f9ec109c047fa1603a853e46bcda14d2e279d"
dependencies = [
"serde",
"serde_json",
"treediff",
]
[[package]]
name = "jsonpath_lib"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eaa63191d68230cccb81c5aa23abd53ed64d83337cacbb25a7b8c7979523774f"
dependencies = [
"log",
"serde",
"serde_json",
]
[[package]]
name = "jsonwebtoken"
version = "7.2.0"
@ -2136,13 +2252,126 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32"
dependencies = [
"base64 0.12.3",
"pem",
"pem 0.8.3",
"ring",
"serde",
"serde_json",
"simple_asn1",
]
[[package]]
name = "k8s-openapi"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f8de9873b904e74b3533f77493731ee26742418077503683db44e1b3c54aa5c"
dependencies = [
"base64 0.13.0",
"bytes",
"chrono",
"schemars",
"serde",
"serde-value",
"serde_json",
]
[[package]]
name = "kube"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84dcc2f8ca3f2427a72acc31fa9538159f6b33a97002e315a3fcd5323cf51a2b"
dependencies = [
"k8s-openapi",
"kube-client",
"kube-core",
]
[[package]]
name = "kube-client"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8957106140aa24a76de3f7d005966f381b30a4cd6a9c003b3bba6828e9617535"
dependencies = [
"base64 0.13.0",
"bytes",
"chrono",
"dirs-next",
"either",
"futures",
"http",
"http-body",
"hyper",
"hyper-timeout",
"hyper-tls",
"jsonpath_lib",
"k8s-openapi",
"kube-core",
"openssl",
"pem 1.0.2",
"pin-project",
"serde",
"serde_json",
"serde_yaml",
"thiserror",
"tokio",
"tokio-native-tls",
"tokio-util",
"tower",
"tower-http",
"tracing",
]
[[package]]
name = "kube-core"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ec73e7d8e937dd055d962af06e635e262fdb6ed341c36ecf659d4fece0a8005"
dependencies = [
"chrono",
"form_urlencoded",
"http",
"json-patch",
"k8s-openapi",
"once_cell",
"serde",
"serde_json",
"thiserror",
]
[[package]]
name = "kube-derive"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6651bfae82bc23439da1099174b52bcbf68df065dc33317c912e3c5c5cea43c"
dependencies = [
"darling",
"proc-macro2",
"quote",
"serde_json",
"syn",
]
[[package]]
name = "kube-runtime"
version = "0.64.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b090d3d7b43e2d60fa93ca51b19fe9f2e05a5252c97880fe834f8fa9f2de605"
dependencies = [
"dashmap",
"derivative",
"futures",
"json-patch",
"k8s-openapi",
"kube-client",
"pin-project",
"serde",
"serde_json",
"smallvec",
"thiserror",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
@ -2221,9 +2450,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.116"
version = "0.2.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "565dbd88872dbe4cc8a46e527f26483c1d1f7afa6b884a3bd6cd893d4f98da74"
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
[[package]]
name = "libloading"
@ -2257,6 +2486,12 @@ dependencies = [
"workspace-hack",
]
[[package]]
name = "linked-hash-map"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3"
[[package]]
name = "linux-raw-sys"
version = "0.0.37"
@ -3149,6 +3384,15 @@ dependencies = [
"regex",
]
[[package]]
name = "pem"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947"
dependencies = [
"base64 0.13.0",
]
[[package]]
name = "percent-encoding"
version = "2.1.0"
@ -4083,6 +4327,30 @@ dependencies = [
"workspace-hack",
]
[[package]]
name = "schemars"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6b5a3c80cea1ab61f4260238409510e814e38b4b563c06044edf91e7dc070e3"
dependencies = [
"dyn-clone",
"schemars_derive",
"serde",
"serde_json",
]
[[package]]
name = "schemars_derive"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41ae4dce13e8614c46ac3c38ef1c0d668b101df6ac39817aebdaa26642ddae9b"
dependencies = [
"proc-macro2",
"quote",
"serde_derive_internals",
"syn",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
@ -4137,6 +4405,16 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-value"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c"
dependencies = [
"ordered-float 2.10.0",
"serde",
]
[[package]]
name = "serde-xml-rs"
version = "0.4.1"
@ -4170,6 +4448,17 @@ dependencies = [
"syn",
]
[[package]]
name = "serde_derive_internals"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dbab34ca63057a1f15280bdf3c39f2b1eb1b54c17e98360e511637aef7418c6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.78"
@ -4203,6 +4492,18 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_yaml"
version = "0.8.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a521f2940385c165a24ee286aa8599633d162077a54bdcae2a6fd5a7bfa7a0"
dependencies = [
"indexmap",
"ryu",
"serde",
"yaml-rust",
]
[[package]]
name = "server"
version = "0.1.0"
@ -4905,6 +5206,7 @@ dependencies = [
"futures-sink",
"log",
"pin-project-lite",
"slab",
"tokio",
]
@ -5011,6 +5313,24 @@ dependencies = [
"tracing",
]
[[package]]
name = "tower-http"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81eca72647e58054bbfa41e6f297c23436f1c60aff6e5eb38455a0f9ca420bb5"
dependencies = [
"base64 0.13.0",
"bytes",
"futures-core",
"futures-util",
"http",
"http-body",
"pin-project",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.1"
@ -5095,11 +5415,12 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.21"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4"
checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23"
dependencies = [
"lazy_static",
"valuable",
]
[[package]]
@ -5171,6 +5492,15 @@ dependencies = [
"workspace-hack",
]
[[package]]
name = "treediff"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "761e8d5ad7ce14bb82b7e61ccc0ca961005a275a060b9644a2431aa11553c2ff"
dependencies = [
"serde_json",
]
[[package]]
name = "trogging"
version = "0.1.0"
@ -5276,6 +5606,12 @@ dependencies = [
"getrandom",
]
[[package]]
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
version = "0.2.15"
@ -5669,6 +6005,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3"
[[package]]
name = "yaml-rust"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85"
dependencies = [
"linked-hash-map",
]
[[package]]
name = "zeroize"
version = "1.5.2"

View File

@ -9,6 +9,7 @@ members = [
"db",
"dml",
"generated_types",
"gitops_adapter",
"grpc-router",
"grpc-router-test-gen",
"influxdb_iox",

View File

@ -14,11 +14,11 @@ use job_registry::JobRegistry;
use metric::{Attributes, DurationCounter, Metric, U64Counter};
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};
use query::{
provider::{ChunkPruner, ProviderBuilder},
pruning::{prune_chunks, PruningObserver},
QueryChunkMeta, QueryCompletedToken, QueryDatabase, DEFAULT_SCHEMA,
QueryChunkMeta, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA,
};
use schema::Schema;
use std::time::Instant;
@ -27,7 +27,7 @@ use system_tables::{SystemSchemaProvider, SYSTEM_SCHEMA};
use time::TimeProvider;
/// The number of entries to store in the circular query buffer log
const QUERY_LOG_SIZE: usize = 100;
const QUERY_LOG_SIZE: usize = 10_000;
/// Metrics related to chunk access (pruning specifically)
#[derive(Debug)]
@ -290,7 +290,7 @@ impl QueryDatabase for QueryCatalogAccess {
fn record_query(
&self,
query_type: impl Into<String>,
query_text: impl Into<String>,
query_text: QueryText,
) -> QueryCompletedToken<'_> {
// When the query token is dropped the query entry's completion time
// will be set.
@ -398,7 +398,7 @@ mod tests {
use super::*;
use crate::test_helpers::write_lp;
use crate::utils::make_db;
use predicate::predicate::PredicateBuilder;
use predicate::PredicateBuilder;
#[tokio::test]
async fn test_filtered_chunks() {

View File

@ -15,7 +15,7 @@ use mutable_buffer::snapshot::ChunkSnapshot;
use observability_deps::tracing::debug;
use parquet_file::chunk::ParquetChunk;
use partition_metadata::TableSummary;
use predicate::predicate::{Predicate, PredicateMatch};
use predicate::{Predicate, PredicateMatch};
use query::{exec::stringset::StringSet, QueryChunk, QueryChunkMeta};
use read_buffer::RBChunk;
use schema::InfluxColumnType;

View File

@ -16,7 +16,7 @@ use crate::{
};
use ::lifecycle::select_persistable_chunks;
pub use ::lifecycle::{LifecycleChunk, LockableChunk, LockablePartition};
use ::write_buffer::core::WriteBufferReading;
use ::write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler};
use async_trait::async_trait;
use data_types::{
chunk_metadata::{ChunkId, ChunkLifecycleAction, ChunkOrder, ChunkSummary},
@ -42,10 +42,10 @@ use parquet_catalog::{
prune::prune_history as prune_catalog_transaction_history,
};
use persistence_windows::{checkpoint::ReplayPlan, persistence_windows::PersistenceWindows};
use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};
use query::{
exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext},
QueryCompletedToken, QueryDatabase,
QueryCompletedToken, QueryDatabase, QueryText,
};
use rand_distr::{Distribution, Poisson};
use schema::selection::Selection;
@ -53,7 +53,7 @@ use schema::Schema;
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{
any::Any,
collections::{HashMap, HashSet},
collections::{BTreeMap, HashMap, HashSet},
sync::{
atomic::{AtomicUsize, Ordering},
Arc,
@ -112,6 +112,11 @@ pub enum Error {
source: persistence_windows::checkpoint::Error,
},
#[snafu(display("Cannot setup write buffer: {}", source))]
WriteBuffer {
source: ::write_buffer::core::WriteBufferError,
},
#[snafu(display("Cannot replay: {}", source))]
ReplayError { source: crate::replay::Error },
@ -889,16 +894,23 @@ impl Db {
pub async fn perform_replay(
&self,
replay_plan: Option<&ReplayPlan>,
consumer: &mut dyn WriteBufferReading,
) -> Result<()> {
consumer: Arc<dyn WriteBufferReading>,
) -> Result<BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>> {
use crate::replay::{perform_replay, seek_to_end};
if let Some(replay_plan) = replay_plan {
perform_replay(self, replay_plan, consumer)
let streams = consumer.stream_handlers().await.context(WriteBufferSnafu)?;
let streams = if let Some(replay_plan) = replay_plan {
perform_replay(self, replay_plan, streams)
.await
.context(ReplaySnafu)
.context(ReplaySnafu)?
} else {
seek_to_end(self, consumer).await.context(ReplaySnafu)
}
seek_to_end(self, consumer.as_ref(), streams)
.await
.context(ReplaySnafu)?
};
Ok(streams)
}
/// Background worker function
@ -1218,7 +1230,7 @@ impl QueryDatabase for Db {
fn record_query(
&self,
query_type: impl Into<String>,
query_text: impl Into<String>,
query_text: QueryText,
) -> QueryCompletedToken<'_> {
self.catalog_access.record_query(query_type, query_text)
}
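For orientation, a rough sketch of how a caller wires this up after the signature change. The `make_reader` constructor, `metric_registry`, `replay_plan`, and `db` names are illustrative only; this mirrors the test code later in this diff rather than the real server call site:

// Illustrative only: obtain a shared reader, let the DB replay (or seek) it,
// then hand the surviving stream handlers to the consumer.
let write_buffer: Arc<dyn WriteBufferReading> = make_reader(); // hypothetical constructor
let streams = db
    .perform_replay(replay_plan.as_ref(), Arc::clone(&write_buffer))
    .await?;
let consumer = WriteBufferConsumer::new(write_buffer, streams, Arc::clone(&db), &metric_registry);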

View File

@ -30,6 +30,9 @@ pub enum Error {
#[snafu(context(false))]
Aborted { source: futures::future::Aborted },
#[snafu(context(false))]
Timeout { source: tokio::time::error::Elapsed },
#[snafu(display("Read Buffer Error in chunk {}{} : {}", chunk_id, table_name, source))]
ReadBufferChunkError {
source: read_buffer::Error,

View File

@ -14,6 +14,7 @@ use crate::{
DbChunk,
};
use ::lifecycle::LifecycleWriteGuard;
use data_types::error::ErrorLogger;
use data_types::{chunk_metadata::ChunkLifecycleAction, job::Job};
use observability_deps::tracing::{debug, warn};
use parquet_catalog::interface::CatalogParquetInfo;
@ -29,9 +30,13 @@ use persistence_windows::{
use query::QueryChunk;
use schema::selection::Selection;
use snafu::ResultExt;
use std::time::Duration;
use std::{future::Future, sync::Arc};
use tokio::time::timeout;
use tracker::{TaskTracker, TrackedFuture, TrackedFutureExt};
const TIMEOUT: Duration = Duration::from_secs(300);
/// The implementation for writing a chunk to the object store
///
/// `flush_handle` describes both what to persist and also acts as a transaction
@ -111,7 +116,9 @@ pub(super) fn write_chunk_to_object_store(
// catalog-level transaction for preservation layer
{
// fetch shared (= read) guard preventing the cleanup job from deleting our files
let _guard = db.cleanup_lock.read().await;
let _guard = timeout(TIMEOUT, db.cleanup_lock.read())
.await
.log_if_error("write chunk cleanup lock")?;
// Write this table data into the object store
//
@ -128,9 +135,13 @@ pub(super) fn write_chunk_to_object_store(
time_of_last_write,
chunk_order,
};
let written_result = storage
.write_to_object_store(addr.clone(), stream, metadata)
let written_result = timeout(
TIMEOUT,
storage.write_to_object_store(addr.clone(), stream, metadata),
)
.await
.log_if_error("write chunk to object store")?
.context(WritingToObjectStoreSnafu)?;
// the stream was empty
@ -160,13 +171,17 @@ pub(super) fn write_chunk_to_object_store(
//
// This ensures that any deletes encountered during or prior to the replay window
// must have been made durable within the catalog for any persisted chunks
let delete_handle = db.delete_predicates_mailbox.consume().await;
let delete_handle = timeout(TIMEOUT, db.delete_predicates_mailbox.consume())
.await
.log_if_error("delete handle")?;
// IMPORTANT: Start transaction AFTER writing the actual parquet file so we do not hold
// the transaction lock (that is part of the PreservedCatalog) for too long.
// By using the cleanup lock (see above) it is ensured that the file that we
// have written is not deleted in between.
let mut transaction = db.preserved_catalog.open_transaction().await;
let mut transaction = timeout(TIMEOUT, db.preserved_catalog.open_transaction())
.await
.log_if_error("preserved catalog transaction")?;
// add parquet file
let info = CatalogParquetInfo {
@ -194,7 +209,10 @@ pub(super) fn write_chunk_to_object_store(
}
// preserved commit
let ckpt_handle = transaction.commit().await.context(CommitSnafu)?;
let ckpt_handle = timeout(TIMEOUT, transaction.commit())
.await
.log_if_error("preserved catalog commit")?
.context(CommitSnafu)?;
// Deletes persisted correctly
delete_handle.flush();
@ -216,10 +234,14 @@ pub(super) fn write_chunk_to_object_store(
// NOTE: There can only be a single transaction in this section because the checkpoint handle holds
// transaction lock. Therefore we don't need to worry about concurrent modifications of
// preserved chunks.
if let Err(e) = ckpt_handle
.create_checkpoint(checkpoint_data_from_catalog(&db.catalog))
let checkpoint_result = timeout(
TIMEOUT,
ckpt_handle.create_checkpoint(checkpoint_data_from_catalog(&db.catalog)),
)
.await
{
.log_if_error("create checkpoint")?;
if let Err(e) = checkpoint_result {
warn!(%e, "cannot create catalog checkpoint");
// That's somewhat OK. Don't fail the entire task, because the actual preservation was completed

View File

@ -3,7 +3,7 @@
use std::convert::TryFrom;
use predicate::predicate::Predicate;
use predicate::Predicate;
use snafu::Snafu;
#[derive(Debug, Snafu)]
@ -55,7 +55,7 @@ pub mod test {
use datafusion::logical_plan::{col, lit, Expr};
use datafusion::scalar::ScalarValue;
use predicate::predicate::PredicateBuilder;
use predicate::PredicateBuilder;
use read_buffer::BinaryExpr as RBBinaryExpr;
use read_buffer::Predicate as RBPredicate;

View File

@ -7,19 +7,19 @@ use std::{
};
use parking_lot::Mutex;
use query::QueryText;
use time::{Time, TimeProvider};
// The query duration used for queries still running.
const UNCOMPLETED_DURATION: i64 = -1;
/// Information about a single query that was executed
#[derive(Debug)]
pub struct QueryLogEntry {
/// The type of query
pub query_type: String,
/// The text of the query (SQL for sql queries, pbjson for storage rpc queries)
pub query_text: String,
pub query_text: QueryText,
/// Time at which the query was run
pub issue_time: Time,
@ -29,9 +29,20 @@ pub struct QueryLogEntry {
query_completed_duration: atomic::AtomicI64,
}
impl std::fmt::Debug for QueryLogEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("QueryLogEntry")
.field("query_type", &self.query_type)
.field("query_text", &self.query_text.to_string())
.field("issue_time", &self.issue_time)
.field("query_completed_duration", &self.query_completed_duration)
.finish()
}
}
impl QueryLogEntry {
/// Creates a new QueryLogEntry -- use `QueryLog::push` to add new entries to the log
fn new(query_type: String, query_text: String, issue_time: Time) -> Self {
fn new(query_type: String, query_text: QueryText, issue_time: Time) -> Self {
Self {
query_type,
query_text,
@ -77,14 +88,10 @@ impl QueryLog {
}
}
pub fn push(
&self,
query_type: impl Into<String>,
query_text: impl Into<String>,
) -> Arc<QueryLogEntry> {
pub fn push(&self, query_type: impl Into<String>, query_text: QueryText) -> Arc<QueryLogEntry> {
let entry = Arc::new(QueryLogEntry::new(
query_type.into(),
query_text.into(),
query_text,
self.time_provider.now(),
));
@ -126,7 +133,7 @@ mod test_super {
let entry = Arc::new(QueryLogEntry::new(
"sql".into(),
"SELECT 1".into(),
Box::new("SELECT 1"),
time_provider.now(),
));
// query has not completed
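A small usage sketch of the new signature, assuming a `time_provider: Arc<dyn TimeProvider>` is in scope as in the test hunk above, and that `QueryText` is a boxed, `Display`-like trait object (the log entry calls `.to_string()` on it), consistent with `Box::new("SELECT 1")` but not spelled out in this hunk:

// Callers now hand over the query text boxed instead of converting it to a
// String up front.
let log = QueryLog::new(10, Arc::clone(&time_provider) as Arc<dyn TimeProvider>);
let entry = log.push("sql", Box::new("SELECT * FROM cpu"));
// assumed to be None while the query is still running
assert!(entry.query_completed_duration().is_none());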

View File

@ -20,7 +20,7 @@ use std::{
time::Duration,
};
use time::Time;
use write_buffer::core::WriteBufferReading;
use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
@ -85,22 +85,34 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
/// operation fails. In that case some of the sequencers in the write buffers might already be seeked and others not.
/// The caller must NOT use the write buffer in that case without ensuring that it is put into some proper state, e.g.
/// by retrying this function.
pub async fn seek_to_end(db: &Db, write_buffer: &mut dyn WriteBufferReading) -> Result<()> {
let mut watermarks = vec![];
for (sequencer_id, stream) in write_buffer.streams() {
let watermark = (stream.fetch_high_watermark)()
.await
.context(SeekSnafu { sequencer_id })?;
watermarks.push((sequencer_id, watermark));
}
pub async fn seek_to_end(
db: &Db,
write_buffer: &dyn WriteBufferReading,
write_buffer_streams: BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>,
) -> Result<BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>> {
// need to convert the btree into a vec because the btree iterator is not `Send`
let write_buffer_streams: Vec<_> = write_buffer_streams.into_iter().collect();
for (sequencer_id, watermark) in &watermarks {
write_buffer
.seek(*sequencer_id, *watermark)
let mut watermarks = vec![];
for (sequencer_id, _handler) in &write_buffer_streams {
let watermark = write_buffer
.fetch_high_watermark(*sequencer_id)
.await
.context(SeekSnafu {
sequencer_id: *sequencer_id,
})?;
watermarks.push((*sequencer_id, watermark));
}
let mut write_buffer_streams_res = BTreeMap::new();
for ((sequencer_id, watermark), (sequencer_id_2, mut handler)) in
watermarks.iter().zip(write_buffer_streams)
{
assert_eq!(*sequencer_id, sequencer_id_2);
handler.seek(*watermark).await.context(SeekSnafu {
sequencer_id: *sequencer_id,
})?;
write_buffer_streams_res.insert(*sequencer_id, handler);
}
// remember max seen sequence numbers
@ -142,24 +154,20 @@ pub async fn seek_to_end(db: &Db, write_buffer: &mut dyn WriteBufferReading) ->
}
}
Ok(())
Ok(write_buffer_streams_res)
}
/// Perform sequencer-driven replay for this DB.
pub async fn perform_replay(
db: &Db,
replay_plan: &ReplayPlan,
write_buffer: &mut dyn WriteBufferReading,
) -> Result<()> {
write_buffer_streams: BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>,
) -> Result<BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>> {
let db_name = db.rules.read().db_name().to_string();
info!(%db_name, "starting replay");
// check if write buffer and replay plan agree on the set of sequencer ids
let sequencer_ids: BTreeSet<_> = write_buffer
.streams()
.into_iter()
.map(|(sequencer_id, _stream)| sequencer_id)
.collect();
let sequencer_ids: BTreeSet<_> = write_buffer_streams.keys().copied().collect();
for sequencer_id in replay_plan.sequencer_ids() {
if !sequencer_ids.contains(&sequencer_id) {
return Err(Error::UnknownSequencer {
@ -179,31 +187,30 @@ pub async fn perform_replay(
})
.collect();
// need to convert the btree into a vec because the btree iterator is not `Send`
let mut write_buffer_streams: Vec<_> = write_buffer_streams.into_iter().collect();
// seek write buffer according to the plan
for (sequencer_id, min_max) in &replay_ranges {
for (sequencer_id, handler) in write_buffer_streams.iter_mut() {
if let Some(min_max) = replay_ranges.get(sequencer_id) {
if let Some(min) = min_max.min() {
info!(%db_name, sequencer_id, sequence_number=min, "seek sequencer in preparation for replay");
write_buffer
.seek(*sequencer_id, min)
.await
.context(SeekSnafu {
handler.seek(min).await.context(SeekSnafu {
sequencer_id: *sequencer_id,
})?;
} else {
let sequence_number = min_max.max() + 1;
info!(%db_name, sequencer_id, sequence_number, "seek sequencer that did not require replay");
write_buffer
.seek(*sequencer_id, sequence_number)
.await
.context(SeekSnafu {
handler.seek(sequence_number).await.context(SeekSnafu {
sequencer_id: *sequencer_id,
})?;
}
}
}
// replay ranges
for (sequencer_id, mut stream) in write_buffer.streams() {
if let Some(min_max) = replay_ranges.get(&sequencer_id) {
for (sequencer_id, handler) in write_buffer_streams.iter_mut() {
if let Some(min_max) = replay_ranges.get(sequencer_id) {
if min_max.min().is_none() {
// no replay required
continue;
@ -216,19 +223,17 @@ pub async fn perform_replay(
"replay sequencer",
);
while let Some(dml_operation) = stream
.stream
.try_next()
.await
.context(EntrySnafu { sequencer_id })?
{
let mut stream = handler.stream();
while let Some(dml_operation) = stream.try_next().await.context(EntrySnafu {
sequencer_id: *sequencer_id,
})? {
let sequence = *dml_operation
.meta()
.sequence()
.expect("entry must be sequenced");
if sequence.number > min_max.max() {
return Err(Error::EntryLostError {
sequencer_id,
sequencer_id: *sequencer_id,
actual_sequence_number: sequence.number,
expected_sequence_number: min_max.max(),
});
@ -253,6 +258,7 @@ pub async fn perform_replay(
}
Err(crate::DmlError::HardLimitReached {}) if n_try < n_tries => {
if !logged_hard_limit {
let sequencer_id: u32 = *sequencer_id;
info!(
%db_name,
sequencer_id,
@ -313,7 +319,7 @@ pub async fn perform_replay(
}
}
Ok(())
Ok(write_buffer_streams.into_iter().collect())
}
#[derive(Debug, Copy, Clone)]
@ -610,8 +616,12 @@ mod tests {
let mut lifecycle = LifecycleWorker::new(Arc::clone(&test_db.db));
let write_buffer =
Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap());
let streams = write_buffer.stream_handlers().await.unwrap();
let mut maybe_consumer = Some(WriteBufferConsumer::new(
Box::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap()),
write_buffer,
streams,
Arc::clone(&test_db.db),
&registry,
));
@ -664,16 +674,17 @@ mod tests {
_ => unreachable!(),
};
let mut write_buffer =
MockBufferForReading::new(write_buffer_state.clone(), None).unwrap();
let write_buffer: Arc<dyn WriteBufferReading> = Arc::new(
MockBufferForReading::new(write_buffer_state.clone(), None).unwrap(),
);
test_db
let streams = test_db
.db
.perform_replay(replay_plan, &mut write_buffer)
.perform_replay(replay_plan, Arc::clone(&write_buffer))
.await
.unwrap();
maybe_write_buffer = Some(write_buffer);
maybe_write_buffer = Some((write_buffer, streams));
}
Step::Persist(partitions) => {
let db = &test_db.db;
@ -736,13 +747,20 @@ mod tests {
}
Step::Await(checks) => {
if maybe_consumer.is_none() {
let write_buffer = match maybe_write_buffer.take() {
Some(write_buffer) => write_buffer,
None => MockBufferForReading::new(write_buffer_state.clone(), None)
let (write_buffer, streams) = match maybe_write_buffer.take() {
Some(x) => x,
None => {
let write_buffer: Arc<dyn WriteBufferReading> = Arc::new(
MockBufferForReading::new(write_buffer_state.clone(), None)
.unwrap(),
);
let streams = write_buffer.stream_handlers().await.unwrap();
(write_buffer, streams)
}
};
maybe_consumer = Some(WriteBufferConsumer::new(
Box::new(write_buffer),
write_buffer,
streams,
Arc::clone(&test_db.db),
&registry,
));
@ -981,6 +999,7 @@ mod tests {
#[tokio::test]
async fn replay_ok_two_partitions_persist_second() {
test_helpers::maybe_start_logging();
// acts as regression test for the following PRs:
// - https://github.com/influxdata/influxdb_iox/pull/2079
// - https://github.com/influxdata/influxdb_iox/pull/2084
@ -1087,6 +1106,7 @@ mod tests {
#[tokio::test]
async fn replay_ok_two_partitions_persist_first() {
test_helpers::maybe_start_logging();
// acts as regression test for the following PRs:
// - https://github.com/influxdata/influxdb_iox/pull/2079
// - https://github.com/influxdata/influxdb_iox/pull/2084
@ -1193,6 +1213,7 @@ mod tests {
#[tokio::test]
async fn replay_ok_nothing_to_replay() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
Step::Restart,
@ -1227,6 +1248,7 @@ mod tests {
#[tokio::test]
async fn replay_ok_different_sequencer_situations() {
test_helpers::maybe_start_logging();
// three sequencers:
// 0: no data at all
// 1: replay required, additional incoming data during downtime
@ -1338,6 +1360,7 @@ mod tests {
#[tokio::test]
async fn replay_ok_interleaved_writes() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
// let's ingest some data for two partitions a and b
@ -1581,6 +1604,7 @@ mod tests {
#[tokio::test]
async fn replay_compacts() {
test_helpers::maybe_start_logging();
let tracing_capture = TracingCapture::new();
// these numbers are hand-tuned to trigger hard buffer limits w/o making the test too big
@ -1635,6 +1659,7 @@ mod tests {
#[tokio::test]
async fn replay_prune_full_partition() {
test_helpers::maybe_start_logging();
// there are the following entries:
//
// 0. table 2, partition a:
@ -1723,6 +1748,7 @@ mod tests {
#[tokio::test]
async fn replay_prune_some_sequences_partition() {
test_helpers::maybe_start_logging();
// there are the following entries:
//
// 0. table 2, partition a:
@ -1814,6 +1840,7 @@ mod tests {
#[tokio::test]
async fn replay_prune_rows() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
Step::Ingest(vec![
@ -1923,6 +1950,7 @@ mod tests {
#[tokio::test]
async fn replay_works_with_checkpoints_all_full_persisted_1() {
test_helpers::maybe_start_logging();
ReplayTest {
catalog_transactions_until_checkpoint: NonZeroU64::new(2).unwrap(),
steps: vec![
@ -1962,6 +1990,7 @@ mod tests {
#[tokio::test]
async fn replay_works_with_checkpoints_all_full_persisted_2() {
test_helpers::maybe_start_logging();
// try to provoke a catalog checkpoint that lists database checkpoints in the wrong order
ReplayTest {
catalog_transactions_until_checkpoint: NonZeroU64::new(2).unwrap(),
@ -2050,6 +2079,7 @@ mod tests {
#[tokio::test]
async fn replay_works_partially_persisted_1() {
test_helpers::maybe_start_logging();
// regression test for https://github.com/influxdata/influxdb_iox/issues/2185
let tracing_capture = TracingCapture::new();
@ -2121,6 +2151,7 @@ mod tests {
#[tokio::test]
async fn replay_works_partially_persisted_2() {
test_helpers::maybe_start_logging();
// regression test for https://github.com/influxdata/influxdb_iox/issues/2185
let tracing_capture = TracingCapture::new();
@ -2202,6 +2233,7 @@ mod tests {
#[tokio::test]
async fn replay_works_after_skip() {
test_helpers::maybe_start_logging();
let tracing_capture = TracingCapture::new();
ReplayTest {
@ -2272,6 +2304,7 @@ mod tests {
#[tokio::test]
async fn replay_initializes_max_seen_sequence_numbers() {
test_helpers::maybe_start_logging();
// Ensures that either replay or the catalog loading initializes the maximum seen sequence numbers (per
// partition) correctly. Before this test (and its fix), sequence numbers were only written if there was any
// unpersisted range during replay.
@ -2402,6 +2435,7 @@ mod tests {
#[tokio::test]
async fn skip_replay_initializes_max_seen_sequence_numbers() {
test_helpers::maybe_start_logging();
// Similar case to `replay_initializes_max_seen_sequence_numbers` but instead of replaying, we skip replay to
// provoke a similar outcome.
//
@ -2528,6 +2562,7 @@ mod tests {
#[tokio::test]
async fn replay_after_drop() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
Step::Ingest(vec![
@ -2630,6 +2665,7 @@ mod tests {
#[tokio::test]
async fn replay_delete() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
Step::Ingest(vec![TestSequencedEntry {
@ -2696,6 +2732,7 @@ mod tests {
#[tokio::test]
async fn replay_delete_persisted_chunks() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
Step::Ingest(vec![TestSequencedEntry {
@ -2751,6 +2788,7 @@ mod tests {
// This test replays compacted OS chunks with deletes and duplicates
#[tokio::test]
async fn replay_delete_compact_os_chunks() {
test_helpers::maybe_start_logging();
ReplayTest {
steps: vec![
// --------------------------
@ -2913,10 +2951,12 @@ mod tests {
#[tokio::test]
async fn replay_fail_sequencers_change() {
test_helpers::maybe_start_logging();
// create write buffer w/ sequencer 0 and 1
let write_buffer_state =
MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(2).unwrap());
let mut write_buffer = MockBufferForReading::new(write_buffer_state, None).unwrap();
let write_buffer: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
// create DB
let db = TestDb::builder().build().await.db;
@ -2940,9 +2980,7 @@ mod tests {
let replay_plan = replay_planner.build().unwrap();
// replay fails
let res = db
.perform_replay(Some(&replay_plan), &mut write_buffer)
.await;
let res = db.perform_replay(Some(&replay_plan), write_buffer).await;
assert_contains!(
res.unwrap_err().to_string(),
"Replay plan references unknown sequencer"
@ -2951,12 +2989,14 @@ mod tests {
#[tokio::test]
async fn replay_fail_lost_entry() {
test_helpers::maybe_start_logging();
// create write buffer state with sequence number 0 and 2, 1 is missing
let write_buffer_state =
MockBufferSharedState::empty_with_n_sequencers(NonZeroU32::try_from(1).unwrap());
write_buffer_state.push_lp(Sequence::new(0, 0), "cpu bar=1 0");
write_buffer_state.push_lp(Sequence::new(0, 2), "cpu bar=1 10");
let mut write_buffer = MockBufferForReading::new(write_buffer_state, None).unwrap();
let write_buffer: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
// create DB
let db = TestDb::builder().build().await.db;
@ -2979,9 +3019,7 @@ mod tests {
let replay_plan = replay_planner.build().unwrap();
// replay fails
let res = db
.perform_replay(Some(&replay_plan), &mut write_buffer)
.await;
let res = db.perform_replay(Some(&replay_plan), write_buffer).await;
assert_contains!(
res.unwrap_err().to_string(),
"Cannot replay: For sequencer 0 expected to find sequence 1 but replay jumped to 2"
@ -2990,6 +3028,7 @@ mod tests {
#[tokio::test]
async fn seek_to_end_works() {
test_helpers::maybe_start_logging();
// setup watermarks:
// 0 -> 3 + 1 = 4
// 1 -> 1 + 1 = 2
@ -2999,14 +3038,18 @@ mod tests {
write_buffer_state.push_lp(Sequence::new(0, 0), "cpu bar=0 0");
write_buffer_state.push_lp(Sequence::new(0, 3), "cpu bar=3 3");
write_buffer_state.push_lp(Sequence::new(1, 1), "cpu bar=11 11");
let mut write_buffer = MockBufferForReading::new(write_buffer_state.clone(), None).unwrap();
let write_buffer: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state.clone(), None).unwrap());
// create DB
let test_db = TestDb::builder().build().await;
let db = &test_db.db;
// seek
db.perform_replay(None, &mut write_buffer).await.unwrap();
let streams = db
.perform_replay(None, Arc::clone(&write_buffer))
.await
.unwrap();
// add more data
write_buffer_state.push_lp(Sequence::new(0, 4), "cpu bar=4 4");
@ -3021,7 +3064,7 @@ mod tests {
tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });
let consumer =
WriteBufferConsumer::new(Box::new(write_buffer), Arc::clone(db), &Default::default());
WriteBufferConsumer::new(write_buffer, streams, Arc::clone(db), &Default::default());
// wait until checks pass
let checks = vec![Check::Query(
@ -3040,6 +3083,7 @@ mod tests {
loop {
println!("Try checks...");
if ReplayTest::eval_checks(&checks, false, &test_db).await {
println!("checks passed...");
break;
}

View File

@ -8,19 +8,19 @@
//! For example `SELECT * FROM system.chunks`
use super::{catalog::Catalog, query_log::QueryLog};
use arrow::{
datatypes::{Field, Schema, SchemaRef},
error::Result,
record_batch::RecordBatch,
};
use arrow::{datatypes::SchemaRef, error::Result, record_batch::RecordBatch};
use async_trait::async_trait;
use datafusion::execution::runtime_env::RuntimeEnv;
use datafusion::physical_plan::{
Partitioning, RecordBatchStream, SendableRecordBatchStream, Statistics,
};
use datafusion::{
catalog::schema::SchemaProvider,
datasource::TableProvider,
error::{DataFusionError, Result as DataFusionResult},
physical_plan::{memory::MemoryExec, ExecutionPlan},
catalog::schema::SchemaProvider, datasource::TableProvider, error::Result as DataFusionResult,
physical_plan::ExecutionPlan,
};
use job_registry::JobRegistry;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::{any::Any, sync::Arc};
mod chunks;
@ -65,22 +65,22 @@ impl SystemSchemaProvider {
) -> Self {
let db_name = db_name.into();
let chunks = Arc::new(SystemTableProvider {
inner: chunks::ChunksTable::new(Arc::clone(&catalog)),
table: Arc::new(chunks::ChunksTable::new(Arc::clone(&catalog))),
});
let columns = Arc::new(SystemTableProvider {
inner: columns::ColumnsTable::new(Arc::clone(&catalog)),
table: Arc::new(columns::ColumnsTable::new(Arc::clone(&catalog))),
});
let chunk_columns = Arc::new(SystemTableProvider {
inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)),
table: Arc::new(columns::ChunkColumnsTable::new(Arc::clone(&catalog))),
});
let operations = Arc::new(SystemTableProvider {
inner: operations::OperationsTable::new(db_name, jobs),
table: Arc::new(operations::OperationsTable::new(db_name, jobs)),
});
let persistence_windows = Arc::new(SystemTableProvider {
inner: persistence::PersistenceWindowsTable::new(catalog),
table: Arc::new(persistence::PersistenceWindowsTable::new(catalog)),
});
let queries = Arc::new(SystemTableProvider {
inner: queries::QueriesTable::new(query_log),
table: Arc::new(queries::QueriesTable::new(query_log)),
});
Self {
chunks,
@ -133,21 +133,20 @@ impl SchemaProvider for SystemSchemaProvider {
}
}
type BatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>> + Send + Sync>;
/// The minimal thing that a system table needs to implement
trait IoxSystemTable: Send + Sync {
/// Produce the schema from this system table
fn schema(&self) -> SchemaRef;
/// Get the contents of the system table as a single RecordBatch
fn batch(&self) -> Result<RecordBatch>;
/// Get the contents of the system table
fn scan(&self, batch_size: usize) -> Result<BatchIterator>;
}
/// Adapter that makes any `IoxSystemTable` a DataFusion `TableProvider`
struct SystemTableProvider<T>
where
T: IoxSystemTable,
{
inner: T,
struct SystemTableProvider<T: IoxSystemTable> {
table: Arc<T>,
}
#[async_trait]
@ -160,7 +159,7 @@ where
}
fn schema(&self) -> SchemaRef {
self.inner.schema()
self.table.schema()
}
async fn scan(
@ -170,134 +169,97 @@ where
_filters: &[datafusion::logical_plan::Expr],
_limit: Option<usize>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
scan_batch(self.inner.batch()?, self.schema(), projection.as_ref())
}
}
/// Creates a DataFusion ExecutionPlan node that scans a single batch
/// of records.
fn scan_batch(
batch: RecordBatch,
schema: SchemaRef,
projection: Option<&Vec<usize>>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
// apply projection, if any
let (schema, batch) = match projection {
None => (schema, batch),
Some(projection) => {
let projected_columns: DataFusionResult<Vec<Field>> = projection
.iter()
.map(|i| {
if *i < schema.fields().len() {
Ok(schema.field(*i).clone())
} else {
Err(DataFusionError::Internal(format!(
"Projection index out of range in ChunksProvider: {}",
i
)))
}
})
.collect();
let projected_schema = Arc::new(Schema::new(projected_columns?));
let columns = projection
.iter()
.map(|i| Arc::clone(batch.column(*i)))
.collect::<Vec<_>>();
let projected_batch = RecordBatch::try_new(Arc::clone(&projected_schema), columns)?;
(projected_schema, projected_batch)
}
let schema = self.table.schema();
let projected_schema = match projection.as_ref() {
Some(projection) => Arc::new(schema.project(projection)?),
None => schema,
};
Ok(Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)?))
}
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{ArrayRef, UInt64Array};
use arrow_util::assert_batches_eq;
use datafusion_util::test_collect;
fn seq_array(start: u64, end: u64) -> ArrayRef {
Arc::new(UInt64Array::from_iter_values(start..end))
}
#[tokio::test]
async fn test_scan_batch_no_projection() {
let batch = RecordBatch::try_from_iter(vec![
("col1", seq_array(0, 3)),
("col2", seq_array(1, 4)),
("col3", seq_array(2, 5)),
("col4", seq_array(3, 6)),
])
.unwrap();
let projection = None;
let scan = scan_batch(batch.clone(), batch.schema(), projection).unwrap();
let collected = test_collect(scan).await;
let expected = vec![
"+------+------+------+------+",
"| col1 | col2 | col3 | col4 |",
"+------+------+------+------+",
"| 0 | 1 | 2 | 3 |",
"| 1 | 2 | 3 | 4 |",
"| 2 | 3 | 4 | 5 |",
"+------+------+------+------+",
];
assert_batches_eq!(&expected, &collected);
}
#[tokio::test]
async fn test_scan_batch_good_projection() {
let batch = RecordBatch::try_from_iter(vec![
("col1", seq_array(0, 3)),
("col2", seq_array(1, 4)),
("col3", seq_array(2, 5)),
("col4", seq_array(3, 6)),
])
.unwrap();
let projection = Some(vec![3, 1]);
let scan = scan_batch(batch.clone(), batch.schema(), projection.as_ref()).unwrap();
let collected = test_collect(scan).await;
let expected = vec![
"+------+------+",
"| col4 | col2 |",
"+------+------+",
"| 3 | 1 |",
"| 4 | 2 |",
"| 5 | 3 |",
"+------+------+",
];
assert_batches_eq!(&expected, &collected);
}
#[tokio::test]
async fn test_scan_batch_bad_projection() {
let batch = RecordBatch::try_from_iter(vec![
("col1", seq_array(0, 3)),
("col2", seq_array(1, 4)),
("col3", seq_array(2, 5)),
("col4", seq_array(3, 6)),
])
.unwrap();
// no column index 5
let projection = Some(vec![3, 1, 5]);
let result = scan_batch(batch.clone(), batch.schema(), projection.as_ref());
let err_string = result.unwrap_err().to_string();
assert!(
err_string
.contains("Internal error: Projection index out of range in ChunksProvider: 5"),
"Actual error: {}",
err_string
);
Ok(Arc::new(SystemTableExecutionPlan {
table: Arc::clone(&self.table),
projection: projection.clone(),
projected_schema,
}))
}
}
struct SystemTableExecutionPlan<T> {
table: Arc<T>,
projected_schema: SchemaRef,
projection: Option<Vec<usize>>,
}
impl<T> std::fmt::Debug for SystemTableExecutionPlan<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SystemTableExecutionPlan")
.field("projection", &self.projection)
.finish()
}
}
#[async_trait]
impl<T: IoxSystemTable + 'static> ExecutionPlan for SystemTableExecutionPlan<T> {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
Arc::clone(&self.projected_schema)
}
fn output_partitioning(&self) -> Partitioning {
Partitioning::UnknownPartitioning(1)
}
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
vec![]
}
fn with_new_children(
&self,
_children: Vec<Arc<dyn ExecutionPlan>>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
unimplemented!()
}
async fn execute(
&self,
_partition: usize,
runtime: Arc<RuntimeEnv>,
) -> DataFusionResult<SendableRecordBatchStream> {
Ok(Box::pin(SystemTableStream {
projected_schema: Arc::clone(&self.projected_schema),
batches: self.table.scan(runtime.batch_size)?,
projection: self.projection.clone(),
}))
}
fn statistics(&self) -> Statistics {
Statistics::default()
}
}
struct SystemTableStream {
projected_schema: SchemaRef,
projection: Option<Vec<usize>>,
batches: BatchIterator,
}
impl RecordBatchStream for SystemTableStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.projected_schema)
}
}
impl futures::Stream for SystemTableStream {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
Poll::Ready(self.batches.next().map(|maybe_batch| {
maybe_batch.and_then(|batch| match &self.projection {
Some(projection) => batch.project(projection),
None => Ok(batch),
})
}))
}
}
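To summarise the new trait shape, a hypothetical implementor (`ExampleTable` is not part of this codebase, and the file's existing imports are assumed): `scan` now returns a lazy iterator of record batches instead of a single eager batch, and simple tables can keep emitting one batch via `std::iter::once_with`, as the system tables below do:

// Hypothetical, for illustration only.
struct ExampleTable {
    schema: SchemaRef,
}

impl IoxSystemTable for ExampleTable {
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }

    fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
        let schema = Arc::clone(&self.schema);
        // One empty batch; real tables build their batch from catalog state here.
        Ok(Box::new(std::iter::once_with(move || {
            Ok(RecordBatch::new_empty(schema))
        })))
    }
}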

View File

@ -1,3 +1,4 @@
use crate::system_tables::BatchIterator;
use crate::{catalog::Catalog, system_tables::IoxSystemTable};
use arrow::{
array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array},
@ -30,9 +31,14 @@ impl IoxSystemTable for ChunksTable {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
let schema = Arc::clone(&self.schema);
let catalog = Arc::clone(&self.catalog);
Ok(Box::new(std::iter::once_with(move || {
from_chunk_summaries(schema, catalog.chunk_summaries())
.log_if_error("system.chunks table")
})))
}
}

View File

@ -1,3 +1,4 @@
use crate::system_tables::BatchIterator;
use crate::{catalog::Catalog, system_tables::IoxSystemTable};
use arrow::array::UInt32Array;
use arrow::{
@ -33,9 +34,13 @@ impl IoxSystemTable for ColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
let schema = Arc::clone(&self.schema);
let catalog = Arc::clone(&self.catalog);
Ok(Box::new(std::iter::once_with(move || {
from_partition_summaries(schema, catalog.partition_summaries())
.log_if_error("system.columns table")
})))
}
}
@ -113,9 +118,13 @@ impl IoxSystemTable for ChunkColumnsTable {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
let schema = Arc::clone(&self.schema);
let catalog = Arc::clone(&self.catalog);
Ok(Box::new(std::iter::once_with(move || {
assemble_chunk_columns(schema, catalog.detailed_chunk_summaries())
.log_if_error("system.column_chunks table")
})))
}
}

View File

@ -1,4 +1,4 @@
use crate::system_tables::IoxSystemTable;
use crate::system_tables::{BatchIterator, IoxSystemTable};
use arrow::{
array::{ArrayRef, StringArray, Time64NanosecondArray, TimestampNanosecondArray},
datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit},
@ -34,9 +34,15 @@ impl IoxSystemTable for OperationsTable {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
let schema = Arc::clone(&self.schema);
let jobs = Arc::clone(&self.jobs);
let db_name = self.db_name.clone();
Ok(Box::new(std::iter::once_with(move || {
from_task_trackers(schema, &db_name, jobs.tracked())
.log_if_error("system.operations table")
})))
}
}

View File

@ -1,3 +1,4 @@
use crate::system_tables::BatchIterator;
use crate::{catalog::Catalog, system_tables::IoxSystemTable};
use arrow::{
array::{StringArray, TimestampNanosecondArray, UInt64Array},
@ -31,9 +32,14 @@ impl IoxSystemTable for PersistenceWindowsTable {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_write_summaries(self.schema(), self.catalog.persistence_summaries())
fn scan(&self, _batch_size: usize) -> Result<BatchIterator> {
let schema = Arc::clone(&self.schema);
let catalog = Arc::clone(&self.catalog);
Ok(Box::new(std::iter::once_with(move || {
from_write_summaries(schema, catalog.persistence_summaries())
.log_if_error("system.persistence_windows table")
})))
}
}

View File

@ -1,3 +1,4 @@
use crate::system_tables::BatchIterator;
use crate::{
query_log::{QueryLog, QueryLogEntry},
system_tables::IoxSystemTable,
@ -8,7 +9,7 @@ use arrow::{
error::Result,
record_batch::RecordBatch,
};
use data_types::error::ErrorLogger;
use observability_deps::tracing::error;
use std::{collections::VecDeque, sync::Arc};
/// Implementation of system.queries table
@ -32,9 +33,27 @@ impl IoxSystemTable for QueriesTable {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_query_log_entries(self.schema(), self.query_log.entries())
.log_if_error("system.chunks table")
fn scan(&self, batch_size: usize) -> Result<BatchIterator> {
let schema = self.schema();
let entries = self.query_log.entries();
let mut offset = 0;
Ok(Box::new(std::iter::from_fn(move || {
if offset >= entries.len() {
return None;
}
let len = batch_size.min(entries.len() - offset);
match from_query_log_entries(schema.clone(), &entries, offset, len) {
Ok(batch) => {
offset += len;
Some(Ok(batch))
}
Err(e) => {
error!("Error system.chunks table: {:?}", e);
Some(Err(e))
}
}
})))
}
}
@ -57,26 +76,36 @@ fn queries_schema() -> SchemaRef {
fn from_query_log_entries(
schema: SchemaRef,
entries: VecDeque<Arc<QueryLogEntry>>,
entries: &VecDeque<Arc<QueryLogEntry>>,
offset: usize,
len: usize,
) -> Result<RecordBatch> {
let issue_time = entries
.iter()
.skip(offset)
.take(len)
.map(|e| e.issue_time)
.map(|ts| Some(ts.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let query_type = entries
.iter()
.skip(offset)
.take(len)
.map(|e| Some(&e.query_type))
.collect::<StringArray>();
let query_text = entries
.iter()
.map(|e| Some(&e.query_text))
.skip(offset)
.take(len)
.map(|e| Some(e.query_text.to_string()))
.collect::<StringArray>();
let query_runtime = entries
.iter()
.skip(offset)
.take(len)
.map(|e| e.query_completed_duration().map(|d| d.as_nanos() as i64))
.collect::<DurationNanosecondArray>();
@ -101,11 +130,14 @@ mod tests {
fn test_from_query_log() {
let now = Time::from_rfc3339("1996-12-19T16:39:57+00:00").unwrap();
let time_provider = Arc::new(time::MockProvider::new(now));
let query_log = QueryLog::new(10, Arc::clone(&time_provider) as Arc<dyn TimeProvider>);
query_log.push("sql", "select * from foo");
query_log.push("sql", Box::new("select * from foo"));
time_provider.inc(std::time::Duration::from_secs(24 * 60 * 60));
query_log.push("sql", "select * from bar");
let read_filter_entry = query_log.push("read_filter", "json goop");
query_log.push("sql", Box::new("select * from bar"));
let read_filter_entry = query_log.push("read_filter", Box::new("json goop"));
let table = QueriesTable::new(Arc::new(query_log));
let expected = vec![
"+----------------------+-------------+-------------------+--------------------+",
@ -117,9 +149,9 @@ mod tests {
"+----------------------+-------------+-------------------+--------------------+",
];
let schema = queries_schema();
let batch = from_query_log_entries(schema.clone(), query_log.entries()).unwrap();
assert_batches_eq!(&expected, &[batch]);
let entries = table.scan(3).unwrap().collect::<Result<Vec<_>>>().unwrap();
assert_eq!(entries.len(), 1);
assert_batches_eq!(&expected, &entries);
// mark one of the queries completed after 4s
let now = Time::from_rfc3339("1996-12-20T16:40:01+00:00").unwrap();
@ -135,7 +167,8 @@ mod tests {
"+----------------------+-------------+-------------------+--------------------+",
];
let batch = from_query_log_entries(schema, query_log.entries()).unwrap();
assert_batches_eq!(&expected, &[batch]);
let entries = table.scan(2).unwrap().collect::<Result<Vec<_>>>().unwrap();
assert_eq!(entries.len(), 2);
assert_batches_eq!(&expected, &entries);
}
}

View File

@ -2,11 +2,12 @@ use crate::Db;
use dml::DmlOperation;
use futures::{
future::{BoxFuture, Shared},
stream::{BoxStream, FuturesUnordered},
stream::FuturesUnordered,
FutureExt, StreamExt, TryFutureExt,
};
use observability_deps::tracing::{debug, error, info, warn};
use std::{
collections::BTreeMap,
future::Future,
sync::Arc,
time::{Duration, Instant},
@ -14,7 +15,7 @@ use std::{
use tokio::task::JoinError;
use tokio_util::sync::CancellationToken;
use trace::span::SpanRecorder;
use write_buffer::core::{FetchHighWatermark, WriteBufferError, WriteBufferReading};
use write_buffer::core::{WriteBufferErrorKind, WriteBufferReading, WriteBufferStreamHandler};
use self::metrics::{SequencerMetrics, WriteBufferIngestMetrics};
pub mod metrics;
@ -32,7 +33,8 @@ pub struct WriteBufferConsumer {
impl WriteBufferConsumer {
pub fn new(
mut write_buffer: Box<dyn WriteBufferReading>,
write_buffer: Arc<dyn WriteBufferReading>,
write_buffer_streams: BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>,
db: Arc<Db>,
registry: &metric::Registry,
) -> Self {
@ -42,16 +44,15 @@ impl WriteBufferConsumer {
let shutdown_captured = shutdown.clone();
let join = tokio::spawn(async move {
let mut futures: FuturesUnordered<_> = write_buffer
.streams()
let mut futures: FuturesUnordered<_> = write_buffer_streams
.into_iter()
.map(|(sequencer_id, stream)| {
.map(|(sequencer_id, handler)| {
let metrics = ingest_metrics.new_sequencer_metrics(sequencer_id);
stream_in_sequenced_entries(
Arc::clone(&db),
Arc::clone(&write_buffer),
sequencer_id,
stream.stream,
stream.fetch_high_watermark,
handler,
metrics,
)
})
@ -100,14 +101,15 @@ impl Drop for WriteBufferConsumer {
/// buffer are ignored.
async fn stream_in_sequenced_entries<'a>(
db: Arc<Db>,
write_buffer: Arc<dyn WriteBufferReading>,
sequencer_id: u32,
mut stream: BoxStream<'a, Result<DmlOperation, WriteBufferError>>,
f_mark: FetchHighWatermark<'a>,
mut stream_handler: Box<dyn WriteBufferStreamHandler>,
mut metrics: SequencerMetrics,
) {
let db_name = db.rules().name.to_string();
let mut watermark_last_updated: Option<Instant> = None;
let mut watermark = 0_u64;
let mut stream = stream_handler.stream();
while let Some(db_write_result) = stream.next().await {
// maybe update sequencer watermark
@ -118,7 +120,7 @@ async fn stream_in_sequenced_entries<'a>(
.map(|ts| now.duration_since(ts) > Duration::from_secs(10))
.unwrap_or(true)
{
match f_mark().await {
match write_buffer.fetch_high_watermark(sequencer_id).await {
Ok(w) => {
watermark = w;
}
@ -140,14 +142,27 @@ async fn stream_in_sequenced_entries<'a>(
// get entry from sequencer
let dml_operation = match db_write_result {
Ok(db_write) => db_write,
// skip over invalid data in the write buffer so recovery can succeed
Err(e) => {
warn!(
error!(
%e,
%db_name,
sequencer_id,
"Error converting write buffer data to SequencedEntry",
"Error reading record from write buffer",
);
match e.kind() {
// If invalid data, simply skip over it
WriteBufferErrorKind::InvalidData => {}
// Otherwise backoff for a period
WriteBufferErrorKind::Unknown
| WriteBufferErrorKind::IO
// TODO: Should probably bail on invalid input error
| WriteBufferErrorKind::InvalidInput => {
// TODO: Exponential backoff
tokio::time::sleep(std::time::Duration::from_secs(10)).await;
}
}
continue;
}
};
@ -251,11 +266,10 @@ mod tests {
let join_handle =
tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });
let consumer = WriteBufferConsumer::new(
Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()),
Arc::clone(&db),
&Default::default(),
);
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
let streams = write_buffer.stream_handlers().await.unwrap();
let consumer =
WriteBufferConsumer::new(write_buffer, streams, Arc::clone(&db), &Default::default());
// check: after a while the persistence windows should have the expected data
let t_0 = Instant::now();
@ -314,8 +328,11 @@ mod tests {
let join_handle =
tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
let streams = write_buffer.stream_handlers().await.unwrap();
let consumer = WriteBufferConsumer::new(
Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()),
write_buffer,
streams,
Arc::clone(&db),
test_db.metric_registry.as_ref(),
);
@ -457,8 +474,11 @@ mod tests {
let join_handle =
tokio::spawn(async move { db_captured.background_worker(shutdown_captured).await });
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
let streams = write_buffer.stream_handlers().await.unwrap();
let consumer = WriteBufferConsumer::new(
Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap()),
write_buffer,
streams,
Arc::clone(&db),
metric_registry.as_ref(),
);

44
gitops_adapter/Cargo.toml Normal file
View File

@ -0,0 +1,44 @@
[package]
name = "gitops_adapter"
version = "0.1.0"
authors = ["Luke Bond <luke.n.bond@gmail.com>"]
edition = "2021"
# Prevent this from being published to crates.io!
publish = false
[[bin]]
name = "iox-gitops-adapter"
path = "src/main.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
async-trait = "0.1"
chrono = "0.4.15"
clap = { version = "3", features = ["derive", "env"] }
dotenv = "0.15"
futures = "0.3"
k8s-openapi = { version = "0.13.1", features = ["v1_17", "schemars"], default-features = false }
kube = "0.64"
kube-derive = { version = "0.64", default-features = false } # only needed to opt out of schema
kube-runtime = "0.64"
prost = "0.9"
schemars = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = "1.0"
tokio = { version = "1.0", features = ["rt-multi-thread", "macros", "parking_lot"] }
tonic = "0.6"
tracing = { version = "0.1", features = ["release_max_level_debug"] }
workspace-hack = { path = "../workspace-hack"}
trogging = { path = "../trogging", default-features = false, features = ["clap"] }
[build-dependencies]
glob = "0.3.0"
pbjson-build = "0.2"
tonic-build = "0.6"
[dev-dependencies]
assert_matches = "1.5"
parking_lot = { version = "0.11.1" }

25
gitops_adapter/build.rs Normal file
View File

@ -0,0 +1,25 @@
use std::process::Command;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Populate env!(GIT_HASH) with the current git commit
println!("cargo:rustc-env=GIT_HASH={}", get_git_hash());
Ok(())
}
fn get_git_hash() -> String {
let out = match std::env::var("VERSION_HASH") {
Ok(v) => v,
Err(_) => {
let output = Command::new("git")
.args(&["describe", "--always", "--dirty", "--abbrev=64"])
.output()
.expect("failed to execute git rev-parse to read the current git hash");
String::from_utf8(output.stdout).expect("non-utf8 found in git hash")
}
};
assert!(!out.is_empty(), "attempting to embed empty git hash");
out
}

View File

@ -0,0 +1,49 @@
use async_trait::async_trait;
use kube::{
api::{Patch, PatchParams},
Api,
};
use serde_json::json;
use crate::kafka_topic_list::resources::{KafkaTopicList, KafkaTopicListStatus};
#[async_trait]
pub trait KafkaTopicListApi: Send + Sync + Clone + 'static {
/// Gets a KafkaTopicList resource by name.
async fn get_kafka_topic_list(
&self,
kafka_topic_list_name: String,
) -> Result<KafkaTopicList, kube::Error>;
/// Patch status block, if it exists, with the given status.
async fn patch_resource_status(
&self,
kafka_topic_list_name: String,
status: KafkaTopicListStatus,
) -> Result<KafkaTopicList, kube::Error>;
}
#[async_trait]
impl KafkaTopicListApi for Api<KafkaTopicList> {
async fn get_kafka_topic_list(
&self,
kafka_topic_list_name: String,
) -> Result<KafkaTopicList, kube::Error> {
self.get(kafka_topic_list_name.as_str()).await
}
async fn patch_resource_status(
&self,
kafka_topic_list_name: String,
status: KafkaTopicListStatus,
) -> Result<KafkaTopicList, kube::Error> {
let patch_params = PatchParams::default();
let s = json!({ "status": status });
self.patch_status(
kafka_topic_list_name.as_str(),
&patch_params,
&Patch::Merge(&s),
)
.await
}
}

View File

@ -0,0 +1,129 @@
#![allow(missing_docs)]
use std::sync::{mpsc::SyncSender, Arc};
use async_trait::async_trait;
use parking_lot::Mutex;
use crate::kafka_topic_list::{
api::KafkaTopicListApi,
resources::{KafkaTopicList, KafkaTopicListStatus},
};
#[derive(Debug, Clone, PartialEq)]
#[allow(clippy::large_enum_variant)]
pub enum MockKafkaTopicListApiCall {
Get(String),
PatchStatus {
kafka_topic_list_name: String,
status: KafkaTopicListStatus,
},
}
#[derive(Debug, Default)]
pub struct ClientInner {
/// A channel to push call notifications into as they occur.
pub notify: Option<SyncSender<MockKafkaTopicListApiCall>>,
/// A vector of calls in call order for assertions.
pub calls: Vec<MockKafkaTopicListApiCall>,
// Return values
pub get_ret: Vec<Result<KafkaTopicList, kube::Error>>,
pub patch_status_ret: Vec<Result<KafkaTopicList, kube::Error>>,
}
impl ClientInner {
fn record_call(&mut self, c: MockKafkaTopicListApiCall) {
self.calls.push(c.clone());
if let Some(ref n) = self.notify {
let _ = n.send(c);
}
}
}
impl From<ClientInner> for MockKafkaTopicListApi {
fn from(state: ClientInner) -> Self {
Self {
state: Arc::new(Mutex::new(state)),
}
}
}
/// Mock helper to record a call and return the pre-configured value.
///
/// Pushes `$call` onto the call record, popping `self.$return` and returning it
/// to the caller. If no value exists, the pop attempt causes a panic.
macro_rules! record_and_return {
($self:ident, $call:expr, $return:ident) => {{
let mut state = $self.state.lock();
state.record_call($call);
state.$return.pop().expect("no mock result to return")
}};
}
#[derive(Debug, Default)]
pub struct MockKafkaTopicListApi {
pub state: Arc<Mutex<ClientInner>>,
}
impl MockKafkaTopicListApi {
pub fn with_notify(self, s: SyncSender<MockKafkaTopicListApiCall>) -> Self {
self.state.lock().notify = Some(s);
self
}
pub fn with_get_ret(self, ret: Vec<Result<KafkaTopicList, kube::Error>>) -> Self {
self.state.lock().get_ret = ret;
self
}
pub fn with_patch_status_ret(self, ret: Vec<Result<KafkaTopicList, kube::Error>>) -> Self {
self.state.lock().patch_status_ret = ret;
self
}
pub fn get_calls(&self) -> Vec<MockKafkaTopicListApiCall> {
self.state.lock().calls.clone()
}
}
#[async_trait]
impl KafkaTopicListApi for Arc<MockKafkaTopicListApi> {
/// Gets a KafkaTopicList resource by name.
async fn get_kafka_topic_list(
&self,
kafka_topic_list_name: String,
) -> Result<KafkaTopicList, kube::Error> {
record_and_return!(
self,
MockKafkaTopicListApiCall::Get(kafka_topic_list_name,),
get_ret
)
}
/// Patch status block, if it exists, with the given status.
async fn patch_resource_status(
&self,
kafka_topic_list_name: String,
status: KafkaTopicListStatus,
) -> Result<KafkaTopicList, kube::Error> {
record_and_return!(
self,
MockKafkaTopicListApiCall::PatchStatus {
kafka_topic_list_name,
status,
},
patch_status_ret
)
}
}
/// Cloning a client shares the same mock state across both client instances.
impl Clone for MockKafkaTopicListApi {
fn clone(&self) -> Self {
Self {
state: Arc::clone(&self.state),
}
}
}
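As a hedged illustration of how this mock is primed and inspected (a sketch only; it assumes KafkaTopicListSpec and the KafkaTopicListApi trait are in scope and tokio's test macro is available):
#[tokio::test]
async fn mock_pops_configured_results_and_records_calls() {
    // Prime the mock with a single Ok(...) result for get_kafka_topic_list.
    let mock = Arc::new(MockKafkaTopicListApi::default().with_get_ret(vec![Ok(
        KafkaTopicList::new("iox", KafkaTopicListSpec::new(vec!["iox_shared".to_string()])),
    )]));
    // The call pops the pre-configured value; a second call would panic
    // because no further results are queued.
    assert!(mock.get_kafka_topic_list("iox".to_string()).await.is_ok());
    // Every call is recorded in order for later assertions.
    assert_eq!(
        mock.get_calls(),
        vec![MockKafkaTopicListApiCall::Get("iox".to_string())]
    );
}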

View File

@ -0,0 +1,5 @@
pub mod api;
pub mod resources;
#[cfg(test)]
pub mod mock_api;

View File

@ -0,0 +1,108 @@
use kube_derive::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
group = "iox.influxdata.com",
version = "v1alpha1",
kind = "KafkaTopicList",
namespaced,
shortname = "topics"
)]
#[kube(status = "KafkaTopicListStatus")]
#[serde(rename_all = "camelCase")]
pub struct KafkaTopicListSpec {
topics: Vec<String>,
}
#[derive(Deserialize, Serialize, Clone, Debug, Default, JsonSchema, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct KafkaTopicListStatus {
conditions: Vec<KafkaTopicListStatusCondition>,
observed_generation: i64, // type matches that of metadata.generation
}
#[derive(Deserialize, Serialize, Clone, Debug, Default, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct KafkaTopicListStatusCondition {
type_: String,
status: String,
message: String,
last_transition_time: String,
last_update_time: String,
}
impl KafkaTopicListSpec {
pub fn new(topics: Vec<String>) -> Self {
Self { topics }
}
pub fn topics(&self) -> &Vec<String> {
&self.topics
}
}
impl KafkaTopicListStatus {
pub fn conditions(&self) -> &Vec<KafkaTopicListStatusCondition> {
&self.conditions
}
pub fn conditions_mut(&mut self) -> &mut Vec<KafkaTopicListStatusCondition> {
&mut self.conditions
}
pub fn observed_generation(&self) -> i64 {
self.observed_generation
}
pub fn set_observed_generation(&mut self, observed_generation: i64) {
self.observed_generation = observed_generation;
}
}
impl KafkaTopicListStatusCondition {
pub fn new(
type_: String,
status: String,
message: String,
last_transition_time: String,
last_update_time: String,
) -> Self {
Self {
type_,
status,
message,
last_transition_time,
last_update_time,
}
}
pub fn type_(&self) -> &String {
&self.type_
}
pub fn status(&self) -> &String {
&self.status
}
pub fn message(&self) -> &String {
&self.message
}
pub fn last_transition_time(&self) -> &String {
&self.last_transition_time
}
pub fn last_update_time(&self) -> &String {
&self.last_update_time
}
}
impl PartialEq for KafkaTopicListStatusCondition {
// just for assertions in tests; comparing the timestamp fields as well would be
// too tedious
fn eq(&self, other: &Self) -> bool {
self.type_ == other.type_ && self.status == other.status && self.message == other.message
}
}

537
gitops_adapter/src/main.rs Normal file
View File

@ -0,0 +1,537 @@
use std::{
io::ErrorKind,
sync::Arc,
time::{Duration, SystemTime},
};
use chrono::{DateTime, Utc};
use dotenv::dotenv;
use futures::StreamExt;
use kube::{api::ListParams, Api, Client as K8sClient};
use kube_runtime::controller::{Context, Controller, ReconcilerAction};
use std::process::Command as Cmd;
use thiserror::Error;
use tracing::*;
use trogging::{cli::LoggingConfig, LogFormat};
use crate::kafka_topic_list::{
api::KafkaTopicListApi,
resources::{KafkaTopicList, KafkaTopicListStatus, KafkaTopicListStatusCondition},
};
pub mod kafka_topic_list;
static CONDITION_TYPE_RECONCILED: &str = "Reconciled";
static CONDITION_STATUS_TRUE: &str = "True";
static CONDITION_STATUS_FALSE: &str = "False";
#[derive(Debug, Error)]
enum CatalogError {
#[error("Malformed KafkaTopicList resource: {message}")]
MalformedKafkaTopicListResource { message: String },
#[error("Request to patch status of k8s custom resource failed: {0}")]
PatchStatusError(#[from] kube::Error),
#[error("Failed to execute iox binary to update catalog: {0}")]
IOxBinaryExecFailed(#[from] std::io::Error),
#[error("Request to update catalog with topic failed: {stderr}")]
UpdateTopicError { stderr: String },
#[error("Failed to parse stdout of catalog update command to ID: {0}")]
TopicIdParseError(#[from] std::num::ParseIntError),
}
// Config defines the runtime configuration variables settable on the command
// line.
//
// These fields are automatically converted into a [Clap] CLI.
//
// This has an `allow(missing_docs)` annotation as otherwise the comment is
// added to the CLI help text.
//
// [Clap]: https://github.com/clap-rs/clap
#[derive(Debug, clap::Parser)]
#[clap(
name = "iox-gitops-adapter",
about = "Adapter to configure IOx Catalog from Kubernetes Custom Resources",
long_about = r#"Kubernetes controller responsible for synchronising the IOx Catalog to cluster configuration in a Kubernetes Custom Resource.
Examples:
# Run the gitops adapter server:
iox-gitops-adapter
# See all configuration options
iox-gitops-adapter --help
"#,
version = concat!(env!("CARGO_PKG_VERSION"), " - ", env!("GIT_HASH"))
)]
#[allow(missing_docs)]
pub struct Config {
/// Configure the log level & filter.
///
/// Example values:
/// iox_gitops_adapter=debug
#[clap(flatten)]
logging_config: LoggingConfig,
/// Configure the Kubernetes namespace where custom resources are found.
///
/// Example values:
/// namespace=conductor
#[clap(long = "--namespace", env = "GITOPS_ADAPTER_NAMESPACE")]
namespace: String,
/// Configure the Catalog's Postgres DSN.
///
/// Example values:
/// catalog-dsn=postgres://postgres:postgres@localhost:5432/iox_shared
#[clap(long = "--catalog-dsn", env = "GITOPS_ADAPTER_CATALOG_DSN")]
catalog_dsn: String,
/// Configure the path to the IOx CLI.
///
/// Example values:
/// iox-cli=/usr/bin/influxdb_iox
#[clap(long = "--iox-cli", env = "GITOPS_ADAPTER_IOX_CLI")]
iox_cli: String,
}
#[derive(Debug, clap::Parser)]
enum Command {
Config,
}
impl Config {
/// Returns the (possibly invalid) log filter string.
pub fn log_filter(&self) -> &Option<String> {
&self.logging_config.log_filter
}
/// Returns the (possibly invalid) log format string.
pub fn log_format(&self) -> &LogFormat {
&self.logging_config.log_format
}
}
/// Load the config.
///
/// This pulls in config from the following sources, in order of precedence:
///
/// - command line arguments
/// - user set environment variables
/// - .env file contents
/// - pre-configured default values
pub fn load_config() -> Result<Config, Box<dyn std::error::Error>> {
// Source the .env file before initialising the Config struct - this sets
// any envs in the file, which the Config struct then uses.
//
// Precedence is given to existing env variables.
match dotenv() {
Ok(_) => {}
Err(dotenv::Error::Io(err)) if err.kind() == ErrorKind::NotFound => {
// Ignore this - a missing env file is not an error,
// defaults will be applied when initialising the Config struct.
}
Err(e) => return Err(Box::new(e)),
};
// Load the Config struct - this pulls in any envs set by the user or
// sourced above, and applies any defaults.
Ok(clap::Parser::parse())
}
/// Initialise the tracing subscribers.
fn setup_tracing(
logging_config: &LoggingConfig,
log_env_var: Option<String>,
) -> Result<trogging::TroggingGuard, trogging::Error> {
let drop_handle = logging_config
.to_builder()
.with_default_log_filter(log_env_var.unwrap_or_else(|| "info".to_string()))
.install_global()?;
trace!("logging initialised!");
Ok(drop_handle)
}
async fn reconcile_topics(
path_to_iox_binary: &str,
catalog_dsn: &str,
topics: &[String],
) -> Result<Vec<u32>, CatalogError> {
trace!(
"calling out to {} for topics {:?}",
path_to_iox_binary,
topics
);
topics
.iter()
.map(|topic| {
match Cmd::new(path_to_iox_binary)
.arg("catalog")
.arg("topic")
.arg("update")
.arg("--catalog-dsn")
.arg(catalog_dsn)
.arg(topic)
.output()
{
Ok(output) => match output.status.success() {
true => {
trace!(
"Updated catalog with kafka topic {}. stdout: {}",
topic,
String::from_utf8_lossy(&output.stdout).trim()
);
// The CLI returns an ID on success; try to parse it here to ensure the update
// actually worked, since a zero exit status alone may not be enough (e.g.
// --help also returns 0). We'd also like to print the IDs out later.
String::from_utf8_lossy(&output.stdout)
.trim()
.parse::<u32>()
.map_err(CatalogError::TopicIdParseError)
}
false => Err(CatalogError::UpdateTopicError {
stderr: String::from_utf8_lossy(&output.stderr).into(),
}),
},
Err(e) => Err(CatalogError::IOxBinaryExecFailed(e)),
}
})
.collect()
}
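A hedged sketch of how this helper can be exercised against the mock CLI scripts added later in this change; it assumes test/mock-iox-single-topic.sh is the script that echoes 42:
#[tokio::test]
async fn reconcile_topics_parses_ids_from_mock_cli() {
    let ids = reconcile_topics(
        "test/mock-iox-single-topic.sh",
        "unused-dsn",
        &["iox_shared".to_string()],
    )
    .await
    .expect("mock CLI should succeed");
    // One ID per topic, parsed from the script's stdout.
    assert_eq!(ids, vec![42]);
}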
/// Controller triggers this whenever our main object or our children changed
async fn reconcile<T>(
topics: KafkaTopicList,
ctx: Context<Data<T>>,
) -> Result<ReconcilerAction, CatalogError>
where
T: KafkaTopicListApi,
{
debug!(
"got a change to the kafka topic list custom resource: {:?}",
topics.spec
);
let kafka_topic_list_api = ctx.get_ref().kafka_topic_list_api.clone();
let topics = Arc::new(topics);
// if CR doesn't contain status field, add it
let mut topics_status = match &topics.status {
Some(status) => status.clone(),
None => KafkaTopicListStatus::default(),
};
let kafka_topic_list_name = match &topics.metadata.name {
Some(n) => n.clone(),
None => {
return Err(CatalogError::MalformedKafkaTopicListResource {
message: "Missing metadata.name field".to_string(),
})
}
};
// have we seen this update before?
// NOTE: we may find that we'd prefer to do the reconcile anyway, if it's cheap.
// for now this seems okay
let generation = match topics.metadata.generation {
Some(gen) => {
if topics_status.observed_generation() == gen {
info!("Nothing to reconcile; observedGeneration == generation");
return Ok(ReconcilerAction {
requeue_after: None,
});
}
gen
}
_ => {
return Err(CatalogError::MalformedKafkaTopicListResource {
message: "Missing metadata.generation field".to_string(),
})
}
};
// make a note that we've seen this update
topics_status.set_observed_generation(generation);
// call out to the iox CLI to update the catalog for each topic name in the list
let reconcile_result = reconcile_topics(
&ctx.get_ref().path_to_iox_binary,
&ctx.get_ref().catalog_dsn,
topics.spec.topics(),
)
.await;
// update status subresource based on outcome of reconcile
let now: DateTime<Utc> = SystemTime::now().into();
let now_str = now.to_rfc3339();
let prev_condition = topics_status.conditions().get(0);
let last_transition_time = match prev_condition {
Some(c) if c.status() == CONDITION_STATUS_TRUE => c.last_transition_time().clone(),
_ => now_str.clone(),
};
let new_status = match &reconcile_result {
Ok(v) => {
debug!(
"Updated catalog with kafka topic list: {:?}. IDs returned: {:?}.",
topics.spec.topics(),
v
);
KafkaTopicListStatusCondition::new(
CONDITION_TYPE_RECONCILED.to_string(),
CONDITION_STATUS_TRUE.to_string(),
"".to_string(),
last_transition_time,
now_str.clone(),
)
}
Err(e) => KafkaTopicListStatusCondition::new(
CONDITION_TYPE_RECONCILED.to_string(),
CONDITION_STATUS_FALSE.to_string(),
e.to_string(),
last_transition_time,
now_str.clone(),
),
};
if topics_status.conditions().is_empty() {
topics_status.conditions_mut().insert(0, new_status);
} else {
topics_status.conditions_mut()[0] = new_status;
}
// patch the status field with the updated condition and observed generation
match kafka_topic_list_api
.patch_resource_status(kafka_topic_list_name.clone(), topics_status)
.await
{
Ok(_) => {}
Err(e) => {
// Not great to silently swallow the error here but doesn't feel warranted to requeue
// just because the status wasn't updated
error!("Failed to patch KafkaTopicList status subresource: {}", e);
}
}
reconcile_result.map(|_| ReconcilerAction {
requeue_after: None,
})
}
/// an error handler that will be called when the reconciler fails
fn error_policy<T>(error: &CatalogError, _ctx: Context<Data<T>>) -> ReconcilerAction
where
T: KafkaTopicListApi,
{
error!(%error, "reconciliation error");
ReconcilerAction {
// if a sync fails we want to retry; it could simply be in the process of
// doing another redeploy. there may be a deeper problem, in which case it'll keep trying
// and we'll see errors and investigate. arbitrary duration chosen ¯\_(ツ)_/¯
requeue_after: Some(Duration::from_secs(5)),
}
}
// Data we want access to in error/reconcile calls
struct Data<T>
where
T: KafkaTopicListApi,
{
path_to_iox_binary: String,
catalog_dsn: String,
kafka_topic_list_api: T,
}
#[tokio::main]
async fn main() {
let config = load_config().expect("failed to load config");
let _drop_handle = setup_tracing(&config.logging_config, None).unwrap();
debug!(?config, "loaded config");
info!(git_hash = env!("GIT_HASH"), "starting iox-gitops-adapter");
let k8s_client = K8sClient::try_default()
.await
.expect("couldn't create k8s client");
let topics = Api::<KafkaTopicList>::namespaced(k8s_client.clone(), config.namespace.as_str());
info!("initialised Kubernetes API client");
info!("starting IOx GitOps Adapter");
Controller::new(topics.clone(), ListParams::default())
.run(
reconcile,
error_policy,
Context::new(Data {
path_to_iox_binary: config.iox_cli.clone(),
catalog_dsn: config.catalog_dsn.clone(),
kafka_topic_list_api: topics,
}),
)
.for_each(|res| async move {
match res {
Ok(o) => info!("reconciled {:?}", o),
Err(e) => info!("reconcile failed: {:?}", e),
}
})
.await; // controller does nothing unless polled
}
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use kafka_topic_list::{
mock_api::{MockKafkaTopicListApi, MockKafkaTopicListApiCall},
resources::KafkaTopicListSpec,
};
use super::*;
fn create_topics(
name: &str,
spec: KafkaTopicListSpec,
generation: i64,
status: KafkaTopicListStatus,
) -> KafkaTopicList {
let mut c = KafkaTopicList::new(name, spec);
c.metadata.generation = Some(generation);
c.status = Some(status);
c
}
fn create_topics_status(
observed_generation: i64,
reconciled: bool,
message: String,
t: SystemTime,
) -> KafkaTopicListStatus {
let now: DateTime<Utc> = t.into();
let now_str = now.to_rfc3339();
let mut status = KafkaTopicListStatus::default();
status
.conditions_mut()
.push(KafkaTopicListStatusCondition::new(
CONDITION_TYPE_RECONCILED.to_string(),
if reconciled {
CONDITION_STATUS_TRUE.to_string()
} else {
CONDITION_STATUS_FALSE.to_string()
},
message,
now_str.clone(),
now_str,
));
status.set_observed_generation(observed_generation);
status
}
#[tokio::test]
async fn test_single_topic_success() {
let now = SystemTime::now();
let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret(
vec![Ok(create_topics(
"iox",
KafkaTopicListSpec::new(vec!["iox_shared".to_string()]),
1,
create_topics_status(0, true, "".to_string(), now),
))],
));
let data = Data {
path_to_iox_binary: "test/mock-iox-single-topic.sh".to_string(),
catalog_dsn: "unused".to_string(),
kafka_topic_list_api: Arc::clone(&mock_topics_api),
};
let c = create_topics(
"iox",
KafkaTopicListSpec::new(vec!["iox_shared".to_string()]),
1,
create_topics_status(0, true, "".to_string(), now),
);
let result = reconcile(c, Context::new(data)).await;
// whole operation returns a successful result.
assert_matches!(result, Ok(ReconcilerAction { .. }));
// ensure status was updated accordingly.
// alas, we don't have a success patch result either, due to the above
assert_eq!(
mock_topics_api.get_calls(),
vec![MockKafkaTopicListApiCall::PatchStatus {
kafka_topic_list_name: "iox".to_string(),
status: create_topics_status(1, true, "".to_string(), now),
}]
);
}
#[tokio::test]
async fn test_multi_topic_success() {
let now = SystemTime::now();
let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret(
vec![Ok(create_topics(
"iox",
KafkaTopicListSpec::new(vec!["one".to_string(), "two".to_string()]),
1,
create_topics_status(0, true, "".to_string(), now),
))],
));
let data = Data {
path_to_iox_binary: "test/mock-iox-single-topic.sh".to_string(),
catalog_dsn: "unused".to_string(),
kafka_topic_list_api: Arc::clone(&mock_topics_api),
};
let c = create_topics(
"iox",
KafkaTopicListSpec::new(vec!["one".to_string(), "two".to_string()]),
1,
create_topics_status(0, true, "".to_string(), now),
);
let result = reconcile(c, Context::new(data)).await;
// whole operation returns a successful result.
assert_matches!(result, Ok(ReconcilerAction { .. }));
// ensure status was updated accordingly.
assert_eq!(
mock_topics_api.get_calls(),
vec![MockKafkaTopicListApiCall::PatchStatus {
kafka_topic_list_name: "iox".to_string(),
status: create_topics_status(1, true, "".to_string(), now),
}]
);
}
#[tokio::test]
async fn test_single_topic_error() {
let now = SystemTime::now();
let mock_topics_api = Arc::new(MockKafkaTopicListApi::default().with_patch_status_ret(
vec![Ok(create_topics(
"iox",
KafkaTopicListSpec::new(vec!["iox_shared".to_string()]),
1,
create_topics_status(0, true, "".to_string(), now),
))],
));
let data = Data {
path_to_iox_binary: "test/mock-iox-failure.sh".to_string(),
catalog_dsn: "unused".to_string(),
kafka_topic_list_api: Arc::clone(&mock_topics_api),
};
let c = create_topics(
"iox",
KafkaTopicListSpec::new(vec!["iox_shared".to_string()]),
1,
create_topics_status(0, false, "".to_string(), now),
);
let result = reconcile(c, Context::new(data)).await;
// whole operation returns an error result
assert_matches!(result, Err(CatalogError::UpdateTopicError { .. }));
// Ensure status was updated accordingly
assert_eq!(
mock_topics_api.get_calls(),
vec![MockKafkaTopicListApiCall::PatchStatus {
kafka_topic_list_name: "iox".to_string(),
status: create_topics_status(
1,
false,
"Request to update catalog with topic failed: ".to_string(),
now
),
}]
);
}
}

View File

@ -0,0 +1,2 @@
#!/bin/bash
exit 1

View File

@ -0,0 +1,3 @@
#!/bin/bash
echo 42
echo 93

View File

@ -0,0 +1,2 @@
#!/bin/bash
echo 42

View File

@ -13,6 +13,7 @@ db = { path = "../db" }
dml = { path = "../dml" }
generated_types = { path = "../generated_types" }
influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format", "write_lp"] }
influxdb_storage_client = { path = "../influxdb_storage_client" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
ingester = { path = "../ingester" }
internal_types = { path = "../internal_types" }
@ -36,6 +37,7 @@ query = { path = "../query" }
read_buffer = { path = "../read_buffer" }
router = { path = "../router" }
router2 = { path = "../router2" }
schema = { path = "../schema" }
server = { path = "../server" }
time = { path = "../time" }
trace = { path = "../trace" }

View File

@ -1,11 +1,11 @@
use std::sync::Arc;
use std::{collections::BTreeMap, sync::Arc};
use data_types::write_buffer::WriteBufferConnection;
use time::SystemProvider;
use trace::TraceCollector;
use write_buffer::{
config::WriteBufferConfigFactory,
core::{WriteBufferError, WriteBufferWriting},
core::{WriteBufferError, WriteBufferReading, WriteBufferWriting},
};
#[derive(Debug, clap::Parser)]
@ -27,27 +27,114 @@ pub struct WriteBufferConfig {
default_value = "iox-shared"
)]
pub(crate) topic: String,
/// Write buffer connection config.
///
/// The concrete options depend on the write buffer type.
///
/// Command line arguments are passed as `--write-buffer-connection-config key1=value1 key2=value2` or
/// `--write-buffer-connection-config key1=value1,key2=value2`.
///
/// Environment variables are passed as `key1=value1,key2=value2,...`.
#[clap(
long = "--write-buffer-connection-config",
env = "INFLUXDB_IOX_WRITE_BUFFER_CONNECTION_CONFIG",
default_value = "",
multiple_values = true,
use_delimiter = true
)]
pub(crate) connection_config: Vec<String>,
}
impl WriteBufferConfig {
/// Initialize the [`WriteBufferWriting`].
pub async fn init_write_buffer(
/// Initialize a [`WriteBufferWriting`].
pub async fn writing(
&self,
metrics: Arc<metric::Registry>,
trace_collector: Option<Arc<dyn TraceCollector>>,
) -> Result<Arc<dyn WriteBufferWriting>, WriteBufferError> {
let write_buffer_config = WriteBufferConnection {
let conn = self.conn();
let factory = Self::factory(metrics);
factory
.new_config_write(&self.topic, trace_collector.as_ref(), &conn)
.await
}
/// Initialize a [`WriteBufferReading`].
pub async fn reading(
&self,
metrics: Arc<metric::Registry>,
trace_collector: Option<Arc<dyn TraceCollector>>,
) -> Result<Arc<dyn WriteBufferReading>, WriteBufferError> {
let conn = self.conn();
let factory = Self::factory(metrics);
factory
.new_config_read(&self.topic, trace_collector.as_ref(), &conn)
.await
}
fn connection_config(&self) -> BTreeMap<String, String> {
let mut cfg = BTreeMap::new();
for s in &self.connection_config {
if s.is_empty() {
continue;
}
if let Some((k, v)) = s.split_once('=') {
cfg.insert(k.to_owned(), v.to_owned());
} else {
cfg.insert(s.clone(), String::from(""));
}
}
cfg
}
fn conn(&self) -> WriteBufferConnection {
WriteBufferConnection {
type_: self.type_.clone(),
connection: self.connection_string.clone(),
connection_config: Default::default(),
connection_config: self.connection_config(),
creation_config: None,
};
}
}
let write_buffer =
WriteBufferConfigFactory::new(Arc::new(SystemProvider::default()), metrics);
let write_buffer = write_buffer
.new_config_write(&self.topic, trace_collector.as_ref(), &write_buffer_config)
.await?;
Ok(write_buffer)
fn factory(metrics: Arc<metric::Registry>) -> WriteBufferConfigFactory {
WriteBufferConfigFactory::new(Arc::new(SystemProvider::default()), metrics)
}
}
#[cfg(test)]
mod tests {
use clap::StructOpt;
use super::*;
#[test]
fn test_connection_config() {
let cfg = WriteBufferConfig::try_parse_from([
"my_binary",
"--write-buffer",
"kafka",
"--write-buffer-addr",
"localhost:1234",
"--write-buffer-connection-config",
"foo=bar",
"",
"x=",
"y",
"foo=baz",
"so=many=args",
])
.unwrap();
let actual = cfg.connection_config();
let expected = BTreeMap::from([
(String::from("foo"), String::from("baz")),
(String::from("x"), String::from("")),
(String::from("y"), String::from("")),
(String::from("so"), String::from("many=args")),
]);
assert_eq!(actual, expected);
}
}

View File

@ -44,8 +44,9 @@ pub async fn command(config: Config) -> Result<(), Error> {
match config.command {
Command::Update(update) => {
let catalog = update.catalog_dsn.get_catalog("cli").await?;
let topics_repo = catalog.kafka_topics();
let topic = topics_repo.create_or_get(&update.db_name).await?;
let mut txn = catalog.start_transaction().await?;
let topic = txn.kafka_topics().create_or_get(&update.db_name).await?;
txn.commit().await?;
println!("{}", topic.id);
Ok(())
}

View File

@ -12,9 +12,9 @@ use crate::{
},
},
};
use data_types::write_buffer::WriteBufferConnection;
use ingester::{
handler::IngestHandlerImpl,
lifecycle::LifecycleConfig,
server::{grpc::GrpcDelegate, http::HttpDelegate, IngesterServer},
};
use iox_catalog::interface::KafkaPartition;
@ -23,9 +23,8 @@ use observability_deps::tracing::*;
use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::sync::Arc;
use std::time::Duration;
use thiserror::Error;
use time::TimeProvider;
use write_buffer::config::WriteBufferConfigFactory;
#[derive(Debug, Error)]
pub enum Error {
@ -50,6 +49,9 @@ pub enum Error {
#[error("sequencer record not found for partition {0}")]
SequencerNotFound(KafkaPartition),
#[error("error initializing ingester: {0}")]
Ingester(#[from] ingester::handler::Error),
#[error("error initializing write buffer {0}")]
WriteBuffer(#[from] write_buffer::core::WriteBufferError),
}
@ -93,6 +95,45 @@ pub struct Config {
env = "INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END"
)]
pub write_buffer_partition_range_end: i32,
/// The ingester will continue to pull data from Kafka and buffer it as long as
/// the buffered data is below this size. If it hits this size it will pause
/// ingest from Kafka until persistence goes below this threshold.
#[clap(
long = "--pause-ingest-size-bytes",
env = "INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES"
)]
pub pause_ingest_size_bytes: usize,
/// Once the ingester crosses this threshold of data buffered across
/// all sequencers, it will pick the largest partitions and persist
/// them until it falls below this threshold. An ingester running in
/// a steady state is expected to take up this much memory.
#[clap(
long = "--persist-memory-threshold-bytes",
env = "INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES"
)]
pub persist_memory_threshold_bytes: usize,
/// If an individual partition crosses this size threshold, it will be persisted.
/// The default value is 300MB (in bytes).
#[clap(
long = "--persist-partition-size-threshold-bytes",
env = "INFLUXDB_IOX_PERSIST_PARTITION_SIZE_THRESHOLD_BYTES",
default_value = "314572800"
)]
pub persist_partition_size_threshold_bytes: usize,
/// If a partition has had data buffered for longer than this period of time
/// it will be persisted. This puts an upper bound on how far back the
/// ingester may need to read in Kafka on restart or recovery. The default value
/// is 30 minutes (in seconds).
#[clap(
long = "--persist-partition-age-threshold-seconds",
env = "INFLUXDB_IOX_PERSIST_PARTITION_AGE_THRESHOLD_SECONDS",
default_value = "1800"
)]
pub persist_partition_age_threshold_seconds: u64,
}
pub async fn command(config: Config) -> Result<()> {
@ -100,11 +141,12 @@ pub async fn command(config: Config) -> Result<()> {
let catalog = config.catalog_dsn.get_catalog("ingester").await?;
let kafka_topic = catalog
let mut txn = catalog.start_transaction().await?;
let kafka_topic = txn
.kafka_topics()
.get_by_name(&config.write_buffer_config.topic)
.await?
.ok_or(Error::KafkaTopicNotFound(config.write_buffer_config.topic))?;
.ok_or_else(|| Error::KafkaTopicNotFound(config.write_buffer_config.topic.clone()))?;
if config.write_buffer_partition_range_start > config.write_buffer_partition_range_end {
return Err(Error::KafkaRange);
@ -122,46 +164,45 @@ pub async fn command(config: Config) -> Result<()> {
let mut sequencers = BTreeMap::new();
for k in kafka_partitions {
let s = catalog
let s = txn
.sequencers()
.get_by_topic_id_and_partition(kafka_topic.id, k)
.await?
.ok_or(Error::SequencerNotFound(k))?;
sequencers.insert(k, s);
}
txn.commit().await?;
let metric_registry: Arc<metric::Registry> = Default::default();
let trace_collector = common_state.trace_collector();
let time_provider: Arc<dyn TimeProvider> = Arc::new(time::SystemProvider::new());
let write_buffer_factory =
WriteBufferConfigFactory::new(Arc::clone(&time_provider), Arc::clone(&metric_registry));
let write_buffer_cfg = WriteBufferConnection {
type_: config.write_buffer_config.type_,
connection: config.write_buffer_config.connection_string,
connection_config: Default::default(),
creation_config: None,
};
let write_buffer = write_buffer_factory
.new_config_read(
&kafka_topic.name,
trace_collector.as_ref(),
&write_buffer_cfg,
)
let write_buffer = config
.write_buffer_config
.reading(Arc::clone(&metric_registry), trace_collector.clone())
.await?;
let ingest_handler = Arc::new(IngestHandlerImpl::new(
let lifecycle_config = LifecycleConfig::new(
config.pause_ingest_size_bytes,
config.persist_memory_threshold_bytes,
config.persist_partition_size_threshold_bytes,
Duration::from_secs(config.persist_partition_age_threshold_seconds),
);
let ingest_handler = Arc::new(
IngestHandlerImpl::new(
lifecycle_config,
kafka_topic,
sequencers,
catalog,
object_store,
write_buffer,
&metric_registry,
));
)
.await?,
);
let http = HttpDelegate::new(Arc::clone(&ingest_handler));
let grpc = GrpcDelegate::new(ingest_handler);
let ingester = IngesterServer::new(http, grpc);
let ingester = IngesterServer::new(metric_registry, http, grpc);
let server_type = Arc::new(IngesterServerType::new(ingester, &common_state));
info!("starting ingester");

View File

@ -111,7 +111,8 @@ pub async fn command(config: Config) -> Result<()> {
// This code / auto-creation is for architecture testing purposes only - a
// prod deployment would expect namespaces to be explicitly created and this
// layer would be removed.
let topic_id = catalog
let mut txn = catalog.start_transaction().await?;
let topic_id = txn
.kafka_topics()
.get_by_name(&config.write_buffer_config.topic)
.await?
@ -122,7 +123,7 @@ pub async fn command(config: Config) -> Result<()> {
&config.write_buffer_config.topic
)
});
let query_id = catalog
let query_id = txn
.query_pools()
.create_or_get(&config.query_pool_name)
.await
@ -133,6 +134,8 @@ pub async fn command(config: Config) -> Result<()> {
&config.write_buffer_config.topic, e
)
});
txn.commit().await?;
let handler_stack = NamespaceAutocreation::new(
catalog,
ns_cache,
@ -169,7 +172,7 @@ async fn init_write_buffer(
let write_buffer = Arc::new(
config
.write_buffer_config
.init_write_buffer(metrics, trace_collector)
.writing(metrics, trace_collector)
.await?,
);

View File

@ -102,7 +102,13 @@ async fn load_remote_system_tables(
connection: Connection,
) -> Result<()> {
// all prefixed with "system."
let table_names = vec!["chunks", "chunk_columns", "columns", "operations"];
let table_names = vec![
"chunks",
"chunk_columns",
"columns",
"operations",
"queries",
];
let start = Instant::now();

View File

@ -1,19 +1,35 @@
pub(crate) mod request;
pub(crate) mod response;
use std::num::NonZeroU64;
use snafu::{ResultExt, Snafu};
use tonic::Status;
use generated_types::Predicate;
use influxdb_storage_client::{connection::Connection, Client, OrgAndBucket};
use influxrpc_parser::predicate;
use time;
use snafu::{ResultExt, Snafu};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Unable to parse timestamp '{:?}'", t))]
TimestampParseError { t: String },
pub enum ParseError {
#[snafu(display("unable to parse timestamp '{:?}'", t))]
Timestamp { t: String },
#[snafu(display("Unable to parse predicate: {:?}", source))]
PredicateParseError { source: predicate::Error },
#[snafu(display("unable to parse database name '{:?}'", db_name))]
DBName { db_name: String },
#[snafu(display("unable to parse predicate: {:?}", source))]
Predicate { source: predicate::Error },
#[snafu(display("server error: {:?}", source))]
ServerError { source: Status },
#[snafu(display("error building response: {:?}", source))]
ResponseError { source: response::Error },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
pub type Result<T, E = ParseError> = std::result::Result<T, E>;
/// Craft and submit different types of storage read requests
#[derive(Debug, clap::Parser)]
@ -21,6 +37,10 @@ pub struct Config {
#[clap(subcommand)]
command: Command,
/// The name of the database
#[clap(parse(try_from_str = parse_db_name))]
db_name: OrgAndBucket,
/// The requested start time (inclusive) of the time-range (also accepts RFC3339 format).
#[clap(long, default_value = "-9223372036854775806", parse(try_from_str = parse_range))]
start: i64,
@ -37,12 +57,12 @@ pub struct Config {
// Attempts to parse either a stringified `i64` value or, alternatively, an
// RFC3339 formatted timestamp into an `i64` value representing nanoseconds
// since the epoch.
fn parse_range(s: &str) -> Result<i64, Error> {
fn parse_range(s: &str) -> Result<i64, ParseError> {
match s.parse::<i64>() {
Ok(v) => Ok(v),
Err(_) => {
// try to parse timestamp
let t = time::Time::from_rfc3339(s).or_else(|_| TimestampParseSnafu { t: s }.fail())?;
let t = time::Time::from_rfc3339(s).or_else(|_| TimestampSnafu { t: s }.fail())?;
Ok(t.timestamp_nanos())
}
}
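A small sketch of the two accepted forms (the values are illustrative only):
#[test]
fn parse_range_accepts_i64_or_rfc3339() {
    // A plain integer is passed through unchanged.
    assert_eq!(parse_range("123").unwrap(), 123);
    // An RFC3339 timestamp is converted to nanoseconds since the epoch.
    assert!(parse_range("1996-12-19T16:39:57+00:00").is_ok());
    // Anything else is rejected.
    assert!(parse_range("not-a-time").is_err());
}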
@ -50,30 +70,90 @@ fn parse_range(s: &str) -> Result<i64, Error> {
// Attempts to parse the optional predicate into a `Predicate` RPC node. This
// node is then used as part of a read request.
fn parse_predicate(expr: &str) -> Result<Predicate, Error> {
fn parse_predicate(expr: &str) -> Result<Predicate, ParseError> {
if expr.is_empty() {
return Ok(Predicate::default());
}
predicate::expr_to_rpc_predicate(expr).context(PredicateParseSnafu)
predicate::expr_to_rpc_predicate(expr).context(PredicateSnafu)
}
// Attempts to parse the database name into an org ID and a bucket ID.
fn parse_db_name(db_name: &str) -> Result<OrgAndBucket, ParseError> {
let parts = db_name.split('_').collect::<Vec<_>>();
if parts.len() != 2 {
return DBNameSnafu {
db_name: db_name.to_owned(),
}
.fail();
}
let org_id = usize::from_str_radix(parts[0], 16).map_err(|_| ParseError::DBName {
db_name: db_name.to_owned(),
})?;
let bucket_id = usize::from_str_radix(parts[1], 16).map_err(|_| ParseError::DBName {
db_name: db_name.to_owned(),
})?;
Ok(OrgAndBucket::new(
NonZeroU64::new(org_id as u64).ok_or_else(|| ParseError::DBName {
db_name: db_name.to_owned(),
})?,
NonZeroU64::new(bucket_id as u64).ok_or_else(|| ParseError::DBName {
db_name: db_name.to_owned(),
})?,
))
}
/// All possible subcommands for storage
#[derive(Debug, clap::Parser)]
enum Command {
/// Issue a read_filter request
ReadFilter(ReadFilter),
ReadFilter,
TagValues(TagValues),
}
/// Issue a tag_values request
#[derive(Debug, clap::Parser)]
struct ReadFilter {}
struct TagValues {
// The tag key to interrogate for tag values.
tag_key: String,
}
/// Create and issue read request
pub async fn command(config: Config) -> Result<()> {
// TODO(edd): handle command/config and execute request
println!("Unimplemented: config is {:?}", config);
Ok(())
pub async fn command(connection: Connection, config: Config) -> Result<()> {
let mut client = influxdb_storage_client::Client::new(connection);
// convert predicate with no root node into None.
let predicate = config.predicate.root.is_some().then(|| config.predicate);
let source = Client::read_source(&config.db_name, 0);
match config.command {
Command::ReadFilter => {
let result = client
.read_filter(request::read_filter(
source,
config.start,
config.stop,
predicate,
))
.await
.context(ServerSnafu)?;
response::pretty_print_frames(&result).context(ResponseSnafu)
}
Command::TagValues(tv) => {
let result = client
.tag_values(request::tag_values(
source,
config.start,
config.stop,
predicate,
tv.tag_key,
))
.await
.context(ServerSnafu)?;
response::pretty_print_strings(result).context(ResponseSnafu)
}
}
}
#[cfg(test)]

View File

@ -0,0 +1,58 @@
pub mod generated_types {
pub use generated_types::influxdata::platform::storage::*;
}
use self::generated_types::*;
use super::response::{
tag_key_is_field, tag_key_is_measurement, FIELD_TAG_KEY_BIN, MEASUREMENT_TAG_KEY_BIN,
};
use ::generated_types::google::protobuf::*;
pub fn read_filter(
org_bucket: Any,
start: i64,
stop: i64,
predicate: std::option::Option<Predicate>,
) -> ReadFilterRequest {
generated_types::ReadFilterRequest {
predicate,
read_source: Some(org_bucket),
range: Some(TimestampRange { start, end: stop }),
key_sort: read_filter_request::KeySort::Unspecified as i32, // IOx doesn't support any other sort
tag_key_meta_names: TagKeyMetaNames::Text as i32,
}
}
pub fn tag_values(
org_bucket: Any,
start: i64,
stop: i64,
predicate: std::option::Option<Predicate>,
tag_key: String,
) -> TagValuesRequest {
let tag_key = if tag_key_is_measurement(tag_key.as_bytes()) {
MEASUREMENT_TAG_KEY_BIN.to_vec()
} else if tag_key_is_field(tag_key.as_bytes()) {
FIELD_TAG_KEY_BIN.to_vec()
} else {
tag_key.as_bytes().to_vec()
};
generated_types::TagValuesRequest {
predicate,
tags_source: Some(org_bucket),
range: Some(TimestampRange { start, end: stop }),
tag_key,
}
}
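A hedged sketch of the key rewrite performed above; the org/bucket values are hypothetical, it assumes Client, OrgAndBucket and NonZeroU64 are imported, and that tag_key_is_measurement recognizes the literal "_measurement" key:
#[test]
fn tag_values_rewrites_measurement_key() {
    let db = OrgAndBucket::new(
        NonZeroU64::new(0x10).unwrap(),
        NonZeroU64::new(0xff).unwrap(),
    );
    let source = Client::read_source(&db, 0);
    let req = tag_values(source, 0, i64::MAX, None, "_measurement".to_string());
    // The special "_measurement" key is replaced by its binary form.
    assert_eq!(req.tag_key, MEASUREMENT_TAG_KEY_BIN.to_vec());
}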
// TODO Add the following helpers for building requests:
//
// * read_group
// * read_window_aggregate
// * tag_keys
// * tag_values_with_measurement_and_key
// * measurement_names
// * measurement_tag_keys
// * measurement_tag_values
// * measurement_fields

View File

@ -0,0 +1,805 @@
use arrow::{record_batch::RecordBatch, util::pretty::print_batches};
use hashbrown::HashMap;
use std::{
collections::{BTreeMap, BTreeSet},
iter,
string::FromUtf8Error,
sync::Arc,
};
use generated_types::{
read_response::{frame::Data, DataType, SeriesFrame},
Tag,
};
use schema::{builder::SchemaBuilder, InfluxColumnType, InfluxFieldType, Schema};
use snafu::{ResultExt, Snafu};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("arrow error: {:?}", source))]
Arrow { source: arrow::error::ArrowError },
#[snafu(display("frame type currently unsupported: {:?}", frame))]
UnsupportedFrameType { frame: String },
#[snafu(display("tag keys must be valid UTF-8: {:?}", source))]
InvalidTagKey { source: FromUtf8Error },
#[snafu(display("tag values must be valid UTF-8: {:?}", source))]
InvalidTagValue { source: FromUtf8Error },
#[snafu(display("measurement name must be valid UTF-8: {:?}", source))]
InvalidMeasurementName { source: FromUtf8Error },
#[snafu(display("unable to build schema: {:?}", source))]
SchemaBuilding { source: schema::builder::Error },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
// Prints the provided data frames in a tabular format grouped into tables per
// distinct measurement.
pub fn pretty_print_frames(frames: &[Data]) -> Result<()> {
let rbs = frames_to_record_batches(frames)?;
for (k, rb) in rbs {
println!("\n_measurement: {}", k);
println!("rows: {:?}", &rb.num_rows());
print_batches(&[rb]).context(ArrowSnafu)?;
println!("\n");
}
Ok(())
}
// Prints the provided set of strings in a tabular format.
pub fn pretty_print_strings(values: Vec<String>) -> Result<()> {
let schema = SchemaBuilder::new()
.influx_field("values", InfluxFieldType::String)
.build()
.context(SchemaBuildingSnafu)?;
let arrow_schema: arrow::datatypes::SchemaRef = schema.into();
let rb_columns: Vec<Arc<dyn arrow::array::Array>> =
vec![Arc::new(arrow::array::StringArray::from(
values.iter().map(|x| Some(x.as_str())).collect::<Vec<_>>(),
))];
let rb = RecordBatch::try_new(arrow_schema, rb_columns).context(ArrowSnafu)?;
println!("\ntag values: {:?}", &rb.num_rows());
print_batches(&[rb]).context(ArrowSnafu)?;
println!("\n");
Ok(())
}
// This function takes a set of InfluxRPC data frames and converts them into
// Arrow record batches, which are suitable for pretty printing.
fn frames_to_record_batches(frames: &[Data]) -> Result<BTreeMap<String, RecordBatch>> {
// Run through all the frames once to build the schema of each table we need
// to build as a record batch.
let mut table_column_mapping = determine_tag_columns(frames);
let mut all_tables = BTreeMap::new();
let mut current_table_frame: Option<(IntermediateTable, SeriesFrame)> = None;
if frames.is_empty() {
return Ok(all_tables);
}
for frame in frames {
match frame {
generated_types::read_response::frame::Data::Group(_) => {
return UnsupportedFrameTypeSnafu {
frame: "group_frame".to_owned(),
}
.fail();
}
generated_types::read_response::frame::Data::Series(sf) => {
let cur_frame_measurement = &sf.tags[0].value;
// First series frame in result set.
if current_table_frame.is_none() {
let table = IntermediateTable::try_new(
table_column_mapping
.remove(cur_frame_measurement)
.expect("table column mappings exists for measurement"),
)?;
current_table_frame = Some((table, sf.clone()));
continue;
}
// Subsequent series frames in results.
let (mut current_table, prev_series_frame) = current_table_frame.take().unwrap();
// Series frame has moved on to a different measurement. Push
// this table into a record batch and onto final results, then
// create a new table.
if measurement(&prev_series_frame) != cur_frame_measurement {
let rb: RecordBatch = current_table.try_into()?;
all_tables.insert(
String::from_utf8(measurement(&prev_series_frame).to_owned())
.context(InvalidMeasurementNameSnafu)?,
rb,
);
// Initialise next intermediate table to fill.
current_table = IntermediateTable::try_new(
table_column_mapping
.remove(cur_frame_measurement)
.expect("table column mappings exists for measurement"),
)?;
}
// Put current table (which may have been replaced with a new
// table if _measurement has changed) and series frame back. The
// field key can change on each series frame, so it's important
// to update it each time we see a new series frame, so that the
// value frames know where to push their data.
current_table_frame = Some((current_table, sf.clone()));
// no new column values written so no need to pad.
continue;
}
generated_types::read_response::frame::Data::FloatPoints(f) => {
// Get field key associated with previous series frame.
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let column = current_table.field_column(field_name(prev_series_frame));
let values = f.values.iter().copied().map(Some).collect::<Vec<_>>();
column.extend_f64(&values);
let time_column = &mut current_table.time_column;
time_column.extend_from_slice(&f.timestamps);
}
generated_types::read_response::frame::Data::IntegerPoints(f) => {
// Get field key associated with previous series frame.
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let column = current_table.field_column(field_name(prev_series_frame));
let values = f.values.iter().copied().map(Some).collect::<Vec<_>>();
column.extend_i64(&values);
let time_column = &mut current_table.time_column;
time_column.extend_from_slice(&f.timestamps);
}
generated_types::read_response::frame::Data::UnsignedPoints(f) => {
// Get field key associated with previous series frame.
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let column = current_table.field_column(field_name(prev_series_frame));
let values = f.values.iter().copied().map(Some).collect::<Vec<_>>();
column.extend_u64(&values);
let time_column = &mut current_table.time_column;
time_column.extend_from_slice(&f.timestamps);
}
generated_types::read_response::frame::Data::BooleanPoints(f) => {
// Get field key associated with previous series frame.
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let column = current_table.field_column(field_name(prev_series_frame));
let values = f.values.iter().copied().map(Some).collect::<Vec<_>>();
column.extend_bool(&values);
let time_column = &mut current_table.time_column;
time_column.extend_from_slice(&f.timestamps);
}
generated_types::read_response::frame::Data::StringPoints(f) => {
// Get field key associated with previous series frame.
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let column = current_table.field_column(field_name(prev_series_frame));
let values = f
.values
.iter()
.map(|x| Some(x.to_owned()))
.collect::<Vec<_>>();
column.extend_string(&values);
let time_column = &mut current_table.time_column;
time_column.extend_from_slice(&f.timestamps);
}
};
// If the current frame contained field values/timestamps then we need to
// pad all the other columns with either values or NULL so that all
// columns remain the same length.
//
let (current_table, prev_series_frame) = current_table_frame.as_mut().unwrap();
let max_rows = current_table.max_rows();
// Pad all tag columns with keys present in the previous series frame
// with identical values.
for Tag { key, value } in &prev_series_frame.tags {
if tag_key_is_measurement(key) || tag_key_is_field(key) {
continue;
}
let idx = current_table
.tag_columns
.get(key)
.expect("tag column mapping to be present");
let column = &mut current_table.column_data[*idx];
let column_rows = column.len();
assert!(max_rows >= column_rows);
column.pad_tag(
String::from_utf8(value.to_owned()).context(InvalidTagValueSnafu)?,
max_rows - column_rows,
);
}
// Pad all tag columns that were not present in the previous series
// frame with NULL.
for (_, &idx) in &current_table.tag_columns {
let column = &mut current_table.column_data[idx];
let column_rows = column.len();
if column_rows < max_rows {
column.pad_none(max_rows - column_rows);
}
}
// Pad all field columns with NULL such that they're the same length as
// the largest column.
for (_, &idx) in &current_table.field_columns {
let column = &mut current_table.column_data[idx];
let column_rows = column.len();
if column_rows < max_rows {
column.pad_none(max_rows - column_rows);
}
}
}
// Convert and insert current table
let (current_table, prev_series_frame) = current_table_frame.take().unwrap();
let rb: RecordBatch = current_table.try_into()?;
all_tables.insert(
String::from_utf8(measurement(&prev_series_frame).to_owned())
.context(InvalidMeasurementNameSnafu)?,
rb,
);
Ok(all_tables)
}
#[derive(Debug)]
enum ColumnData {
Float(Vec<Option<f64>>),
Integer(Vec<Option<i64>>),
Unsigned(Vec<Option<u64>>),
Boolean(Vec<Option<bool>>),
String(Vec<Option<String>>),
Tag(Vec<Option<String>>),
}
impl ColumnData {
fn pad_tag(&mut self, value: String, additional: usize) {
if let Self::Tag(data) = self {
data.extend(iter::repeat(Some(value)).take(additional));
} else {
unreachable!("can't pad strings into {:?} column", self)
}
}
fn pad_none(&mut self, additional: usize) {
match self {
ColumnData::Float(data) => data.extend(iter::repeat(None).take(additional)),
ColumnData::Integer(data) => data.extend(iter::repeat(None).take(additional)),
ColumnData::Unsigned(data) => data.extend(iter::repeat(None).take(additional)),
ColumnData::Boolean(data) => data.extend(iter::repeat(None).take(additional)),
ColumnData::String(data) => data.extend(iter::repeat(None).take(additional)),
ColumnData::Tag(data) => data.extend(iter::repeat(None).take(additional)),
}
}
fn extend_f64(&mut self, arr: &[Option<f64>]) {
if let Self::Float(data) = self {
data.extend_from_slice(arr);
} else {
unreachable!("can't extend {:?} column with floats", self)
}
}
fn extend_i64(&mut self, arr: &[Option<i64>]) {
if let Self::Integer(data) = self {
data.extend_from_slice(arr);
} else {
unreachable!("can't extend {:?} column with integers", self)
}
}
fn extend_u64(&mut self, arr: &[Option<u64>]) {
if let Self::Unsigned(data) = self {
data.extend_from_slice(arr);
} else {
unreachable!("can't extend {:?} column with unsigned integers", self)
}
}
fn extend_bool(&mut self, arr: &[Option<bool>]) {
if let Self::Boolean(data) = self {
data.extend_from_slice(arr);
} else {
unreachable!("can't extend {:?} column with bools", self)
}
}
fn extend_string(&mut self, arr: &[Option<String>]) {
if let Self::String(data) = self {
data.extend_from_slice(arr);
} else {
unreachable!("can't extend {:?} column with strings", self)
}
}
fn len(&self) -> usize {
match self {
ColumnData::Float(arr) => arr.len(),
ColumnData::Integer(arr) => arr.len(),
ColumnData::Unsigned(arr) => arr.len(),
ColumnData::Boolean(arr) => arr.len(),
ColumnData::String(arr) => arr.len(),
ColumnData::Tag(arr) => arr.len(),
}
}
}
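// A minimal sketch (not part of the original change) of the padding invariant
// used above: after `pad_none`, a shorter column ends up with the same number
// of rows as the longest column in the table. The literal values are
// illustrative only.
#[cfg(test)]
mod column_padding_example {
    use super::*;

    #[test]
    fn pad_none_aligns_column_lengths() {
        let mut temp = ColumnData::Float(vec![Some(1.1), Some(2.2)]);
        let voltage = ColumnData::Integer(vec![Some(1), Some(2), Some(3), Some(4)]);

        // Pad the shorter column with NULLs up to the longest column's length.
        let max_rows = temp.len().max(voltage.len());
        temp.pad_none(max_rows - temp.len());

        assert_eq!(temp.len(), voltage.len());
    }
}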
#[derive(Debug)]
struct IntermediateTable {
schema: Schema,
// constant-time access to the correct column from a tag or field key
tag_columns: HashMap<Vec<u8>, usize>,
field_columns: HashMap<Vec<u8>, usize>,
column_data: Vec<ColumnData>,
time_column: Vec<i64>,
}
impl IntermediateTable {
fn try_new(table_columns: TableColumns) -> Result<Self, Error> {
let mut schema_builder = SchemaBuilder::new();
let mut tag_columns = HashMap::new();
let mut field_columns = HashMap::new();
let mut column_data = vec![];
// First add the tag columns to the schema and column data.
for tag_key in table_columns.tag_columns {
let column_name = String::from_utf8(tag_key.clone()).context(InvalidTagKeySnafu)?;
schema_builder.influx_column(&column_name, InfluxColumnType::Tag);
// track position of column
tag_columns.insert(tag_key, column_data.len());
column_data.push(ColumnData::Tag(vec![]));
}
// Then add the field columns to the schema and column data.
for (field_key, data_type) in table_columns.field_columns {
let column_name = String::from_utf8(field_key.clone()).context(InvalidTagKeySnafu)?;
schema_builder.influx_column(
&column_name,
InfluxColumnType::Field(match data_type {
DataType::Float => InfluxFieldType::Float,
DataType::Integer => InfluxFieldType::Integer,
DataType::Unsigned => InfluxFieldType::UInteger,
DataType::Boolean => InfluxFieldType::Boolean,
DataType::String => InfluxFieldType::String,
}),
);
// track position of column
field_columns.insert(field_key, column_data.len());
column_data.push(match data_type {
DataType::Float => ColumnData::Float(vec![]),
DataType::Integer => ColumnData::Integer(vec![]),
DataType::Unsigned => ColumnData::Unsigned(vec![]),
DataType::Boolean => ColumnData::Boolean(vec![]),
DataType::String => ColumnData::String(vec![]),
});
}
// Finally add the timestamp column.
schema_builder.influx_column("time", InfluxColumnType::Timestamp);
let time_column = vec![];
Ok(Self {
schema: schema_builder.build().context(SchemaBuildingSnafu)?,
tag_columns,
field_columns,
column_data,
time_column,
})
}
fn field_column(&mut self, field: &[u8]) -> &mut ColumnData {
let idx = self
.field_columns
.get(field)
.expect("field column mapping to be present");
&mut self.column_data[*idx]
}
// Returns the number of rows in the largest column. Useful for padding the
// rest of the columns out.
fn max_rows(&self) -> usize {
self.column_data
.iter()
.map(|c| c.len())
.max()
.unwrap_or_default()
}
}
impl TryFrom<IntermediateTable> for RecordBatch {
type Error = Error;
fn try_from(table: IntermediateTable) -> Result<Self, Self::Error> {
let arrow_schema: arrow::datatypes::SchemaRef = table.schema.into();
let mut rb_columns: Vec<Arc<dyn arrow::array::Array>> =
Vec::with_capacity(&table.column_data.len() + 1); // + time column
for col in table.column_data {
match col {
ColumnData::Integer(v) => {
rb_columns.push(Arc::new(arrow::array::Int64Array::from(v)));
}
ColumnData::Unsigned(v) => {
rb_columns.push(Arc::new(arrow::array::UInt64Array::from(v)));
}
ColumnData::Float(v) => {
rb_columns.push(Arc::new(arrow::array::Float64Array::from(v)));
}
ColumnData::String(v) => {
rb_columns.push(Arc::new(arrow::array::StringArray::from(
v.iter().map(|s| s.as_deref()).collect::<Vec<_>>(),
)));
}
ColumnData::Boolean(v) => {
rb_columns.push(Arc::new(arrow::array::BooleanArray::from(v)));
}
ColumnData::Tag(v) => {
rb_columns.push(Arc::new(arrow::array::DictionaryArray::<
arrow::datatypes::Int32Type,
>::from_iter(
v.iter().map(|s| s.as_deref())
)));
}
}
}
// time column
rb_columns.push(Arc::new(arrow::array::TimestampNanosecondArray::from(
table.time_column,
)));
Self::try_new(arrow_schema, rb_columns).context(ArrowSnafu)
}
}
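// A small sketch (not part of the original change) of the conversion above: an
// empty `IntermediateTable` built from a hypothetical one-tag/one-field schema
// becomes an empty `RecordBatch` with the implicit time column appended.
#[cfg(test)]
mod intermediate_table_conversion_example {
    use super::*;

    #[test]
    fn empty_table_converts_to_empty_record_batch() {
        let mut columns = TableColumns::default();
        columns.tag_columns.insert(b"host".to_vec());
        columns.field_columns.insert(b"temp".to_vec(), DataType::Float);

        let table = IntermediateTable::try_new(columns).expect("schema should build");
        let rb: RecordBatch = table.try_into().expect("conversion should succeed");

        assert_eq!(rb.num_rows(), 0);
        // "host", "temp" and the implicit "time" column.
        assert_eq!(rb.num_columns(), 3);
    }
}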
// These constants describe known values for the keys associated with
// measurements and fields.
const MEASUREMENT_TAG_KEY_TEXT: [u8; 12] = [
b'_', b'm', b'e', b'a', b's', b'u', b'r', b'e', b'm', b'e', b'n', b't',
];
pub(crate) const MEASUREMENT_TAG_KEY_BIN: [u8; 1] = [0_u8];
const FIELD_TAG_KEY_TEXT: [u8; 6] = [b'_', b'f', b'i', b'e', b'l', b'd'];
pub(crate) const FIELD_TAG_KEY_BIN: [u8; 1] = [255_u8];
// Store a collection of column names and types for a single table (measurement).
#[derive(Debug, Default, PartialEq, Eq)]
struct TableColumns {
tag_columns: BTreeSet<Vec<u8>>,
field_columns: BTreeMap<Vec<u8>, DataType>,
}
// Given a set of data frames, determine from the series frames within the set
// the tag and field columns for each distinct table (measurement).
fn determine_tag_columns(frames: &[Data]) -> BTreeMap<Vec<u8>, TableColumns> {
let mut schema: BTreeMap<Vec<u8>, TableColumns> = BTreeMap::new();
for frame in frames {
if let Data::Series(sf) = frame {
assert!(!sf.tags.is_empty(), "expected _measurement and _field tags");
assert!(tag_key_is_measurement(&sf.tags[0].key));
// PERF: avoid clone of value
let measurement_name = sf.tags[0].value.clone();
let table = schema.entry(measurement_name).or_default();
for Tag { key, value } in sf.tags.iter().skip(1) {
if tag_key_is_field(key) {
table.field_columns.insert(value.clone(), sf.data_type());
continue;
}
// PERF: avoid clone of key
table.tag_columns.insert(key.clone()); // Add column to table schema
}
}
}
schema
}
// Extract a reference to the measurement name from a Series frame.
fn measurement(frame: &SeriesFrame) -> &Vec<u8> {
assert!(tag_key_is_measurement(&frame.tags[0].key));
&frame.tags[0].value
}
// Extract a reference to the field name from a Series frame.
fn field_name(frame: &SeriesFrame) -> &Vec<u8> {
let idx = frame.tags.len() - 1;
assert!(tag_key_is_field(&frame.tags[idx].key));
&frame.tags[idx].value
}
pub(crate) fn tag_key_is_measurement(key: &[u8]) -> bool {
(key == MEASUREMENT_TAG_KEY_TEXT) || (key == MEASUREMENT_TAG_KEY_BIN)
}
pub(crate) fn tag_key_is_field(key: &[u8]) -> bool {
(key == FIELD_TAG_KEY_TEXT) || (key == FIELD_TAG_KEY_BIN)
}
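// A brief sketch (not part of the original change) tying the helpers above
// together: derive the per-table schema from a single series frame and build
// an empty `IntermediateTable` from it. The tag and field names are
// illustrative only.
#[cfg(test)]
mod determine_tag_columns_example {
    use super::*;
    use generated_types::read_response::SeriesFrame;

    #[test]
    fn series_frame_to_intermediate_table() {
        let frame = Data::Series(SeriesFrame {
            tags: vec![
                Tag {
                    key: MEASUREMENT_TAG_KEY_TEXT.to_vec(),
                    value: b"cpu".to_vec(),
                },
                Tag {
                    key: b"host".to_vec(),
                    value: b"a".to_vec(),
                },
                Tag {
                    key: FIELD_TAG_KEY_TEXT.to_vec(),
                    value: b"temp".to_vec(),
                },
            ],
            data_type: DataType::Float as i32,
        });

        let mut schema = determine_tag_columns(&[frame]);
        let cpu_columns = schema.remove(&b"cpu".to_vec()).expect("cpu table present");
        let table = IntermediateTable::try_new(cpu_columns).expect("schema should build");

        // One tag column ("host") plus one field column ("temp"), all still empty.
        assert_eq!(table.max_rows(), 0);
    }
}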
#[cfg(test)]
mod test_super {
use arrow::util::pretty::pretty_format_batches;
use generated_types::read_response::{
BooleanPointsFrame, FloatPointsFrame, IntegerPointsFrame, SeriesFrame, StringPointsFrame,
UnsignedPointsFrame,
};
use super::*;
// converts a vector of key/value pairs into a vector of `Tag`.
fn make_tags(pairs: &[(&str, &str)]) -> Vec<Tag> {
pairs
.iter()
.map(|(key, value)| Tag {
key: key.as_bytes().to_vec(),
value: value.as_bytes().to_vec(),
})
.collect::<Vec<_>>()
}
struct TableColumnInput<'a> {
measurement: &'a str,
tags: &'a [&'a str],
fields: &'a [(&'a str, DataType)],
}
impl<'a> TableColumnInput<'a> {
fn new(measurement: &'a str, tags: &'a [&str], fields: &'a [(&str, DataType)]) -> Self {
Self {
measurement,
tags,
fields,
}
}
}
// converts measurement, tag and field column descriptions into a collection of
// `TableColumns` objects keyed by measurement name.
fn make_table_columns(input: &'_ [TableColumnInput<'_>]) -> BTreeMap<Vec<u8>, TableColumns> {
let mut all_table_columns = BTreeMap::new();
for TableColumnInput {
measurement,
tags,
fields,
} in input
{
let tag_columns = tags
.iter()
.map(|c| c.as_bytes().to_vec())
.collect::<Vec<Vec<u8>>>();
let mut tag_columns_set = BTreeSet::new();
for c in tag_columns {
tag_columns_set.insert(c);
}
let mut field_columns = BTreeMap::new();
for (field, data_type) in *fields {
field_columns.insert(field.as_bytes().to_vec(), *data_type);
}
let table_columns = TableColumns {
tag_columns: tag_columns_set,
field_columns,
};
all_table_columns.insert(measurement.as_bytes().to_vec(), table_columns);
}
all_table_columns
}
// generate a substantial set of frames across multiple tables.
fn gen_frames() -> Vec<Data> {
vec![
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "cpu"),
("host", "foo"),
("server", "a"),
("_field", "temp"),
]),
data_type: DataType::Float as i32,
}),
Data::FloatPoints(FloatPointsFrame {
timestamps: vec![1, 2, 3, 4],
values: vec![1.1, 2.2, 3.3, 4.4],
}),
Data::FloatPoints(FloatPointsFrame {
timestamps: vec![5, 6, 7, 10],
values: vec![5.1, 5.2, 5.3, 10.4],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "cpu"),
("host", "foo"),
("server", "a"),
("_field", "voltage"),
]),
data_type: DataType::Integer as i32,
}),
Data::IntegerPoints(IntegerPointsFrame {
timestamps: vec![1, 2],
values: vec![22, 22],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "cpu"),
("host", "foo"),
("new_column", "a"),
("_field", "voltage"),
]),
data_type: DataType::Integer as i32,
}),
Data::IntegerPoints(IntegerPointsFrame {
timestamps: vec![100, 200],
values: vec![1000, 2000],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[("_measurement", "another table"), ("_field", "voltage")]),
data_type: DataType::String as i32,
}),
Data::StringPoints(StringPointsFrame {
timestamps: vec![200, 201],
values: vec!["hello".to_string(), "abc".to_string()],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "another table"),
("region", "west"),
("_field", "voltage"),
]),
data_type: DataType::String as i32,
}),
Data::StringPoints(StringPointsFrame {
timestamps: vec![302, 304],
values: vec!["foo".to_string(), "bar".to_string()],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "another table"),
("region", "north"),
("_field", "bool_field"),
]),
data_type: DataType::Boolean as i32,
}),
Data::BooleanPoints(BooleanPointsFrame {
timestamps: vec![1000],
values: vec![true],
}),
Data::Series(SeriesFrame {
tags: make_tags(&[
("_measurement", "another table"),
("region", "south"),
("_field", "unsigned_field"),
]),
data_type: DataType::Unsigned as i32,
}),
Data::UnsignedPoints(UnsignedPointsFrame {
timestamps: vec![2000],
values: vec![600],
}),
]
}
#[test]
fn test_determine_tag_columns() {
assert!(determine_tag_columns(&[]).is_empty());
let frame = Data::Series(SeriesFrame {
tags: make_tags(&[("_measurement", "cpu"), ("server", "a"), ("_field", "temp")]),
data_type: DataType::Float as i32,
});
let exp = make_table_columns(&[TableColumnInput::new(
"cpu",
&["server"],
&[("temp", DataType::Float)],
)]);
assert_eq!(determine_tag_columns(&[frame]), exp);
// larger example
let frames = gen_frames();
let exp = make_table_columns(&[
TableColumnInput::new(
"cpu",
&["host", "new_column", "server"],
&[("temp", DataType::Float), ("voltage", DataType::Integer)],
),
TableColumnInput::new(
"another table",
&["region"],
&[
("bool_field", DataType::Boolean),
("unsigned_field", DataType::Unsigned),
("voltage", DataType::String),
],
),
]);
assert_eq!(determine_tag_columns(&frames), exp);
}
#[test]
fn test_frames_to_into_record_batches() {
let frames = gen_frames();
let rbs = frames_to_record_batches(&frames);
let exp = vec![
(
"another table",
vec![
"+--------+------------+----------------+---------+-------------------------------+",
"| region | bool_field | unsigned_field | voltage | time |",
"+--------+------------+----------------+---------+-------------------------------+",
"| | | | hello | 1970-01-01 00:00:00.000000200 |",
"| | | | abc | 1970-01-01 00:00:00.000000201 |",
"| west | | | foo | 1970-01-01 00:00:00.000000302 |",
"| west | | | bar | 1970-01-01 00:00:00.000000304 |",
"| north | true | | | 1970-01-01 00:00:00.000001 |",
"| south | | 600 | | 1970-01-01 00:00:00.000002 |",
"+--------+------------+----------------+---------+-------------------------------+",
],
),
(
"cpu",
vec![
"+------+------------+--------+------+---------+-------------------------------+",
"| host | new_column | server | temp | voltage | time |",
"+------+------------+--------+------+---------+-------------------------------+",
"| foo | | a | 1.1 | | 1970-01-01 00:00:00.000000001 |",
"| foo | | a | 2.2 | | 1970-01-01 00:00:00.000000002 |",
"| foo | | a | 3.3 | | 1970-01-01 00:00:00.000000003 |",
"| foo | | a | 4.4 | | 1970-01-01 00:00:00.000000004 |",
"| foo | | a | 5.1 | | 1970-01-01 00:00:00.000000005 |",
"| foo | | a | 5.2 | | 1970-01-01 00:00:00.000000006 |",
"| foo | | a | 5.3 | | 1970-01-01 00:00:00.000000007 |",
"| foo | | a | 10.4 | | 1970-01-01 00:00:00.000000010 |",
"| foo | | a | | 22 | 1970-01-01 00:00:00.000000001 |",
"| foo | | a | | 22 | 1970-01-01 00:00:00.000000002 |",
"| foo | a | | | 1000 | 1970-01-01 00:00:00.000000100 |",
"| foo | a | | | 2000 | 1970-01-01 00:00:00.000000200 |",
"+------+------------+--------+------+---------+-------------------------------+",
],
),
]
.into_iter()
.map(|(k, v)| (k.to_owned(), v.join("\n")))
.collect::<BTreeMap<String, String>>();
let got = rbs
.unwrap()
.into_iter()
.map(|(k, v)| {
let table: String = pretty_format_batches(&[v]).unwrap().to_string();
(k, table)
})
.collect::<BTreeMap<String, String>>();
assert_eq!(got, exp);
}
}

View File

@ -252,7 +252,7 @@ async fn query(
let db = server.db(&db_name)?;
let _query_completed_token = db.record_query("sql", &q);
let _query_completed_token = db.record_query("sql", Box::new(q.clone()));
let ctx = db.new_query_context(req.extensions().get().cloned());
let physical_plan = Planner::new(&ctx).sql(&q).await.context(PlanningSnafu)?;

View File

@ -172,7 +172,7 @@ impl Flight for FlightService {
.db(&database)
.map_err(default_server_error_handler)?;
let _query_completed_token = db.record_query("sql", &read_info.sql_query);
let _query_completed_token = db.record_query("sql", Box::new(read_info.sql_query.clone()));
let ctx = db.new_query_context(span_ctx);

View File

@ -26,9 +26,9 @@ use super::{TAG_KEY_FIELD, TAG_KEY_MEASUREMENT};
use observability_deps::tracing::warn;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::{
predicate::PredicateBuilder,
regex::regex_match_expr,
rpc_predicate::{FIELD_COLUMN_NAME, MEASUREMENT_COLUMN_NAME},
PredicateBuilder,
};
use query::group_by::{Aggregate as QueryAggregate, WindowDuration};
use snafu::{OptionExt, ResultExt, Snafu};
@ -867,7 +867,7 @@ fn format_comparison(v: i32, f: &mut fmt::Formatter<'_>) -> fmt::Result {
#[cfg(test)]
mod tests {
use generated_types::node::Type as RPCNodeType;
use predicate::{predicate::Predicate, rpc_predicate::QueryDatabaseMeta};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};
use std::{collections::BTreeSet, sync::Arc};
use super::*;

View File

@ -30,7 +30,7 @@ use query::{
fieldlist::FieldList, seriesset::converter::Error as SeriesSetError,
ExecutionContextProvider,
},
QueryDatabase,
QueryDatabase, QueryText,
};
use server::DatabaseStore;
@ -1303,31 +1303,29 @@ where
/// Return something which can be formatted as json ("pbjson"
/// specifically)
fn defer_json<S>(s: &S) -> impl Into<String> + '_
fn defer_json<S>(s: &S) -> QueryText
where
S: serde::Serialize,
S: serde::Serialize + Send + Sync + Clone + 'static,
{
/// Defers conversion into a String
struct DeferredToJson<'a, S>
struct DeferredToJson<S>
where
S: serde::Serialize,
{
s: &'a S,
s: S,
}
impl<S> From<DeferredToJson<'_, S>> for String
where
S: serde::Serialize,
{
fn from(w: DeferredToJson<'_, S>) -> Self {
match serde_json::to_string_pretty(&w.s) {
Ok(json) => json,
Err(e) => e.to_string(),
impl<S: serde::Serialize> std::fmt::Display for DeferredToJson<S> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// This buffering is unfortunate but `Formatter` doesn't implement `std::io::Write`
match serde_json::to_string_pretty(&self.s) {
Ok(s) => f.write_str(&s),
Err(e) => write!(f, "error formatting: {}", e),
}
}
}
DeferredToJson { s }
Box::new(DeferredToJson { s: s.clone() })
}
#[cfg(test)]
@ -1351,7 +1349,7 @@ mod tests {
Client as StorageClient, OrgAndBucket,
};
use panic_logging::SendPanicsToTracing;
use predicate::predicate::{PredicateBuilder, PredicateMatch};
use predicate::{PredicateBuilder, PredicateMatch};
use query::{
exec::Executor,
test::{TestChunk, TestDatabase, TestError},
@ -2971,7 +2969,7 @@ mod tests {
db_name: &str,
partition_key: &str,
chunk_id: u128,
expected_predicate: &predicate::predicate::Predicate,
expected_predicate: &predicate::Predicate,
) {
let actual_predicates = self
.test_storage

View File

@ -264,7 +264,8 @@ fn main() -> Result<(), std::io::Error> {
}
Command::Storage(config) => {
let _tracing_guard = handle_init_logs(init_simple_logs(log_verbose_count));
if let Err(e) = commands::storage::command(config).await {
let connection = connection().await;
if let Err(e) = commands::storage::command(connection, config).await {
eprintln!("{}", e);
std::process::exit(ReturnCode::Failure as _)
}

View File

@ -162,7 +162,7 @@ pub async fn test_delete_on_router() {
let fixture = ServerFixture::create_shared(ServerType::Router).await;
let db_name = rand_name();
let (_tmpdir, mut write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await;
let (_tmpdir, write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await;
let table = "cpu";
let pred = DeletePredicate {
@ -179,8 +179,10 @@ pub async fn test_delete_on_router() {
.await
.expect("cannot delete");
let mut stream = write_buffer.streams().into_values().next().unwrap();
let delete_actual = stream.stream.next().await.unwrap().unwrap();
let sequencer_id = write_buffer.sequencer_ids().into_iter().next().unwrap();
let mut handler = write_buffer.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
let delete_actual = stream.next().await.unwrap().unwrap();
let delete_expected = DmlDelete::new(
&db_name,
pred,

View File

@ -45,7 +45,7 @@ pub async fn test_write_pb_router() {
let fixture = ServerFixture::create_shared(ServerType::Router).await;
let db_name = rand_name();
let (_tmpdir, mut write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await;
let (_tmpdir, write_buffer) = create_router_to_write_buffer(&fixture, &db_name).await;
fixture
.write_client()
@ -53,8 +53,10 @@ pub async fn test_write_pb_router() {
.await
.expect("cannot write");
let mut stream = write_buffer.streams().into_values().next().unwrap();
let write_actual = stream.stream.next().await.unwrap().unwrap();
let sequencer_id = write_buffer.sequencer_ids().into_iter().next().unwrap();
let mut handler = write_buffer.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
let write_actual = stream.next().await.unwrap().unwrap();
let write_expected = DmlWrite::new(
&db_name,
lines_to_batches("mytable mycol1=5 3", 0).unwrap(),

View File

@ -7,6 +7,7 @@ edition = "2021"
[dependencies]
arrow = { version = "8.0", features = ["prettyprint"] }
arrow_util = { path = "../arrow_util" }
async-trait = "0.1.42"
base64 = "0.13"
bytes = "1.0"
datafusion = { path = "../datafusion" }

View File

@ -3,6 +3,7 @@
use arrow::record_batch::RecordBatch;
use data_types::delete_predicate::DeletePredicate;
use async_trait::async_trait;
use chrono::{format::StrftimeItems, TimeZone, Utc};
use dml::DmlOperation;
use iox_catalog::interface::{
@ -100,6 +101,25 @@ impl IngesterData {
}
}
/// The Persister has a single function that will persist a given partition Id. It is expected
/// that the persist function will retry forever until it succeeds.
#[async_trait]
pub(crate) trait Persister: Send + Sync + 'static {
async fn persist(&self, partition_id: PartitionId);
}
#[async_trait]
impl Persister for IngesterData {
async fn persist(&self, _partition_id: PartitionId) {
// lookup the TableData
// let persisting_batch = table_data.create_persisting_batch(partition.partition_key);
// do the persist with this persisting batch
// update the catalog
// table_data.clear_persisting_batch() (behind the scenes this will remove the persisting batch
// and if the partition is empty, remove it from the map in table_data)
}
}
/// Data of a Shard
#[derive(Default)]
pub struct SequencerData {
@ -144,12 +164,15 @@ impl SequencerData {
namespace: &str,
catalog: &dyn Catalog,
) -> Result<Arc<NamespaceData>> {
let namespace = catalog
let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?;
let namespace = txn
.namespaces()
.get_by_name(namespace)
.await
.context(CatalogSnafu)?
.context(NamespaceNotFoundSnafu { namespace })?;
txn.commit().await.context(CatalogSnafu)?;
let mut n = self.namespaces.write();
let data = Arc::clone(
n.entry(namespace.name)
@ -230,11 +253,14 @@ impl NamespaceData {
table_name: &str,
catalog: &dyn Catalog,
) -> Result<Arc<TableData>> {
let table = catalog
let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?;
let table = txn
.tables()
.create_or_get(table_name, self.namespace_id)
.await
.context(CatalogSnafu)?;
txn.commit().await.context(CatalogSnafu)?;
let mut t = self.tables.write();
let data = Arc::clone(
t.entry(table.name)
@ -306,7 +332,8 @@ impl TableData {
let min_time = Timestamp::new(predicate.range.start());
let max_time = Timestamp::new(predicate.range.end());
let tombstone = catalog
let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?;
let tombstone = txn
.tombstones()
.create_or_get(
self.table_id,
@ -318,6 +345,7 @@ impl TableData {
)
.await
.context(CatalogSnafu)?;
txn.commit().await.context(CatalogSnafu)?;
let partitions = self.partition_data.read();
for data in partitions.values() {
@ -339,11 +367,13 @@ impl TableData {
sequencer_id: SequencerId,
catalog: &dyn Catalog,
) -> Result<Arc<PartitionData>> {
let partition = catalog
let mut txn = catalog.start_transaction().await.context(CatalogSnafu)?;
let partition = txn
.partitions()
.create_or_get(partition_key, sequencer_id, self.table_id)
.await
.context(CatalogSnafu)?;
txn.commit().await.context(CatalogSnafu)?;
let mut p = self.partition_data.write();
let data = Arc::new(PartitionData::new(partition.id));
p.insert(partition.partition_key, Arc::clone(&data));

View File

@ -3,21 +3,24 @@
use iox_catalog::interface::{Catalog, KafkaPartition, KafkaTopic, Sequencer, SequencerId};
use object_store::ObjectStore;
use crate::data::{IngesterData, SequencerData};
use crate::{
data::{IngesterData, SequencerData},
lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager},
};
use db::write_buffer::metrics::{SequencerMetrics, WriteBufferIngestMetrics};
use dml::DmlOperation;
use futures::{stream::BoxStream, StreamExt};
use futures::StreamExt;
use observability_deps::tracing::{debug, warn};
use snafu::Snafu;
use snafu::{ResultExt, Snafu};
use std::collections::BTreeMap;
use std::{
fmt::Formatter,
sync::Arc,
time::{Duration, Instant},
};
use time::SystemProvider;
use tokio::task::JoinHandle;
use trace::span::SpanRecorder;
use write_buffer::core::{FetchHighWatermark, WriteBufferError, WriteBufferReading};
use write_buffer::core::{WriteBufferReading, WriteBufferStreamHandler};
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
@ -31,6 +34,11 @@ pub enum Error {
kafka_topic: String,
kafka_partition: KafkaPartition,
},
#[snafu(display("Write buffer error: {}", source))]
WriteBuffer {
source: write_buffer::core::WriteBufferError,
},
}
/// A specialized `Error` for Catalog errors
@ -45,11 +53,11 @@ pub struct IngestHandlerImpl {
#[allow(dead_code)]
kafka_topic: KafkaTopic,
/// Future that resolves when the background worker exits
#[allow(dead_code)]
join_handles: Vec<JoinHandle<()>>,
/// The cache and buffered data for the ingester
#[allow(dead_code)]
data: Arc<IngesterData>,
/// The lifecycle manager, keeping state of partitions across all sequencers
lifecycle_manager: Arc<LifecycleManager>,
}
impl std::fmt::Debug for IngestHandlerImpl {
@ -60,14 +68,15 @@ impl std::fmt::Debug for IngestHandlerImpl {
impl IngestHandlerImpl {
/// Initialize the Ingester
pub fn new(
pub async fn new(
lifecycle_config: LifecycleConfig,
topic: KafkaTopic,
mut sequencer_states: BTreeMap<KafkaPartition, Sequencer>,
sequencer_states: BTreeMap<KafkaPartition, Sequencer>,
catalog: Arc<dyn Catalog>,
object_store: Arc<ObjectStore>,
write_buffer: Box<dyn WriteBufferReading>,
write_buffer: Arc<dyn WriteBufferReading>,
registry: &metric::Registry,
) -> Self {
) -> Result<Self> {
// build the initial ingester data state
let mut sequencers = BTreeMap::new();
for s in sequencer_states.values() {
@ -83,40 +92,46 @@ impl IngestHandlerImpl {
let kafka_topic_name = topic.name.clone();
let ingest_metrics = WriteBufferIngestMetrics::new(registry, &topic.name);
let write_buffer: &'static mut _ = Box::leak(write_buffer);
let join_handles: Vec<_> = write_buffer
.streams()
.into_iter()
.filter_map(|(kafka_partition_id, stream)| {
// streams may return a stream for every partition in the kafka topic. We only want
// to process streams for those specified by the call to new.
let kafka_partition = KafkaPartition::new(kafka_partition_id as i32);
sequencer_states.remove(&kafka_partition).map(|sequencer| {
let metrics = ingest_metrics.new_sequencer_metrics(kafka_partition_id);
let mut join_handles = Vec::with_capacity(sequencer_states.len());
for (kafka_partition, sequencer) in sequencer_states {
let metrics = ingest_metrics.new_sequencer_metrics(kafka_partition.get() as u32);
let ingester_data = Arc::clone(&ingester_data);
let kafka_topic_name = kafka_topic_name.clone();
tokio::task::spawn(async move {
stream_in_sequenced_entries(
let stream_handler = write_buffer
.stream_handler(kafka_partition.get() as u32)
.await
.context(WriteBufferSnafu)?;
join_handles.push(tokio::task::spawn(stream_in_sequenced_entries(
ingester_data,
sequencer.id,
kafka_topic_name,
kafka_partition,
stream.stream,
stream.fetch_high_watermark,
Arc::clone(&write_buffer),
stream_handler,
metrics,
)
.await;
})
})
})
.collect();
)));
}
Self {
// start the lifecycle manager
let persister = Arc::clone(&data);
let lifecycle_manager = Arc::new(LifecycleManager::new(
lifecycle_config,
Arc::new(SystemProvider::new()),
));
let manager = Arc::clone(&lifecycle_manager);
let handle = tokio::task::spawn(async move {
run_lifecycle_manager(manager, persister).await;
});
join_handles.push(handle);
Ok(Self {
data,
kafka_topic: topic,
join_handles,
}
lifecycle_manager,
})
}
}
@ -135,17 +150,18 @@ impl Drop for IngestHandlerImpl {
///
/// Note all errors reading / parsing / writing entries from the write
/// buffer are ignored.
async fn stream_in_sequenced_entries<'a>(
async fn stream_in_sequenced_entries(
ingester_data: Arc<IngesterData>,
sequencer_id: SequencerId,
kafka_topic: String,
kafka_partition: KafkaPartition,
mut stream: BoxStream<'a, Result<DmlOperation, WriteBufferError>>,
f_mark: FetchHighWatermark<'a>,
write_buffer: Arc<dyn WriteBufferReading>,
mut write_buffer_stream: Box<dyn WriteBufferStreamHandler>,
mut metrics: SequencerMetrics,
) {
let mut watermark_last_updated: Option<Instant> = None;
let mut watermark = 0_u64;
let mut stream = write_buffer_stream.stream();
while let Some(db_write_result) = stream.next().await {
// maybe update sequencer watermark
@ -156,7 +172,10 @@ async fn stream_in_sequenced_entries<'a>(
.map(|ts| now.duration_since(ts) > Duration::from_secs(10))
.unwrap_or(true)
{
match f_mark().await {
match write_buffer
.fetch_high_watermark(sequencer_id.get() as u32)
.await
{
Ok(w) => {
watermark = w;
}
@ -233,34 +252,28 @@ mod tests {
use iox_catalog::validate_or_insert_schema;
use metric::{Attributes, Metric, U64Counter, U64Gauge};
use mutable_batch_lp::lines_to_batches;
use std::num::NonZeroU32;
use std::{num::NonZeroU32, ops::DerefMut};
use time::Time;
use write_buffer::mock::{MockBufferForReading, MockBufferSharedState};
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
let catalog = MemCatalog::new();
let kafka_topic = catalog
.kafka_topics()
.create_or_get("whatevs")
.await
.unwrap();
let query_pool = catalog
.query_pools()
.create_or_get("whatevs")
.await
.unwrap();
let mut txn = catalog.start_transaction().await.unwrap();
let kafka_topic = txn.kafka_topics().create_or_get("whatevs").await.unwrap();
let query_pool = txn.query_pools().create_or_get("whatevs").await.unwrap();
let kafka_partition = KafkaPartition::new(0);
let namespace = catalog
let namespace = txn
.namespaces()
.create("foo", "inf", kafka_topic.id, query_pool.id)
.await
.unwrap();
let sequencer = catalog
let sequencer = txn
.sequencers()
.create_or_get(&kafka_topic, kafka_partition)
.await
.unwrap();
let mut sequencer_states = BTreeMap::new();
sequencer_states.insert(kafka_partition, sequencer);
@ -275,7 +288,7 @@ mod tests {
lines_to_batches("mem foo=1 10", 0).unwrap(),
DmlMeta::sequenced(Sequence::new(0, 0), ingest_ts1, None, 50),
);
let schema = validate_or_insert_schema(w1.tables(), &schema, &catalog)
let schema = validate_or_insert_schema(w1.tables(), &schema, txn.deref_mut())
.await
.unwrap()
.unwrap();
@ -285,23 +298,29 @@ mod tests {
lines_to_batches("cpu bar=2 20\ncpu bar=3 30", 0).unwrap(),
DmlMeta::sequenced(Sequence::new(0, 7), ingest_ts2, None, 150),
);
let _schema = validate_or_insert_schema(w2.tables(), &schema, &catalog)
let _schema = validate_or_insert_schema(w2.tables(), &schema, txn.deref_mut())
.await
.unwrap()
.unwrap();
txn.commit().await.unwrap();
write_buffer_state.push_write(w2);
let reading = Box::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
let reading: Arc<dyn WriteBufferReading> =
Arc::new(MockBufferForReading::new(write_buffer_state, None).unwrap());
let object_store = Arc::new(ObjectStore::new_in_memory());
let metrics: Arc<metric::Registry> = Default::default();
let lifecycle_config = LifecycleConfig::new(1000000, 1000, 1000, Duration::from_secs(10));
let ingester = IngestHandlerImpl::new(
lifecycle_config,
kafka_topic,
sequencer_states,
Arc::new(catalog),
object_store,
reading,
&metrics,
);
)
.await
.unwrap();
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there from both writes.

View File

@ -16,6 +16,7 @@
pub mod compact;
pub mod data;
pub mod handler;
pub mod lifecycle;
pub mod persist;
pub mod query;
pub mod server;

ingester/src/lifecycle.rs (new file, 475 lines)
View File

@ -0,0 +1,475 @@
//! Manages the persistence and eviction lifecycle of data in the buffer across all sequencers.
//! Note that the byte counts logged by the lifecycle manager, and exactly when persistence gets
//! triggered, aren't required to be absolutely accurate. The byte count is just an estimate
//! anyway; the manager only needs to keep things moving along so that total memory use stays
//! roughly under some absolute number and the individual Parquet files that get persisted stay
//! below some size. It is expected that they may be above or below the absolute thresholds.
use crate::data::Persister;
use iox_catalog::interface::PartitionId;
use parking_lot::Mutex;
use std::collections::BTreeMap;
use std::sync::Arc;
use std::time::Duration;
use time::{Time, TimeProvider};
/// The lifecycle manager keeps track of the size and age of partitions across all sequencers.
/// It triggers persistence based on keeping total memory usage around a set amount while
/// ensuring that partitions don't get too old or large before being persisted.
pub(crate) struct LifecycleManager {
config: LifecycleConfig,
time_provider: Arc<dyn TimeProvider>,
state: Mutex<LifecycleState>,
persist_running: tokio::sync::Mutex<()>,
}
/// The configuration options for the lifecycle on the ingester.
#[derive(Debug, Clone, Copy)]
pub struct LifecycleConfig {
/// The ingester will pause pulling data from Kafka if it hits this amount of memory used, waiting
/// until persistence evicts partitions from memory.
pause_ingest_size: usize,
/// When the ingester hits this threshold, the lifecycle manager will persist the largest
/// partitions currently buffered until it falls below this threshold. An ingester running
/// in a steady state should operate around this amount of memory usage.
persist_memory_threshold: usize,
/// If an individual partition crosses this threshold, it will be persisted. The purpose of this
/// setting is to ensure the ingester doesn't create Parquet files that are too large.
partition_size_threshold: usize,
/// If an individual partition has had data buffered for longer than this period of time, the
/// manager will persist it. This setting is to ensure we have an upper bound on how far back
/// we will need to read in Kafka on restart or recovery.
partition_age_threshold: Duration,
}
impl LifecycleConfig {
/// Initialize a new LifecycleConfig. Panics if the passed `pause_ingest_size` is not greater
/// than the `persist_memory_threshold`.
pub fn new(
pause_ingest_size: usize,
persist_memory_threshold: usize,
partition_size_threshold: usize,
partition_age_threshold: Duration,
) -> Self {
// this must be true to ensure that persistence will get triggered, freeing up memory
assert!(pause_ingest_size > persist_memory_threshold);
Self {
pause_ingest_size,
persist_memory_threshold,
partition_size_threshold,
partition_age_threshold,
}
}
}
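// A short sketch (not part of the original change, with illustrative numbers
// only) of constructing a config: `pause_ingest_size` must stay above
// `persist_memory_threshold`, otherwise `new` panics.
#[cfg(test)]
mod lifecycle_config_example {
    use super::*;

    #[test]
    fn thresholds_are_ordered() {
        let config = LifecycleConfig::new(
            100 * 1024 * 1024,            // pause ingest at ~100 MiB buffered
            50 * 1024 * 1024,             // aim for ~50 MiB of steady-state memory
            10 * 1024 * 1024,             // persist any partition above ~10 MiB
            Duration::from_secs(30 * 60), // or once it has buffered data for ~30 minutes
        );
        assert!(config.pause_ingest_size > config.persist_memory_threshold);
    }
}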
#[derive(Default, Debug)]
struct LifecycleState {
total_bytes: usize,
partition_stats: BTreeMap<PartitionId, PartitionLifecycleStats>,
}
impl LifecycleState {
fn remove(&mut self, partition_id: &PartitionId) -> Option<PartitionLifecycleStats> {
self.partition_stats.remove(partition_id).map(|stats| {
self.total_bytes -= stats.bytes_written;
stats
})
}
}
/// A snapshot of the stats for the lifecycle manager
#[derive(Debug)]
pub struct LifecycleStats {
/// total number of bytes the lifecycle manager is aware of across all sequencers and
/// partitions. Based on the mutable batch sizes received into all partitions.
pub total_bytes: usize,
/// the stats for every partition the lifecycle manager is tracking.
pub partition_stats: Vec<PartitionLifecycleStats>,
}
/// The stats for a partition
#[derive(Debug, Clone, Copy)]
pub struct PartitionLifecycleStats {
/// The partition identifier
partition_id: PartitionId,
/// Time that the partition received its first write. This is reset anytime
/// the partition is persisted.
first_write: Time,
/// The number of bytes in the partition as estimated by the mutable batch sizes.
bytes_written: usize,
}
impl LifecycleManager {
/// Initialize a new lifecycle manager that will persist when `maybe_persist` is called
/// if anything is over the size or age threshold.
pub(crate) fn new(config: LifecycleConfig, time_provider: Arc<dyn TimeProvider>) -> Self {
Self {
config,
time_provider,
state: Default::default(),
persist_running: Default::default(),
}
}
/// Logs bytes written into a partition so that it can be tracked for the manager to
/// trigger persistence. Returns true if the ingester should pause consuming from the
/// write buffer so that persistence can catch up and free up memory.
pub fn log_write(&self, partition_id: PartitionId, bytes_written: usize) -> bool {
let mut s = self.state.lock();
s.partition_stats
.entry(partition_id)
.or_insert_with(|| PartitionLifecycleStats {
partition_id,
first_write: self.time_provider.now(),
bytes_written: 0,
})
.bytes_written += bytes_written;
s.total_bytes += bytes_written;
s.total_bytes > self.config.pause_ingest_size
}
/// Returns true if the `total_bytes` tracked by the manager is less than the pause amount.
/// As persistence runs, the `total_bytes` go down.
pub fn can_resume_ingest(&self) -> bool {
let s = self.state.lock();
s.total_bytes < self.config.pause_ingest_size
}
/// This will persist any partitions that are over their size or age thresholds and
/// persist as many partitions as necessary (largest first) to get below the memory threshold.
/// The persist operations are spawned in new tasks and run at the same time, but the
/// function waits for all to return before completing.
pub async fn maybe_persist<P: Persister>(&self, persister: &Arc<P>) {
// ensure that this is only running one at a time
self.persist_running.lock().await;
let LifecycleStats {
mut total_bytes,
partition_stats,
} = self.stats();
// get anything over the threshold size or age to persist
let now = self.time_provider.now();
let (to_persist, mut rest): (Vec<PartitionLifecycleStats>, Vec<PartitionLifecycleStats>) =
partition_stats.into_iter().partition(|s| {
let aged_out = now
.checked_duration_since(s.first_write)
.map(|age| age > self.config.partition_age_threshold)
.unwrap_or(false);
let sized_out = s.bytes_written > self.config.partition_size_threshold;
aged_out || sized_out
});
let mut persist_tasks: Vec<_> = to_persist
.into_iter()
.map(|s| {
let bytes_removed = self
.remove(s.partition_id)
.map(|s| s.bytes_written)
.unwrap_or(0);
total_bytes -= bytes_removed;
let persister = Arc::clone(persister);
tokio::task::spawn(async move {
persister.persist(s.partition_id).await;
})
})
.collect();
// if we're still over the memory threshold, persist as many of the largest partitions
// until we're under. It's ok if this is stale, it'll just get handled on the next pass
// through.
if total_bytes > self.config.persist_memory_threshold {
let mut to_persist = vec![];
rest.sort_by(|a, b| b.bytes_written.cmp(&a.bytes_written));
for s in rest {
total_bytes -= s.bytes_written;
to_persist.push(s);
if total_bytes < self.config.persist_memory_threshold {
break;
}
}
let mut to_persist: Vec<_> = to_persist
.into_iter()
.map(|s| {
self.remove(s.partition_id);
let persister = Arc::clone(persister);
tokio::task::spawn(async move {
persister.persist(s.partition_id).await;
})
})
.collect();
persist_tasks.append(&mut to_persist);
}
let persists = futures::future::join_all(persist_tasks.into_iter());
persists.await;
}
/// Returns a point in time snapshot of the lifecycle state.
pub fn stats(&self) -> LifecycleStats {
let s = self.state.lock();
let partition_stats: Vec<_> = s.partition_stats.values().cloned().collect();
LifecycleStats {
total_bytes: s.total_bytes,
partition_stats,
}
}
/// Removes the partition from the state
pub fn remove(&self, partition_id: PartitionId) -> Option<PartitionLifecycleStats> {
let mut s = self.state.lock();
s.remove(&partition_id)
}
}
const CHECK_INTERVAL: Duration = Duration::from_secs(1);
/// Runs the lifecycle manager to trigger persistence every second.
pub(crate) async fn run_lifecycle_manager<P: Persister>(
manager: Arc<LifecycleManager>,
persister: Arc<P>,
) {
loop {
manager.maybe_persist(&persister).await;
tokio::time::sleep(CHECK_INTERVAL).await;
}
}
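// A minimal sketch (not part of the original change) of wiring the manager into
// a background task, mirroring what the ingest handler does; `NoopPersister` is
// a hypothetical stand-in for the real `IngesterData` persister.
#[cfg(test)]
mod run_manager_example {
    use super::*;
    use async_trait::async_trait;
    use time::SystemProvider;

    struct NoopPersister;

    #[async_trait]
    impl Persister for NoopPersister {
        async fn persist(&self, _partition_id: PartitionId) {}
    }

    #[tokio::test]
    async fn spawn_background_loop() {
        let config = LifecycleConfig::new(20, 10, 5, Duration::from_secs(1));
        let manager = Arc::new(LifecycleManager::new(
            config,
            Arc::new(SystemProvider::new()),
        ));
        let persister = Arc::new(NoopPersister);

        // The loop only ever persists and sleeps, so spawn it and abort it once
        // we no longer need it.
        let handle = tokio::task::spawn(run_lifecycle_manager(manager, persister));
        handle.abort();
    }
}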
#[cfg(test)]
mod tests {
use super::*;
use async_trait::async_trait;
use std::collections::BTreeSet;
use time::{MockProvider, SystemProvider};
#[derive(Default)]
struct TestPersister {
persist_called: Mutex<BTreeSet<PartitionId>>,
}
#[async_trait]
impl Persister for TestPersister {
async fn persist(&self, partition_id: PartitionId) {
let mut p = self.persist_called.lock();
p.insert(partition_id);
}
}
impl TestPersister {
fn persist_called_for(&self, partition_id: PartitionId) -> bool {
let p = self.persist_called.lock();
p.contains(&partition_id)
}
}
#[test]
fn logs_write() {
let config = LifecycleConfig {
pause_ingest_size: 20,
persist_memory_threshold: 10,
partition_size_threshold: 5,
partition_age_threshold: Duration::from_nanos(0),
};
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let tp = Arc::clone(&time_provider);
let m = LifecycleManager::new(config, tp);
// log first two writes at different times
assert!(!m.log_write(PartitionId::new(1), 1));
time_provider.inc(Duration::from_nanos(10));
assert!(!m.log_write(PartitionId::new(1), 1));
// log another write for different partition
assert!(!m.log_write(PartitionId::new(2), 3));
let stats = m.stats();
assert_eq!(stats.total_bytes, 5);
let p1 = stats.partition_stats.get(0).unwrap();
assert_eq!(p1.bytes_written, 2);
assert_eq!(p1.partition_id, PartitionId::new(1));
assert_eq!(p1.first_write, Time::from_timestamp_nanos(0));
let p2 = stats.partition_stats.get(1).unwrap();
assert_eq!(p2.bytes_written, 3);
assert_eq!(p2.partition_id, PartitionId::new(2));
assert_eq!(p2.first_write, Time::from_timestamp_nanos(10));
}
#[test]
fn pausing_and_resuming_ingest() {
let config = LifecycleConfig {
pause_ingest_size: 20,
persist_memory_threshold: 10,
partition_size_threshold: 5,
partition_age_threshold: Duration::from_nanos(0),
};
let time_provider = Arc::new(SystemProvider::new());
let m = LifecycleManager::new(config, time_provider);
assert!(!m.log_write(PartitionId::new(1), 15));
// now it should indicate a pause
assert!(m.log_write(PartitionId::new(1), 10));
assert!(!m.can_resume_ingest());
m.remove(PartitionId::new(1));
assert!(m.can_resume_ingest());
assert!(!m.log_write(PartitionId::new(1), 3));
}
#[tokio::test]
async fn persists_based_on_age() {
let config = LifecycleConfig {
pause_ingest_size: 30,
persist_memory_threshold: 20,
partition_size_threshold: 10,
partition_age_threshold: Duration::from_nanos(5),
};
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let tp = Arc::clone(&time_provider);
let m = LifecycleManager::new(config, tp);
let partition_id = PartitionId::new(1);
let persister = Arc::new(TestPersister::default());
m.log_write(partition_id, 10);
m.maybe_persist(&persister).await;
let stats = m.stats();
assert_eq!(stats.total_bytes, 10);
assert_eq!(stats.partition_stats[0].partition_id, partition_id);
// age out the partition
time_provider.inc(Duration::from_nanos(6));
// validate that from before, persist wasn't called for the partition
assert!(!persister.persist_called_for(partition_id));
// write in data for a new partition so we can be sure it isn't persisted, but the older one is
m.log_write(PartitionId::new(2), 6);
m.maybe_persist(&persister).await;
assert!(persister.persist_called_for(partition_id));
assert!(!persister.persist_called_for(PartitionId::new(2)));
let stats = m.stats();
assert_eq!(stats.total_bytes, 6);
assert_eq!(stats.partition_stats.len(), 1);
assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(2));
}
#[tokio::test]
async fn persists_based_on_partition_size() {
let config = LifecycleConfig {
pause_ingest_size: 30,
persist_memory_threshold: 20,
partition_size_threshold: 5,
partition_age_threshold: Duration::from_millis(100),
};
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let m = LifecycleManager::new(config, time_provider);
let partition_id = PartitionId::new(1);
let persister = Arc::new(TestPersister::default());
m.log_write(partition_id, 4);
m.maybe_persist(&persister).await;
let stats = m.stats();
assert_eq!(stats.total_bytes, 4);
assert_eq!(stats.partition_stats[0].partition_id, partition_id);
assert!(!persister.persist_called_for(partition_id));
// introduce a new partition under the limit to verify it doesn't get taken with the other
m.log_write(PartitionId::new(2), 3);
m.log_write(partition_id, 5);
m.maybe_persist(&persister).await;
assert!(persister.persist_called_for(partition_id));
assert!(!persister.persist_called_for(PartitionId::new(2)));
let stats = m.stats();
assert_eq!(stats.total_bytes, 3);
assert_eq!(stats.partition_stats.len(), 1);
assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(2));
}
#[tokio::test]
async fn persists_based_on_memory_size() {
let config = LifecycleConfig {
pause_ingest_size: 60,
persist_memory_threshold: 20,
partition_size_threshold: 20,
partition_age_threshold: Duration::from_millis(1000),
};
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let m = LifecycleManager::new(config, time_provider);
let partition_id = PartitionId::new(1);
let persister = Arc::new(TestPersister::default());
m.log_write(partition_id, 8);
m.log_write(PartitionId::new(2), 13);
m.maybe_persist(&persister).await;
// the bigger of the two partitions should have been persisted, leaving the smaller behind
let stats = m.stats();
assert_eq!(stats.total_bytes, 8);
assert_eq!(stats.partition_stats[0].partition_id, partition_id);
assert!(!persister.persist_called_for(partition_id));
assert!(persister.persist_called_for(PartitionId::new(2)));
// add that partition back in over size
m.log_write(partition_id, 20);
m.log_write(PartitionId::new(2), 21);
// both partitions should now need to be persisted to bring us below the mem threshold of 20.
m.maybe_persist(&persister).await;
assert!(persister.persist_called_for(partition_id));
assert!(persister.persist_called_for(PartitionId::new(2)));
let stats = m.stats();
assert_eq!(stats.total_bytes, 0);
assert_eq!(stats.partition_stats.len(), 0);
}
#[tokio::test]
async fn persist_based_on_partition_and_memory_size() {
let config = LifecycleConfig {
pause_ingest_size: 60,
persist_memory_threshold: 6,
partition_size_threshold: 5,
partition_age_threshold: Duration::from_millis(1000),
};
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
let tp = Arc::clone(&time_provider);
let m = LifecycleManager::new(config, tp);
let persister = Arc::new(TestPersister::default());
m.log_write(PartitionId::new(1), 4);
time_provider.inc(Duration::from_nanos(1));
m.log_write(PartitionId::new(2), 6);
time_provider.inc(Duration::from_nanos(1));
m.log_write(PartitionId::new(3), 3);
m.maybe_persist(&persister).await;
// partition 2 is over the per-partition size limit and partition 1 is the next largest,
// so both get persisted, leaving only partition 3 behind
let stats = m.stats();
assert_eq!(stats.total_bytes, 3);
assert_eq!(stats.partition_stats[0].partition_id, PartitionId::new(3));
assert!(!persister.persist_called_for(PartitionId::new(3)));
assert!(persister.persist_called_for(PartitionId::new(2)));
assert!(persister.persist_called_for(PartitionId::new(1)));
}
}

View File

@ -15,10 +15,7 @@ use datafusion::physical_plan::{
SendableRecordBatchStream,
};
use iox_catalog::interface::{SequenceNumber, Tombstone};
use predicate::{
delete_predicate::parse_delete_predicate,
predicate::{Predicate, PredicateMatch},
};
use predicate::{delete_predicate::parse_delete_predicate, Predicate, PredicateMatch};
use query::{exec::stringset::StringSet, QueryChunk, QueryChunkMeta};
use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};

View File

@ -22,9 +22,13 @@ pub struct IngesterServer<I: IngestHandler> {
impl<I: IngestHandler> IngesterServer<I> {
/// Initialise a new [`IngesterServer`] using the provided HTTP and gRPC
/// handlers.
pub fn new(http: HttpDelegate<I>, grpc: GrpcDelegate<I>) -> Self {
pub fn new(
metrics: Arc<metric::Registry>,
http: HttpDelegate<I>,
grpc: GrpcDelegate<I>,
) -> Self {
Self {
metrics: Default::default(),
metrics,
http,
grpc,
}

View File

@ -22,5 +22,6 @@ dotenv = "0.15.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.6"
pretty_assertions = "1.0.0"
test_helpers = { path = "../test_helpers" }
[features]

File diff suppressed because it is too large

View File

@ -12,11 +12,11 @@
)]
use crate::interface::{
Catalog, ColumnType, Error, KafkaPartition, KafkaTopic, NamespaceSchema, QueryPool, Result,
Sequencer, SequencerId, TableSchema,
ColumnType, Error, KafkaPartition, KafkaTopic, NamespaceSchema, QueryPool, Result, Sequencer,
SequencerId, TableSchema, Transaction,
};
use futures::{stream::FuturesOrdered, StreamExt};
use interface::{ParquetFile, ProcessedTombstone, Tombstone};
use mutable_batch::MutableBatch;
use std::{borrow::Cow, collections::BTreeMap};
@ -43,7 +43,7 @@ pub mod postgres;
pub async fn validate_or_insert_schema<'a, T, U>(
tables: T,
schema: &NamespaceSchema,
catalog: &dyn Catalog,
txn: &mut dyn Transaction,
) -> Result<Option<NamespaceSchema>>
where
T: IntoIterator<IntoIter = U, Item = (&'a str, &'a MutableBatch)> + Send + Sync,
@ -55,7 +55,7 @@ where
let mut schema = Cow::Borrowed(schema);
for (table_name, batch) in tables {
validate_mutable_batch(batch, table_name, &mut schema, catalog).await?;
validate_mutable_batch(batch, table_name, &mut schema, txn).await?;
}
match schema {
@ -68,7 +68,7 @@ async fn validate_mutable_batch(
mb: &MutableBatch,
table_name: &str,
schema: &mut Cow<'_, NamespaceSchema>,
catalog: &dyn Catalog,
txn: &mut dyn Transaction,
) -> Result<()> {
// Check if the table exists in the schema.
//
@ -81,14 +81,14 @@ async fn validate_mutable_batch(
//
// Attempt to create the table in the catalog, or load an existing
// table from the catalog to populate the cache.
let mut table = catalog
let mut table = txn
.tables()
.create_or_get(table_name, schema.id)
.await
.map(|t| TableSchema::new(t.id))?;
// Always add a time column to all new tables.
let time_col = catalog
let time_col = txn
.columns()
.create_or_get(TIME_COLUMN, table.id, ColumnType::Time)
.await?;
@ -134,7 +134,7 @@ async fn validate_mutable_batch(
None => {
// The column does not exist in the cache, create/get it from
// the catalog, and add it to the table.
let column = catalog
let column = txn
.columns()
.create_or_get(name.as_str(), table.id, ColumnType::from(col.influx_type()))
.await?;
@ -161,34 +161,53 @@ async fn validate_mutable_batch(
/// each of the partitions.
pub async fn create_or_get_default_records(
kafka_partition_count: i32,
catalog: &dyn Catalog,
txn: &mut dyn Transaction,
) -> Result<(KafkaTopic, QueryPool, BTreeMap<SequencerId, Sequencer>)> {
let kafka_topic = catalog
.kafka_topics()
.create_or_get(SHARED_KAFKA_TOPIC)
.await?;
let query_pool = catalog
.query_pools()
.create_or_get(SHARED_QUERY_POOL)
.await?;
let kafka_topic = txn.kafka_topics().create_or_get(SHARED_KAFKA_TOPIC).await?;
let query_pool = txn.query_pools().create_or_get(SHARED_QUERY_POOL).await?;
let sequencers = (1..=kafka_partition_count)
.map(|partition| {
catalog
let mut sequencers = BTreeMap::new();
for partition in 1..=kafka_partition_count {
let sequencer = txn
.sequencers()
.create_or_get(&kafka_topic, KafkaPartition::new(partition))
})
.collect::<FuturesOrdered<_>>()
.map(|v| {
let v = v.expect("failed to create sequencer");
(v.id, v)
})
.collect::<BTreeMap<_, _>>()
.await;
.await?;
sequencers.insert(sequencer.id, sequencer);
}
Ok((kafka_topic, query_pool, sequencers))
}
/// Insert the compacted parquet file and its tombstones
pub async fn add_parquet_file_with_tombstones(
parquet_file: &ParquetFile,
tombstones: &[Tombstone],
txn: &mut dyn Transaction,
) -> Result<(ParquetFile, Vec<ProcessedTombstone>), Error> {
// create a parquet file in the catalog first
let parquet = txn
.parquet_files()
.create(
parquet_file.sequencer_id,
parquet_file.table_id,
parquet_file.partition_id,
parquet_file.object_store_id,
parquet_file.min_sequence_number,
parquet_file.max_sequence_number,
parquet_file.min_time,
parquet_file.max_time,
)
.await?;
// Now that the parquet file is available, create its processed tombstones
let processed_tombstones = txn
.processed_tombstones()
.create_many(parquet.id, tombstones)
.await?;
Ok((parquet, processed_tombstones))
}
#[cfg(test)]
mod tests {
use super::*;
@ -211,13 +230,16 @@ mod tests {
#[allow(clippy::bool_assert_comparison)]
#[tokio::test]
async fn [<test_validate_schema_ $name>]() {
use crate::interface::Catalog;
use std::ops::DerefMut;
use pretty_assertions::assert_eq;
const NAMESPACE_NAME: &str = "bananas";
let repo = MemCatalog::new();
let (kafka_topic, query_pool, _) = create_or_get_default_records(2, &repo).await.unwrap();
let mut txn = repo.start_transaction().await.unwrap();
let (kafka_topic, query_pool, _) = create_or_get_default_records(2, txn.deref_mut()).await.unwrap();
let namespace = repo
let namespace = txn
.namespaces()
.create(NAMESPACE_NAME, "inf", kafka_topic.id, query_pool.id)
.await
@ -240,7 +262,7 @@ mod tests {
let (writes, _) = mutable_batch_lp::lines_to_batches_stats(lp.as_str(), 42)
.expect("failed to build test writes from LP");
let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, &repo)
let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut())
.await;
match got {
@ -260,7 +282,7 @@ mod tests {
// Invariant: in absence of concurrency, the schema within
// the database must always match the incrementally built
// cached schema.
let db_schema = get_schema_by_name(NAMESPACE_NAME, &repo)
let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut())
.await
.expect("database failed to query for namespace schema");
assert_eq!(schema, db_schema, "schema in DB and cached schema differ");

View File

@ -2,23 +2,26 @@
//! used for testing or for an IOx designed to run without catalog persistence.
use crate::interface::{
Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic,
KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile,
ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionRepo, QueryPool, QueryPoolId,
QueryPoolRepo, Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId,
TableRepo, Timestamp, Tombstone, TombstoneId, TombstoneRepo,
sealed::TransactionFinalize, Catalog, Column, ColumnId, ColumnRepo, ColumnType, Error,
KafkaPartition, KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId,
NamespaceRepo, ParquetFile, ParquetFileId, ParquetFileRepo, Partition, PartitionId,
PartitionInfo, PartitionRepo, ProcessedTombstone, ProcessedTombstoneRepo, QueryPool,
QueryPoolId, QueryPoolRepo, Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo,
Table, TableId, TableRepo, Timestamp, Tombstone, TombstoneId, TombstoneRepo, Transaction,
};
use async_trait::async_trait;
use observability_deps::tracing::warn;
use std::convert::TryFrom;
use std::fmt::Formatter;
use std::sync::Mutex;
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};
use uuid::Uuid;
/// In-memory catalog that implements the `RepoCollection` and individual repo traits from
/// the catalog interface.
#[derive(Default)]
pub struct MemCatalog {
collections: Mutex<MemCollections>,
collections: Arc<Mutex<MemCollections>>,
}
impl MemCatalog {
@ -30,12 +33,11 @@ impl MemCatalog {
impl std::fmt::Debug for MemCatalog {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
let c = self.collections.lock().expect("mutex poisoned");
write!(f, "MemCatalog[ {:?} ]", c)
f.debug_struct("MemCatalog").finish_non_exhaustive()
}
}
#[derive(Default, Debug)]
#[derive(Default, Debug, Clone)]
struct MemCollections {
kafka_topics: Vec<KafkaTopic>,
query_pools: Vec<QueryPool>,
@ -46,6 +48,23 @@ struct MemCollections {
partitions: Vec<Partition>,
tombstones: Vec<Tombstone>,
parquet_files: Vec<ParquetFile>,
processed_tombstones: Vec<ProcessedTombstone>,
}
/// transaction bound to an in-memory catalog.
#[derive(Debug)]
pub struct MemTxn {
guard: OwnedMutexGuard<MemCollections>,
stage: MemCollections,
finalized: bool,
}
impl Drop for MemTxn {
fn drop(&mut self) {
if !self.finalized {
warn!("Dropping MemTxn w/o finalizing (commit or abort)");
}
}
}
#[async_trait]
@ -55,66 +74,95 @@ impl Catalog for MemCatalog {
Ok(())
}
fn kafka_topics(&self) -> &dyn KafkaTopicRepo {
async fn start_transaction(&self) -> Result<Box<dyn Transaction>, Error> {
let guard = Arc::clone(&self.collections).lock_owned().await;
let stage = guard.clone();
Ok(Box::new(MemTxn {
guard,
stage,
finalized: false,
}))
}
}
#[async_trait]
impl TransactionFinalize for MemTxn {
async fn commit_inplace(&mut self) -> Result<(), Error> {
*self.guard = std::mem::take(&mut self.stage);
self.finalized = true;
Ok(())
}
async fn abort_inplace(&mut self) -> Result<(), Error> {
self.finalized = true;
Ok(())
}
}
#[async_trait]
impl Transaction for MemTxn {
fn kafka_topics(&mut self) -> &mut dyn KafkaTopicRepo {
self
}
fn query_pools(&self) -> &dyn QueryPoolRepo {
fn query_pools(&mut self) -> &mut dyn QueryPoolRepo {
self
}
fn namespaces(&self) -> &dyn NamespaceRepo {
fn namespaces(&mut self) -> &mut dyn NamespaceRepo {
self
}
fn tables(&self) -> &dyn TableRepo {
fn tables(&mut self) -> &mut dyn TableRepo {
self
}
fn columns(&self) -> &dyn ColumnRepo {
fn columns(&mut self) -> &mut dyn ColumnRepo {
self
}
fn sequencers(&self) -> &dyn SequencerRepo {
fn sequencers(&mut self) -> &mut dyn SequencerRepo {
self
}
fn partitions(&self) -> &dyn PartitionRepo {
fn partitions(&mut self) -> &mut dyn PartitionRepo {
self
}
fn tombstones(&self) -> &dyn TombstoneRepo {
fn tombstones(&mut self) -> &mut dyn TombstoneRepo {
self
}
fn parquet_files(&self) -> &dyn ParquetFileRepo {
fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo {
self
}
fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo {
self
}
}
#[async_trait]
impl KafkaTopicRepo for MemCatalog {
async fn create_or_get(&self, name: &str) -> Result<KafkaTopic> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let topic = match collections.kafka_topics.iter().find(|t| t.name == name) {
impl KafkaTopicRepo for MemTxn {
async fn create_or_get(&mut self, name: &str) -> Result<KafkaTopic> {
let topic = match self.stage.kafka_topics.iter().find(|t| t.name == name) {
Some(t) => t,
None => {
let topic = KafkaTopic {
id: KafkaTopicId::new(collections.kafka_topics.len() as i32 + 1),
id: KafkaTopicId::new(self.stage.kafka_topics.len() as i32 + 1),
name: name.to_string(),
};
collections.kafka_topics.push(topic);
collections.kafka_topics.last().unwrap()
self.stage.kafka_topics.push(topic);
self.stage.kafka_topics.last().unwrap()
}
};
Ok(topic.clone())
}
async fn get_by_name(&self, name: &str) -> Result<Option<KafkaTopic>> {
let collections = self.collections.lock().expect("mutex poisoned");
let kafka_topic = collections
async fn get_by_name(&mut self, name: &str) -> Result<Option<KafkaTopic>> {
let kafka_topic = self
.stage
.kafka_topics
.iter()
.find(|t| t.name == name)
@ -124,19 +172,17 @@ impl KafkaTopicRepo for MemCatalog {
}
#[async_trait]
impl QueryPoolRepo for MemCatalog {
async fn create_or_get(&self, name: &str) -> Result<QueryPool> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let pool = match collections.query_pools.iter().find(|t| t.name == name) {
impl QueryPoolRepo for MemTxn {
async fn create_or_get(&mut self, name: &str) -> Result<QueryPool> {
let pool = match self.stage.query_pools.iter().find(|t| t.name == name) {
Some(t) => t,
None => {
let pool = QueryPool {
id: QueryPoolId::new(collections.query_pools.len() as i16 + 1),
id: QueryPoolId::new(self.stage.query_pools.len() as i16 + 1),
name: name.to_string(),
};
collections.query_pools.push(pool);
collections.query_pools.last().unwrap()
self.stage.query_pools.push(pool);
self.stage.query_pools.last().unwrap()
}
};
@ -145,35 +191,34 @@ impl QueryPoolRepo for MemCatalog {
}
#[async_trait]
impl NamespaceRepo for MemCatalog {
impl NamespaceRepo for MemTxn {
async fn create(
&self,
&mut self,
name: &str,
retention_duration: &str,
kafka_topic_id: KafkaTopicId,
query_pool_id: QueryPoolId,
) -> Result<Namespace> {
let mut collections = self.collections.lock().expect("mutex poisoned");
if collections.namespaces.iter().any(|n| n.name == name) {
if self.stage.namespaces.iter().any(|n| n.name == name) {
return Err(Error::NameExists {
name: name.to_string(),
});
}
let namespace = Namespace {
id: NamespaceId::new(collections.namespaces.len() as i32 + 1),
id: NamespaceId::new(self.stage.namespaces.len() as i32 + 1),
name: name.to_string(),
kafka_topic_id,
query_pool_id,
retention_duration: Some(retention_duration.to_string()),
};
collections.namespaces.push(namespace);
Ok(collections.namespaces.last().unwrap().clone())
self.stage.namespaces.push(namespace);
Ok(self.stage.namespaces.last().unwrap().clone())
}
async fn get_by_name(&self, name: &str) -> Result<Option<Namespace>> {
let collections = self.collections.lock().expect("mutex poisoned");
Ok(collections
async fn get_by_name(&mut self, name: &str) -> Result<Option<Namespace>> {
Ok(self
.stage
.namespaces
.iter()
.find(|n| n.name == name)
@ -182,11 +227,10 @@ impl NamespaceRepo for MemCatalog {
}
#[async_trait]
impl TableRepo for MemCatalog {
async fn create_or_get(&self, name: &str, namespace_id: NamespaceId) -> Result<Table> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let table = match collections
impl TableRepo for MemTxn {
async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result<Table> {
let table = match self
.stage
.tables
.iter()
.find(|t| t.name == name && t.namespace_id == namespace_id)
@ -194,21 +238,21 @@ impl TableRepo for MemCatalog {
Some(t) => t,
None => {
let table = Table {
id: TableId::new(collections.tables.len() as i32 + 1),
id: TableId::new(self.stage.tables.len() as i32 + 1),
namespace_id,
name: name.to_string(),
};
collections.tables.push(table);
collections.tables.last().unwrap()
self.stage.tables.push(table);
self.stage.tables.last().unwrap()
}
};
Ok(table.clone())
}
async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result<Vec<Table>> {
let collections = self.collections.lock().expect("mutex poisoned");
let tables: Vec<_> = collections
async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Table>> {
let tables: Vec<_> = self
.stage
.tables
.iter()
.filter(|t| t.namespace_id == namespace_id)
@ -219,16 +263,15 @@ impl TableRepo for MemCatalog {
}
#[async_trait]
impl ColumnRepo for MemCatalog {
impl ColumnRepo for MemTxn {
async fn create_or_get(
&self,
&mut self,
name: &str,
table_id: TableId,
column_type: ColumnType,
) -> Result<Column> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let column = match collections
let column = match self
.stage
.columns
.iter()
.find(|t| t.name == name && t.table_id == table_id)
@ -246,31 +289,31 @@ impl ColumnRepo for MemCatalog {
}
None => {
let column = Column {
id: ColumnId::new(collections.columns.len() as i32 + 1),
id: ColumnId::new(self.stage.columns.len() as i32 + 1),
table_id,
name: name.to_string(),
column_type: column_type as i16,
};
collections.columns.push(column);
collections.columns.last().unwrap()
self.stage.columns.push(column);
self.stage.columns.last().unwrap()
}
};
Ok(column.clone())
}
async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result<Vec<Column>> {
let collections = self.collections.lock().expect("mutex poisoned");
let table_ids: Vec<_> = collections
async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Column>> {
let table_ids: Vec<_> = self
.stage
.tables
.iter()
.filter(|t| t.namespace_id == namespace_id)
.map(|t| t.id)
.collect();
println!("tables: {:?}", collections.tables);
println!("tables: {:?}", self.stage.tables);
println!("table_ids: {:?}", table_ids);
let columns: Vec<_> = collections
let columns: Vec<_> = self
.stage
.columns
.iter()
.filter(|c| table_ids.contains(&c.table_id))
@ -282,15 +325,14 @@ impl ColumnRepo for MemCatalog {
}
#[async_trait]
impl SequencerRepo for MemCatalog {
impl SequencerRepo for MemTxn {
async fn create_or_get(
&self,
&mut self,
topic: &KafkaTopic,
partition: KafkaPartition,
) -> Result<Sequencer> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let sequencer = match collections
let sequencer = match self
.stage
.sequencers
.iter()
.find(|s| s.kafka_topic_id == topic.id && s.kafka_partition == partition)
@ -298,13 +340,13 @@ impl SequencerRepo for MemCatalog {
Some(t) => t,
None => {
let sequencer = Sequencer {
id: SequencerId::new(collections.sequencers.len() as i16 + 1),
id: SequencerId::new(self.stage.sequencers.len() as i16 + 1),
kafka_topic_id: topic.id,
kafka_partition: partition,
min_unpersisted_sequence_number: 0,
};
collections.sequencers.push(sequencer);
collections.sequencers.last().unwrap()
self.stage.sequencers.push(sequencer);
self.stage.sequencers.last().unwrap()
}
};
@ -312,12 +354,12 @@ impl SequencerRepo for MemCatalog {
}
async fn get_by_topic_id_and_partition(
&self,
&mut self,
topic_id: KafkaTopicId,
partition: KafkaPartition,
) -> Result<Option<Sequencer>> {
let collections = self.collections.lock().expect("mutex poisoned");
let sequencer = collections
let sequencer = self
.stage
.sequencers
.iter()
.find(|s| s.kafka_topic_id == topic_id && s.kafka_partition == partition)
@ -325,14 +367,13 @@ impl SequencerRepo for MemCatalog {
Ok(sequencer)
}
async fn list(&self) -> Result<Vec<Sequencer>> {
let collections = self.collections.lock().expect("mutex poisoned");
Ok(collections.sequencers.clone())
async fn list(&mut self) -> Result<Vec<Sequencer>> {
Ok(self.stage.sequencers.clone())
}
async fn list_by_kafka_topic(&self, topic: &KafkaTopic) -> Result<Vec<Sequencer>> {
let collections = self.collections.lock().expect("mutex poisoned");
let sequencers: Vec<_> = collections
async fn list_by_kafka_topic(&mut self, topic: &KafkaTopic) -> Result<Vec<Sequencer>> {
let sequencers: Vec<_> = self
.stage
.sequencers
.iter()
.filter(|s| s.kafka_topic_id == topic.id)
@ -343,36 +384,35 @@ impl SequencerRepo for MemCatalog {
}
#[async_trait]
impl PartitionRepo for MemCatalog {
impl PartitionRepo for MemTxn {
async fn create_or_get(
&self,
&mut self,
key: &str,
sequencer_id: SequencerId,
table_id: TableId,
) -> Result<Partition> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let partition = match collections.partitions.iter().find(|p| {
let partition = match self.stage.partitions.iter().find(|p| {
p.partition_key == key && p.sequencer_id == sequencer_id && p.table_id == table_id
}) {
Some(p) => p,
None => {
let p = Partition {
id: PartitionId::new(collections.partitions.len() as i64 + 1),
id: PartitionId::new(self.stage.partitions.len() as i64 + 1),
sequencer_id,
table_id,
partition_key: key.to_string(),
};
collections.partitions.push(p);
collections.partitions.last().unwrap()
self.stage.partitions.push(p);
self.stage.partitions.last().unwrap()
}
};
Ok(partition.clone())
}
async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result<Vec<Partition>> {
let collections = self.collections.lock().expect("mutex poisoned");
let partitions: Vec<_> = collections
async fn list_by_sequencer(&mut self, sequencer_id: SequencerId) -> Result<Vec<Partition>> {
let partitions: Vec<_> = self
.stage
.partitions
.iter()
.filter(|p| p.sequencer_id == sequencer_id)
@ -380,12 +420,50 @@ impl PartitionRepo for MemCatalog {
.collect();
Ok(partitions)
}
async fn partition_info_by_id(
&mut self,
partition_id: PartitionId,
) -> Result<Option<PartitionInfo>> {
let partition = self
.stage
.partitions
.iter()
.find(|p| p.id == partition_id)
.cloned();
if let Some(partition) = partition {
let table = self
.stage
.tables
.iter()
.find(|t| t.id == partition.table_id)
.cloned();
if let Some(table) = table {
let namespace = self
.stage
.namespaces
.iter()
.find(|n| n.id == table.namespace_id)
.cloned();
if let Some(namespace) = namespace {
return Ok(Some(PartitionInfo {
namespace_name: namespace.name,
table_name: table.name,
partition,
}));
}
}
}
Ok(None)
}
}
#[async_trait]
impl TombstoneRepo for MemCatalog {
impl TombstoneRepo for MemTxn {
async fn create_or_get(
&self,
&mut self,
table_id: TableId,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
@ -393,8 +471,7 @@ impl TombstoneRepo for MemCatalog {
max_time: Timestamp,
predicate: &str,
) -> Result<Tombstone> {
let mut collections = self.collections.lock().expect("mutex poisoned");
let tombstone = match collections.tombstones.iter().find(|t| {
let tombstone = match self.stage.tombstones.iter().find(|t| {
t.table_id == table_id
&& t.sequencer_id == sequencer_id
&& t.sequence_number == sequence_number
@ -402,7 +479,7 @@ impl TombstoneRepo for MemCatalog {
Some(t) => t,
None => {
let t = Tombstone {
id: TombstoneId::new(collections.tombstones.len() as i64 + 1),
id: TombstoneId::new(self.stage.tombstones.len() as i64 + 1),
table_id,
sequencer_id,
sequence_number,
@ -410,8 +487,8 @@ impl TombstoneRepo for MemCatalog {
max_time,
serialized_predicate: predicate.to_string(),
};
collections.tombstones.push(t);
collections.tombstones.last().unwrap()
self.stage.tombstones.push(t);
self.stage.tombstones.last().unwrap()
}
};
@ -419,12 +496,12 @@ impl TombstoneRepo for MemCatalog {
}
async fn list_tombstones_by_sequencer_greater_than(
&self,
&mut self,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
) -> Result<Vec<Tombstone>> {
let collections = self.collections.lock().expect("mutex poisoned");
let tombstones: Vec<_> = collections
let tombstones: Vec<_> = self
.stage
.tombstones
.iter()
.filter(|t| t.sequencer_id == sequencer_id && t.sequence_number > sequence_number)
@ -435,9 +512,9 @@ impl TombstoneRepo for MemCatalog {
}
#[async_trait]
impl ParquetFileRepo for MemCatalog {
impl ParquetFileRepo for MemTxn {
async fn create(
&self,
&mut self,
sequencer_id: SequencerId,
table_id: TableId,
partition_id: PartitionId,
@ -447,8 +524,8 @@ impl ParquetFileRepo for MemCatalog {
min_time: Timestamp,
max_time: Timestamp,
) -> Result<ParquetFile> {
let mut collections = self.collections.lock().expect("mutex poisoned");
if collections
if self
.stage
.parquet_files
.iter()
.any(|f| f.object_store_id == object_store_id)
@ -457,7 +534,7 @@ impl ParquetFileRepo for MemCatalog {
}
let parquet_file = ParquetFile {
id: ParquetFileId::new(collections.parquet_files.len() as i64 + 1),
id: ParquetFileId::new(self.stage.parquet_files.len() as i64 + 1),
sequencer_id,
table_id,
partition_id,
@ -468,14 +545,12 @@ impl ParquetFileRepo for MemCatalog {
max_time,
to_delete: false,
};
collections.parquet_files.push(parquet_file);
Ok(*collections.parquet_files.last().unwrap())
self.stage.parquet_files.push(parquet_file);
Ok(*self.stage.parquet_files.last().unwrap())
}
async fn flag_for_delete(&self, id: ParquetFileId) -> Result<()> {
let mut collections = self.collections.lock().expect("mutex poisoned");
match collections.parquet_files.iter_mut().find(|p| p.id == id) {
async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()> {
match self.stage.parquet_files.iter_mut().find(|p| p.id == id) {
Some(f) => f.to_delete = true,
None => return Err(Error::ParquetRecordNotFound { id }),
}
@ -484,12 +559,12 @@ impl ParquetFileRepo for MemCatalog {
}
async fn list_by_sequencer_greater_than(
&self,
&mut self,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
) -> Result<Vec<ParquetFile>> {
let collections = self.collections.lock().expect("mutex poisoned");
let files: Vec<_> = collections
let files: Vec<_> = self
.stage
.parquet_files
.iter()
.filter(|f| f.sequencer_id == sequencer_id && f.max_sequence_number > sequence_number)
@ -497,6 +572,100 @@ impl ParquetFileRepo for MemCatalog {
.collect();
Ok(files)
}
async fn exist(&mut self, id: ParquetFileId) -> Result<bool> {
Ok(self.stage.parquet_files.iter().any(|f| f.id == id))
}
async fn count(&mut self) -> Result<i64> {
let count = self.stage.parquet_files.len();
let count_i64 = i64::try_from(count);
if count_i64.is_err() {
return Err(Error::InvalidValue { value: count });
}
Ok(count_i64.unwrap())
}
}
#[async_trait]
impl ProcessedTombstoneRepo for MemTxn {
async fn create_many(
&mut self,
parquet_file_id: ParquetFileId,
tombstones: &[Tombstone],
) -> Result<Vec<ProcessedTombstone>> {
// check if the parquet file is available
if !self
.stage
.parquet_files
.iter()
.any(|f| f.id == parquet_file_id)
{
return Err(Error::FileNotFound {
id: parquet_file_id.get(),
});
}
let mut processed_tombstones = vec![];
for tombstone in tombstones {
// check if the tombstone exists
if !self.stage.tombstones.iter().any(|f| f.id == tombstone.id) {
return Err(Error::TombstoneNotFound {
id: tombstone.id.get(),
});
}
if self
.stage
.processed_tombstones
.iter()
.any(|pt| pt.tombstone_id == tombstone.id && pt.parquet_file_id == parquet_file_id)
{
// The tombstone was already processed for this file
return Err(Error::ProcessTombstoneExists {
parquet_file_id: parquet_file_id.get(),
tombstone_id: tombstone.id.get(),
});
}
let processed_tombstone = ProcessedTombstone {
tombstone_id: tombstone.id,
parquet_file_id,
};
processed_tombstones.push(processed_tombstone);
}
// save for returning
let return_processed_tombstones = processed_tombstones.clone();
// Add to the catalog
self.stage
.processed_tombstones
.append(&mut processed_tombstones);
Ok(return_processed_tombstones)
}
async fn exist(
&mut self,
parquet_file_id: ParquetFileId,
tombstone_id: TombstoneId,
) -> Result<bool> {
Ok(self
.stage
.processed_tombstones
.iter()
.any(|f| f.parquet_file_id == parquet_file_id && f.tombstone_id == tombstone_id))
}
async fn count(&mut self) -> Result<i64> {
let count = self.stage.processed_tombstones.len();
let count_i64 = i64::try_from(count);
if count_i64.is_err() {
return Err(Error::InvalidValue { value: count });
}
Ok(count_i64.unwrap())
}
}
#[cfg(test)]

View File

@ -1,15 +1,16 @@
//! A Postgres backed implementation of the Catalog
use crate::interface::{
Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition, KafkaTopic, KafkaTopicId,
KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile, ParquetFileId,
ParquetFileRepo, Partition, PartitionId, PartitionRepo, QueryPool, QueryPoolId, QueryPoolRepo,
Result, SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo,
Timestamp, Tombstone, TombstoneRepo,
sealed::TransactionFinalize, Catalog, Column, ColumnRepo, ColumnType, Error, KafkaPartition,
KafkaTopic, KafkaTopicId, KafkaTopicRepo, Namespace, NamespaceId, NamespaceRepo, ParquetFile,
ParquetFileId, ParquetFileRepo, Partition, PartitionId, PartitionInfo, PartitionRepo,
ProcessedTombstone, ProcessedTombstoneRepo, QueryPool, QueryPoolId, QueryPoolRepo, Result,
SequenceNumber, Sequencer, SequencerId, SequencerRepo, Table, TableId, TableRepo, Timestamp,
Tombstone, TombstoneId, TombstoneRepo, Transaction,
};
use async_trait::async_trait;
use observability_deps::tracing::info;
use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres};
use observability_deps::tracing::{info, warn};
use sqlx::{migrate::Migrator, postgres::PgPoolOptions, Executor, Pool, Postgres, Row};
use std::time::Duration;
use uuid::Uuid;
@ -21,12 +22,18 @@ pub const SCHEMA_NAME: &str = "iox_catalog";
static MIGRATOR: Migrator = sqlx::migrate!();
/// In-memory catalog that implements the `RepoCollection` and individual repo traits.
/// PostgreSQL catalog.
#[derive(Debug)]
pub struct PostgresCatalog {
pool: Pool<Postgres>,
}
// Struct to hold the return value of a "SELECT count(*) ..." query.
#[derive(sqlx::FromRow)]
struct Count {
count: i64,
}
impl PostgresCatalog {
/// Connect to the catalog store.
pub async fn connect(
@ -63,6 +70,50 @@ impl PostgresCatalog {
}
}
/// A transaction for [`PostgresCatalog`].
#[derive(Debug)]
pub struct PostgresTxn {
transaction: Option<sqlx::Transaction<'static, Postgres>>,
}
impl PostgresTxn {
fn transaction(&mut self) -> &mut sqlx::Transaction<'static, Postgres> {
self.transaction.as_mut().expect("Not yet finalized")
}
}
impl Drop for PostgresTxn {
fn drop(&mut self) {
if self.transaction.is_some() {
warn!("Dropping PostgresTxn w/o finalizing (commit or abort)");
// SQLx ensures that the inner transaction enqueues a rollback when it is dropped, so we don't need to spawn
// a task here to call `rollback` manually.
}
}
}
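// Illustrative sketch, not part of this diff: the `Option` + `take()` pattern used by
// PostgresTxn above, so that commit/abort can consume the inner sqlx transaction exactly
// once while `Drop` can still detect a missing finalize. Names are hypothetical.
#[allow(dead_code)]
mod finalize_sketch {
    struct FinalizeOnce<T> {
        inner: Option<T>,
    }

    impl<T> FinalizeOnce<T> {
        fn finalize(&mut self) -> T {
            // Taking the value out marks this wrapper as finalized.
            self.inner.take().expect("already finalized")
        }
    }

    impl<T> Drop for FinalizeOnce<T> {
        fn drop(&mut self) {
            if self.inner.is_some() {
                // Mirrors the warn! above: the value was never explicitly finalized; it is
                // simply dropped, which in the sqlx case queues a rollback.
                eprintln!("FinalizeOnce dropped without finalize()");
            }
        }
    }
}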
#[async_trait]
impl TransactionFinalize for PostgresTxn {
async fn commit_inplace(&mut self) -> Result<(), Error> {
self.transaction
.take()
.expect("Not yet finalized")
.commit()
.await
.map_err(|e| Error::SqlxError { source: e })
}
async fn abort_inplace(&mut self) -> Result<(), Error> {
self.transaction
.take()
.expect("Not yet finalized")
.rollback()
.await
.map_err(|e| Error::SqlxError { source: e })
}
}
#[async_trait]
impl Catalog for PostgresCatalog {
async fn setup(&self) -> Result<(), Error> {
@ -74,46 +125,65 @@ impl Catalog for PostgresCatalog {
Ok(())
}
fn kafka_topics(&self) -> &dyn KafkaTopicRepo {
async fn start_transaction(&self) -> Result<Box<dyn Transaction>, Error> {
let transaction = self
.pool
.begin()
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(Box::new(PostgresTxn {
transaction: Some(transaction),
}))
}
}
#[async_trait]
impl Transaction for PostgresTxn {
fn kafka_topics(&mut self) -> &mut dyn KafkaTopicRepo {
self
}
fn query_pools(&self) -> &dyn QueryPoolRepo {
fn query_pools(&mut self) -> &mut dyn QueryPoolRepo {
self
}
fn namespaces(&self) -> &dyn NamespaceRepo {
fn namespaces(&mut self) -> &mut dyn NamespaceRepo {
self
}
fn tables(&self) -> &dyn TableRepo {
fn tables(&mut self) -> &mut dyn TableRepo {
self
}
fn columns(&self) -> &dyn ColumnRepo {
fn columns(&mut self) -> &mut dyn ColumnRepo {
self
}
fn sequencers(&self) -> &dyn SequencerRepo {
fn sequencers(&mut self) -> &mut dyn SequencerRepo {
self
}
fn partitions(&self) -> &dyn PartitionRepo {
fn partitions(&mut self) -> &mut dyn PartitionRepo {
self
}
fn tombstones(&self) -> &dyn TombstoneRepo {
fn tombstones(&mut self) -> &mut dyn TombstoneRepo {
self
}
fn parquet_files(&self) -> &dyn ParquetFileRepo {
fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo {
self
}
fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo {
self
}
}
#[async_trait]
impl KafkaTopicRepo for PostgresCatalog {
async fn create_or_get(&self, name: &str) -> Result<KafkaTopic> {
impl KafkaTopicRepo for PostgresTxn {
async fn create_or_get(&mut self, name: &str) -> Result<KafkaTopic> {
let rec = sqlx::query_as::<_, KafkaTopic>(
r#"
INSERT INTO kafka_topic ( name )
@ -123,21 +193,21 @@ DO UPDATE SET name = kafka_topic.name RETURNING *;
"#,
)
.bind(&name) // $1
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(rec)
}
async fn get_by_name(&self, name: &str) -> Result<Option<KafkaTopic>> {
async fn get_by_name(&mut self, name: &str) -> Result<Option<KafkaTopic>> {
let rec = sqlx::query_as::<_, KafkaTopic>(
r#"
SELECT * FROM kafka_topic WHERE name = $1;
"#,
)
.bind(&name) // $1
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await;
if let Err(sqlx::Error::RowNotFound) = rec {
@ -151,8 +221,8 @@ SELECT * FROM kafka_topic WHERE name = $1;
}
#[async_trait]
impl QueryPoolRepo for PostgresCatalog {
async fn create_or_get(&self, name: &str) -> Result<QueryPool> {
impl QueryPoolRepo for PostgresTxn {
async fn create_or_get(&mut self, name: &str) -> Result<QueryPool> {
let rec = sqlx::query_as::<_, QueryPool>(
r#"
INSERT INTO query_pool ( name )
@ -162,7 +232,7 @@ DO UPDATE SET name = query_pool.name RETURNING *;
"#,
)
.bind(&name) // $1
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
@ -171,9 +241,9 @@ DO UPDATE SET name = query_pool.name RETURNING *;
}
#[async_trait]
impl NamespaceRepo for PostgresCatalog {
impl NamespaceRepo for PostgresTxn {
async fn create(
&self,
&mut self,
name: &str,
retention_duration: &str,
kafka_topic_id: KafkaTopicId,
@ -190,7 +260,7 @@ RETURNING *
.bind(&retention_duration) // $2
.bind(kafka_topic_id) // $3
.bind(query_pool_id) // $4
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_unique_violation(&e) {
@ -207,14 +277,14 @@ RETURNING *
Ok(rec)
}
async fn get_by_name(&self, name: &str) -> Result<Option<Namespace>> {
async fn get_by_name(&mut self, name: &str) -> Result<Option<Namespace>> {
let rec = sqlx::query_as::<_, Namespace>(
r#"
SELECT * FROM namespace WHERE name = $1;
"#,
)
.bind(&name) // $1
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await;
if let Err(sqlx::Error::RowNotFound) = rec {
@ -228,8 +298,8 @@ SELECT * FROM namespace WHERE name = $1;
}
#[async_trait]
impl TableRepo for PostgresCatalog {
async fn create_or_get(&self, name: &str, namespace_id: NamespaceId) -> Result<Table> {
impl TableRepo for PostgresTxn {
async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result<Table> {
let rec = sqlx::query_as::<_, Table>(
r#"
INSERT INTO table_name ( name, namespace_id )
@ -240,7 +310,7 @@ DO UPDATE SET name = table_name.name RETURNING *;
)
.bind(&name) // $1
.bind(&namespace_id) // $2
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_fk_violation(&e) {
@ -253,7 +323,7 @@ DO UPDATE SET name = table_name.name RETURNING *;
Ok(rec)
}
async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result<Vec<Table>> {
async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Table>> {
let rec = sqlx::query_as::<_, Table>(
r#"
SELECT * FROM table_name
@ -261,7 +331,7 @@ WHERE namespace_id = $1;
"#,
)
.bind(&namespace_id)
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
@ -270,9 +340,9 @@ WHERE namespace_id = $1;
}
#[async_trait]
impl ColumnRepo for PostgresCatalog {
impl ColumnRepo for PostgresTxn {
async fn create_or_get(
&self,
&mut self,
name: &str,
table_id: TableId,
column_type: ColumnType,
@ -290,7 +360,7 @@ DO UPDATE SET name = column_name.name RETURNING *;
.bind(&name) // $1
.bind(&table_id) // $2
.bind(&ct) // $3
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_fk_violation(&e) {
@ -311,7 +381,7 @@ DO UPDATE SET name = column_name.name RETURNING *;
Ok(rec)
}
async fn list_by_namespace_id(&self, namespace_id: NamespaceId) -> Result<Vec<Column>> {
async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Column>> {
let rec = sqlx::query_as::<_, Column>(
r#"
SELECT column_name.* FROM table_name
@ -320,7 +390,7 @@ WHERE table_name.namespace_id = $1;
"#,
)
.bind(&namespace_id)
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
@ -329,9 +399,9 @@ WHERE table_name.namespace_id = $1;
}
#[async_trait]
impl SequencerRepo for PostgresCatalog {
impl SequencerRepo for PostgresTxn {
async fn create_or_get(
&self,
&mut self,
topic: &KafkaTopic,
partition: KafkaPartition,
) -> Result<Sequencer> {
@ -347,7 +417,7 @@ impl SequencerRepo for PostgresCatalog {
)
.bind(&topic.id) // $1
.bind(&partition) // $2
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_fk_violation(&e) {
@ -359,7 +429,7 @@ impl SequencerRepo for PostgresCatalog {
}
async fn get_by_topic_id_and_partition(
&self,
&mut self,
topic_id: KafkaTopicId,
partition: KafkaPartition,
) -> Result<Option<Sequencer>> {
@ -370,7 +440,7 @@ SELECT * FROM sequencer WHERE kafka_topic_id = $1 AND kafka_partition = $2;
)
.bind(topic_id) // $1
.bind(partition) // $2
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await;
if let Err(sqlx::Error::RowNotFound) = rec {
@ -382,26 +452,26 @@ SELECT * FROM sequencer WHERE kafka_topic_id = $1 AND kafka_partition = $2;
Ok(Some(sequencer))
}
async fn list(&self) -> Result<Vec<Sequencer>> {
async fn list(&mut self) -> Result<Vec<Sequencer>> {
sqlx::query_as::<_, Sequencer>(r#"SELECT * FROM sequencer;"#)
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })
}
async fn list_by_kafka_topic(&self, topic: &KafkaTopic) -> Result<Vec<Sequencer>> {
async fn list_by_kafka_topic(&mut self, topic: &KafkaTopic) -> Result<Vec<Sequencer>> {
sqlx::query_as::<_, Sequencer>(r#"SELECT * FROM sequencer WHERE kafka_topic_id = $1;"#)
.bind(&topic.id) // $1
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })
}
}
#[async_trait]
impl PartitionRepo for PostgresCatalog {
impl PartitionRepo for PostgresTxn {
async fn create_or_get(
&self,
&mut self,
key: &str,
sequencer_id: SequencerId,
table_id: TableId,
@ -419,7 +489,7 @@ impl PartitionRepo for PostgresCatalog {
.bind(key) // $1
.bind(&sequencer_id) // $2
.bind(&table_id) // $3
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_fk_violation(&e) {
@ -430,19 +500,53 @@ impl PartitionRepo for PostgresCatalog {
})
}
async fn list_by_sequencer(&self, sequencer_id: SequencerId) -> Result<Vec<Partition>> {
async fn list_by_sequencer(&mut self, sequencer_id: SequencerId) -> Result<Vec<Partition>> {
sqlx::query_as::<_, Partition>(r#"SELECT * FROM partition WHERE sequencer_id = $1;"#)
.bind(&sequencer_id) // $1
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })
}
async fn partition_info_by_id(
&mut self,
partition_id: PartitionId,
) -> Result<Option<PartitionInfo>> {
let info = sqlx::query(
r#"
SELECT namespace.name as namespace_name, table_name.name as table_name, partition.id,
partition.sequencer_id, partition.table_id, partition.partition_key
FROM partition
INNER JOIN table_name on table_name.id = partition.table_id
INNER JOIN namespace on namespace.id = table_name.namespace_id
WHERE partition.id = $1;"#,
)
.bind(&partition_id) // $1
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
let namespace_name = info.get("namespace_name");
let table_name = info.get("table_name");
let partition = Partition {
id: info.get("id"),
sequencer_id: info.get("sequencer_id"),
table_id: info.get("table_id"),
partition_key: info.get("partition_key"),
};
Ok(Some(PartitionInfo {
namespace_name,
table_name,
partition,
}))
}
}
#[async_trait]
impl TombstoneRepo for PostgresCatalog {
impl TombstoneRepo for PostgresTxn {
async fn create_or_get(
&self,
&mut self,
table_id: TableId,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
@ -466,7 +570,7 @@ impl TombstoneRepo for PostgresCatalog {
.bind(&min_time) // $4
.bind(&max_time) // $5
.bind(predicate) // $6
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_fk_violation(&e) {
@ -478,23 +582,23 @@ impl TombstoneRepo for PostgresCatalog {
}
async fn list_tombstones_by_sequencer_greater_than(
&self,
&mut self,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
) -> Result<Vec<Tombstone>> {
sqlx::query_as::<_, Tombstone>(r#"SELECT * FROM tombstone WHERE sequencer_id = $1 AND sequence_number > $2 ORDER BY id;"#)
.bind(&sequencer_id) // $1
.bind(&sequence_number) // $2
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })
}
}
#[async_trait]
impl ParquetFileRepo for PostgresCatalog {
impl ParquetFileRepo for PostgresTxn {
async fn create(
&self,
&mut self,
sequencer_id: SequencerId,
table_id: TableId,
partition_id: PartitionId,
@ -519,7 +623,7 @@ RETURNING *
.bind(max_sequence_number) // $6
.bind(min_time) // $7
.bind(max_time) // $8
.fetch_one(&self.pool)
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_unique_violation(&e) {
@ -536,10 +640,10 @@ RETURNING *
Ok(rec)
}
async fn flag_for_delete(&self, id: ParquetFileId) -> Result<()> {
async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()> {
let _ = sqlx::query(r#"UPDATE parquet_file SET to_delete = true WHERE id = $1;"#)
.bind(&id) // $1
.execute(&self.pool)
.execute(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
@ -547,17 +651,109 @@ RETURNING *
}
async fn list_by_sequencer_greater_than(
&self,
&mut self,
sequencer_id: SequencerId,
sequence_number: SequenceNumber,
) -> Result<Vec<ParquetFile>> {
sqlx::query_as::<_, ParquetFile>(r#"SELECT * FROM parquet_file WHERE sequencer_id = $1 AND max_sequence_number > $2 ORDER BY id;"#)
.bind(&sequencer_id) // $1
.bind(&sequence_number) // $2
.fetch_all(&self.pool)
.fetch_all(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })
}
async fn exist(&mut self, id: ParquetFileId) -> Result<bool> {
let read_result = sqlx::query_as::<_, Count>(
r#"SELECT count(*) as count FROM parquet_file WHERE id = $1;"#,
)
.bind(&id) // $1
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(read_result.count > 0)
}
async fn count(&mut self) -> Result<i64> {
let read_result =
sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM parquet_file;"#)
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(read_result.count)
}
}
#[async_trait]
impl ProcessedTombstoneRepo for PostgresTxn {
async fn create_many(
&mut self,
parquet_file_id: ParquetFileId,
tombstones: &[Tombstone],
) -> Result<Vec<ProcessedTombstone>> {
// TODO: we should never need this per-row loop, but right now we implement two catalogs,
// postgres (for production) and mem (for testing only), and the latter does not provide a
// transaction; this will be refactored once Marco's new abstraction is done.
let mut processed_tombstones = vec![];
for tombstone in tombstones {
let processed_tombstone = sqlx::query_as::<_, ProcessedTombstone>(
r#"
INSERT INTO processed_tombstone ( tombstone_id, parquet_file_id )
VALUES ( $1, $2 )
RETURNING *
"#,
)
.bind(tombstone.id) // $1
.bind(parquet_file_id) // $2
.fetch_one(self.transaction())
.await
.map_err(|e| {
if is_unique_violation(&e) {
Error::ProcessTombstoneExists {
tombstone_id: tombstone.id.get(),
parquet_file_id: parquet_file_id.get(),
}
} else if is_fk_violation(&e) {
Error::ForeignKeyViolation { source: e }
} else {
Error::SqlxError { source: e }
}
})?;
processed_tombstones.push(processed_tombstone);
}
Ok(processed_tombstones)
}
async fn exist(
&mut self,
parquet_file_id: ParquetFileId,
tombstone_id: TombstoneId,
) -> Result<bool> {
let read_result = sqlx::query_as::<_, Count>(
r#"SELECT count(*) as count FROM processed_tombstone WHERE parquet_file_id = $1 AND tombstone_id = $2;"#)
.bind(&parquet_file_id) // $1
.bind(&tombstone_id) // $2
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(read_result.count > 0)
}
async fn count(&mut self) -> Result<i64> {
let read_result =
sqlx::query_as::<_, Count>(r#"SELECT count(*) as count FROM processed_tombstone;"#)
.fetch_one(self.transaction())
.await
.map_err(|e| Error::SqlxError { source: e })?;
Ok(read_result.count)
}
}
/// The error code returned by Postgres for a unique constraint violation.
@ -659,6 +855,10 @@ mod tests {
}
async fn clear_schema(pool: &Pool<Postgres>) {
sqlx::query("delete from processed_tombstone;")
.execute(pool)
.await
.unwrap();
sqlx::query("delete from tombstone;")
.execute(pool)
.await

View File

@ -5,7 +5,7 @@ use data_types::{
};
use datafusion::physical_plan::SendableRecordBatchStream;
use iox_object_store::{IoxObjectStore, ParquetFilePath};
use predicate::predicate::Predicate;
use predicate::Predicate;
use schema::selection::Selection;
use schema::{Schema, TIME_COLUMN_NAME};
use snafu::{ResultExt, Snafu};

View File

@ -92,7 +92,9 @@ use data_types::{
};
use generated_types::influxdata::iox::ingest::v1 as proto;
use generated_types::influxdata::iox::preserved_catalog::v1 as preserved_catalog;
use iox_catalog::interface::{NamespaceId, PartitionId, SequenceNumber, SequencerId, TableId};
use iox_catalog::interface::{
NamespaceId, ParquetFile, ParquetFileId, PartitionId, SequenceNumber, SequencerId, TableId,
};
use parquet::{
arrow::parquet_to_arrow_schema,
file::{
@ -589,6 +591,26 @@ impl IoxMetadata {
pub fn match_object_store_id(&self, uuid: Uuid) -> bool {
uuid == self.object_store_id
}
/// Create the corresponding IOx catalog [`ParquetFile`] from this metadata.
pub fn to_parquet_file(&self) -> ParquetFile {
ParquetFile {
id: ParquetFileId::new(0), // the real id is assigned by the catalog DB; this placeholder 0 is never used
sequencer_id: self.sequencer_id,
table_id: self.table_id,
partition_id: self.partition_id,
object_store_id: self.object_store_id,
min_sequence_number: self.min_sequence_number,
max_sequence_number: self.max_sequence_number,
min_time: iox_catalog::interface::Timestamp::new(
self.time_of_first_write.timestamp_nanos(),
),
max_time: iox_catalog::interface::Timestamp::new(
self.time_of_last_write.timestamp_nanos(),
),
to_delete: false,
}
}
}
/// Parse big-endian UUID from protobuf.

View File

@ -22,7 +22,7 @@ use parquet::{
basic::Compression,
file::{metadata::KeyValue, properties::WriterProperties, writer::TryClone},
};
use predicate::predicate::Predicate;
use predicate::Predicate;
use schema::selection::Selection;
use snafu::{OptionExt, ResultExt, Snafu};
use std::{

View File

@ -68,7 +68,7 @@ pub enum Error {
/// Result type for Parser Client
pub type Result<T, E = Error> = std::result::Result<T, E>;
impl From<DeletePredicate> for crate::predicate::Predicate {
impl From<DeletePredicate> for crate::Predicate {
fn from(pred: DeletePredicate) -> Self {
Self {
field_columns: None,

View File

@ -10,7 +10,648 @@
pub mod delete_expr;
pub mod delete_predicate;
pub mod predicate;
pub mod regex;
pub mod rewrite;
pub mod rpc_predicate;
use data_types::timestamp::{TimestampRange, MAX_NANO_TIME, MIN_NANO_TIME};
use datafusion::{
error::DataFusionError,
logical_plan::{col, lit_timestamp_nano, Column, Expr, Operator},
optimizer::utils,
};
use datafusion_util::{make_range_expr, AndExprBuilder};
use observability_deps::tracing::debug;
use schema::TIME_COLUMN_NAME;
use std::{
collections::{BTreeSet, HashSet},
fmt,
};
/// This `Predicate` represents the empty predicate (aka that evaluates to true for all rows).
pub const EMPTY_PREDICATE: Predicate = Predicate {
field_columns: None,
exprs: vec![],
range: None,
partition_key: None,
value_expr: vec![],
};
/// A unified Predicate structure for IOx queries that can select and filter Fields and Tags from
/// the InfluxDB data model, as well as for arbitrary other predicates that are expressed by
/// DataFusion's [`Expr`] type.
///
/// Note that the InfluxDB data model (e.g. ParsedLine's) distinguishes between some types of
/// columns (tags and fields), and likewise the semantics of this structure can express some types
/// of restrictions that only apply to certain types of columns.
#[derive(Clone, Debug, Default, PartialEq, PartialOrd)]
pub struct Predicate {
/// Optional field restriction. If present, restricts the results to only
/// tables which have *at least one* of the fields in field_columns.
pub field_columns: Option<BTreeSet<String>>,
/// Optional partition key filter
pub partition_key: Option<String>,
/// Optional timestamp range: only rows within this range are included in
/// results. Other rows are excluded
pub range: Option<TimestampRange>,
/// Optional arbitrary predicates, represented as a list of
/// DataFusion expressions applied as a logical conjunction (aka they
/// are 'AND'ed together). Only rows that evaluate to TRUE for all
/// these expressions should be returned. Other rows are excluded
/// from the results.
pub exprs: Vec<Expr>,
/// Optional arbitrary predicates on the special `_value` column. These
/// expressions are applied to `field_columns` projections in the form of
/// `CASE` statement conditions.
pub value_expr: Vec<BinaryExpr>,
}
impl Predicate {
/// Return true if this predicate has any general purpose predicates
pub fn has_exprs(&self) -> bool {
!self.exprs.is_empty()
}
/// Return a DataFusion `Expr` predicate representing the
/// combination of all predicate (`exprs`) and timestamp
/// restriction in this Predicate. Returns None if there are no
/// `Expr`'s restricting the data
pub fn filter_expr(&self) -> Option<Expr> {
let mut builder =
AndExprBuilder::default().append_opt(self.make_timestamp_predicate_expr());
for expr in &self.exprs {
builder = builder.append_expr(expr.clone());
}
builder.build()
}
/// Return true if the field should be included in results
pub fn should_include_field(&self, field_name: &str) -> bool {
match &self.field_columns {
None => true, // No field restriction on predicate
Some(field_names) => field_names.contains(field_name),
}
}
/// Creates a DataFusion predicate for applying a timestamp range:
///
/// `range.start <= time and time < range.end`
fn make_timestamp_predicate_expr(&self) -> Option<Expr> {
self.range
.map(|range| make_range_expr(range.start(), range.end(), TIME_COLUMN_NAME))
}
/// Returns true if this predicate evaluates to true for all rows
pub fn is_empty(&self) -> bool {
self == &EMPTY_PREDICATE
}
/// Return a negated DF logical expression for the given delete predicates
pub fn negated_expr<S>(delete_predicates: &[S]) -> Option<Expr>
where
S: AsRef<Self>,
{
if delete_predicates.is_empty() {
return None;
}
let mut pred = PredicateBuilder::default().build();
pred.merge_delete_predicates(delete_predicates);
// Make a conjunctive expression of the pred.exprs
let mut val = None;
for e in pred.exprs {
match val {
None => val = Some(e),
Some(expr) => val = Some(expr.and(e)),
}
}
val
}
/// Merge the given delete predicates into this select predicate.
/// Since we want to eliminate data filtered by the delete predicates,
/// they are first converted into their negated form: NOT(delete_predicate)
/// then added/merged into the selection one
pub fn merge_delete_predicates<S>(&mut self, delete_predicates: &[S])
where
S: AsRef<Self>,
{
// Create a list of disjunctive negated expressions.
// Example: there are two deletes as follows (note that time_range is stored separated in the Predicate
// but we need to put it together with the exprs here)
// . Delete_1: WHERE city != "Boston" AND temp = 70 AND time_range in [10, 30)
// . Delete 2: WHERE state = "NY" AND route != "I90" AND time_range in [20, 50)
// The negated list will be "NOT(Delete_1)", NOT(Delete_2)" which means
// NOT(city != "Boston" AND temp = 70 AND time_range in [10, 30]), NOT(state = "NY" AND route != "I90" AND time_range in [20, 50]) which means
// [NOT(city = Boston") OR NOT(temp = 70) OR NOT(time_range in [10, 30])], [NOT(state = "NY") OR NOT(route != "I90") OR NOT(time_range in [20, 50])]
// Note that the "NOT(time_range in [20, 50])]" or "NOT(20 <= time <= 50)"" is replaced with "time < 20 OR time > 50"
for pred in delete_predicates {
let pred = pred.as_ref();
let mut expr: Option<Expr> = None;
// Time range
if let Some(range) = pred.range {
// time_expr = NOT(start <= time_range <= end)
// Equivalent to: (time < start OR time > end)
let time_expr = col(TIME_COLUMN_NAME)
.lt(lit_timestamp_nano(range.start()))
.or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end())));
match expr {
None => expr = Some(time_expr),
Some(e) => expr = Some(e.or(time_expr)),
}
}
// Exprs
for exp in &pred.exprs {
match expr {
None => expr = Some(exp.clone().not()),
Some(e) => expr = Some(e.or(exp.clone().not())),
}
}
// Push the negated expression of the delete predicate into the list exprs of the select predicate
if let Some(e) = expr {
self.exprs.push(e);
}
}
}
/// Removes the timestamp range from this predicate, if the range
/// is for the entire min/max valid range.
///
/// This is used in certain cases to retain compatibility with the
/// existing storage engine
pub(crate) fn clear_timestamp_if_max_range(mut self) -> Self {
self.range = self.range.take().and_then(|range| {
if range.start() <= MIN_NANO_TIME && range.end() >= MAX_NANO_TIME {
None
} else {
Some(range)
}
});
self
}
}
impl fmt::Display for Predicate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn iter_to_str<S>(s: impl IntoIterator<Item = S>) -> String
where
S: ToString,
{
s.into_iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", ")
}
write!(f, "Predicate")?;
if let Some(field_columns) = &self.field_columns {
write!(f, " field_columns: {{{}}}", iter_to_str(field_columns))?;
}
if let Some(partition_key) = &self.partition_key {
write!(f, " partition_key: '{}'", partition_key)?;
}
if let Some(range) = &self.range {
// TODO: could be nice to show this as actual timestamps (not just numbers)?
write!(f, " range: [{} - {}]", range.start(), range.end())?;
}
if !self.exprs.is_empty() {
write!(f, " exprs: [")?;
for (i, expr) in self.exprs.iter().enumerate() {
write!(f, "{}", expr)?;
if i < self.exprs.len() - 1 {
write!(f, ", ")?;
}
}
write!(f, "]")?;
}
Ok(())
}
}
#[derive(Debug, Clone, Copy)]
/// The result of evaluating a predicate on a set of rows
pub enum PredicateMatch {
/// There is at least one row that matches the predicate that has
/// at least one non null value in each field of the predicate
AtLeastOneNonNullField,
/// There are exactly zero rows that match the predicate
Zero,
/// There *may* be rows that match, OR there *may* be no rows that
/// match
Unknown,
}
/// Structure for building [`Predicate`]s
///
/// Example:
/// ```
/// use predicate::PredicateBuilder;
/// use datafusion::logical_plan::{col, lit};
///
/// let p = PredicateBuilder::new()
/// .timestamp_range(1, 100)
/// .add_expr(col("foo").eq(lit(42)))
/// .build();
///
/// assert_eq!(
/// p.to_string(),
/// "Predicate range: [1 - 100] exprs: [#foo = Int32(42)]"
/// );
/// ```
#[derive(Debug, Default)]
pub struct PredicateBuilder {
inner: Predicate,
}
impl From<Predicate> for PredicateBuilder {
fn from(inner: Predicate) -> Self {
Self { inner }
}
}
impl PredicateBuilder {
pub fn new() -> Self {
Self::default()
}
/// Sets the timestamp range
pub fn timestamp_range(mut self, start: i64, end: i64) -> Self {
// Without more thought, redefining the timestamp range would
// lose the old range. Assert that this cannot happen.
assert!(
self.inner.range.is_none(),
"Unexpected re-definition of timestamp range"
);
self.inner.range = Some(TimestampRange::new(start, end));
self
}
/// sets the optional timestamp range, if any
pub fn timestamp_range_option(mut self, range: Option<TimestampRange>) -> Self {
// Without more thought, redefining the timestamp range would
// lose the old range. Assert that this cannot happen.
assert!(
range.is_none() || self.inner.range.is_none(),
"Unexpected re-definition of timestamp range"
);
self.inner.range = range;
self
}
/// Adds an expression to the list of general purpose predicates
pub fn add_expr(mut self, expr: Expr) -> Self {
self.inner.exprs.push(expr);
self
}
/// Builds a regex matching expression from the provided column name and
/// pattern. Values not matching the regex will be filtered out.
pub fn build_regex_match_expr(self, column: &str, pattern: impl Into<String>) -> Self {
self.regex_match_expr(column, pattern, true)
}
/// Builds a regex "not matching" expression from the provided column name
/// and pattern. Values *matching* the regex will be filtered out.
pub fn build_regex_not_match_expr(self, column: &str, pattern: impl Into<String>) -> Self {
self.regex_match_expr(column, pattern, false)
}
fn regex_match_expr(mut self, column: &str, pattern: impl Into<String>, matches: bool) -> Self {
let expr = crate::regex::regex_match_expr(col(column), pattern.into(), matches);
self.inner.exprs.push(expr);
self
}
/// Sets field_column restriction
pub fn field_columns(mut self, columns: Vec<impl Into<String>>) -> Self {
// We need to distinguish predicates like `column_name In
// (foo, bar)` and `column_name = foo and column_name = bar` in order to handle
// this
if self.inner.field_columns.is_some() {
unimplemented!("Complex/Multi field predicates are not yet supported");
}
let column_names = columns
.into_iter()
.map(|s| s.into())
.collect::<BTreeSet<_>>();
self.inner.field_columns = Some(column_names);
self
}
/// Set the partition key restriction
pub fn partition_key(mut self, partition_key: impl Into<String>) -> Self {
assert!(
self.inner.partition_key.is_none(),
"multiple partition key predicates not suported"
);
self.inner.partition_key = Some(partition_key.into());
self
}
/// Create a predicate, consuming this builder
pub fn build(self) -> Predicate {
self.inner
}
/// Adds only the expressions from `filters` that can be pushed down to
/// execution engines.
pub fn add_pushdown_exprs(mut self, filters: &[Expr]) -> Self {
// For each expression of the filters, recursively split it if it is an AND conjunction.
// For example, expression (x AND y) will be split into a vector of 2 expressions [x, y]
let mut exprs = vec![];
filters
.iter()
.for_each(|expr| Self::split_members(expr, &mut exprs));
// Only keep single_column and primitive binary expressions
let mut pushdown_exprs: Vec<Expr> = vec![];
let exprs_result = exprs
.into_iter()
.try_for_each::<_, Result<_, DataFusionError>>(|expr| {
let mut columns = HashSet::new();
utils::expr_to_columns(&expr, &mut columns)?;
if columns.len() == 1 && Self::primitive_binary_expr(&expr) {
pushdown_exprs.push(expr);
}
Ok(())
});
match exprs_result {
Ok(()) => {
// Return the builder with only the pushdownable expressions on it.
self.inner.exprs.append(&mut pushdown_exprs);
}
Err(e) => {
debug!("Error, {}, building push-down predicates for filters: {:#?}. No predicates are pushed down", e, filters);
}
}
self
}
/// Recursively split all "AND" expressions into smaller ones
/// Example: "A AND B AND C" => [A, B, C]
pub fn split_members(predicate: &Expr, predicates: &mut Vec<Expr>) {
match predicate {
Expr::BinaryExpr {
right,
op: Operator::And,
left,
} => {
Self::split_members(left, predicates);
Self::split_members(right, predicates);
}
other => predicates.push(other.clone()),
}
}
/// Return true if the given expression is a primitive binary expression of the form
/// `column op constant`, where `op` is a comparison operator.
pub fn primitive_binary_expr(expr: &Expr) -> bool {
match expr {
Expr::BinaryExpr { left, op, right } => {
matches!(
(&**left, &**right),
(Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_))
) && matches!(
op,
Operator::Eq
| Operator::NotEq
| Operator::Lt
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
)
}
_ => false,
}
}
}
/// A representation of the `BinaryExpr` variant of a DataFusion expression.
#[derive(Clone, Debug, PartialEq, PartialOrd)]
pub struct BinaryExpr {
pub left: Column,
pub op: Operator,
pub right: Expr,
}
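// Illustrative usage, not part of this diff: `filter_expr` folds the timestamp range and
// every entry in `exprs` into one conjunctive DataFusion `Expr`. It relies only on APIs
// defined in this file plus `datafusion::logical_plan::{col, lit}`; the function name is
// hypothetical.
#[allow(dead_code)]
fn example_filter_expr() {
    use datafusion::logical_plan::{col, lit};

    let p = PredicateBuilder::new()
        .timestamp_range(1, 100)
        .add_expr(col("state").eq(lit("CA")))
        .build();

    // Combines `1 <= time < 100` with `state = CA` into a single AND-ed expression.
    if let Some(combined) = p.filter_expr() {
        println!("{}", combined);
    }
}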
#[cfg(test)]
mod tests {
use super::*;
use datafusion::logical_plan::{col, lit};
#[test]
fn test_default_predicate_is_empty() {
let p = Predicate::default();
assert!(p.is_empty());
}
#[test]
fn test_non_default_predicate_is_not_empty() {
let p = PredicateBuilder::new().timestamp_range(1, 100).build();
assert!(!p.is_empty());
}
#[test]
fn test_pushdown_predicates() {
let mut filters = vec![];
// state = CA
let expr1 = col("state").eq(lit("CA"));
filters.push(expr1);
// "price > 10"
let expr2 = col("price").gt(lit(10));
filters.push(expr2);
// a < 10 AND b >= 50 --> will be split to [a < 10, b >= 50]
let expr3 = col("a").lt(lit(10)).and(col("b").gt_eq(lit(50)));
filters.push(expr3);
// c != 3 OR d = 8 --> won't be pushed down
let expr4 = col("c").not_eq(lit(3)).or(col("d").eq(lit(8)));
filters.push(expr4);
// e is null --> won't be pushed down
let expr5 = col("e").is_null();
filters.push(expr5);
// f <= 60
let expr6 = col("f").lt_eq(lit(60));
filters.push(expr6);
// g is not null --> won't be pushed down
let expr7 = col("g").is_not_null();
filters.push(expr7);
// h + i --> won't be pushed down
let expr8 = col("h") + col("i");
filters.push(expr8);
// city = Boston
let expr9 = col("city").eq(lit("Boston"));
filters.push(expr9);
// city != Braintree
let expr9 = col("city").not_eq(lit("Braintree"));
filters.push(expr9);
// city != state --> won't be pushed down
let expr10 = col("city").not_eq(col("state"));
filters.push(expr10);
// city = state --> won't be pushed down
let expr11 = col("city").eq(col("state"));
filters.push(expr11);
// city_state = city + state --> won't be pushed down
let expr12 = col("city_sate").eq(col("city") + col("state"));
filters.push(expr12);
// city = city + 5 --> won't be pushed down
let expr13 = col("city").eq(col("city") + lit(5));
filters.push(expr13);
// city = city --> won't be pushed down
let expr14 = col("city").eq(col("city"));
filters.push(expr14);
// city + 5 = city --> won't be pushed down
let expr15 = (col("city") + lit(5)).eq(col("city"));
filters.push(expr15);
// 5 = city
let expr16 = lit(5).eq(col("city"));
filters.push(expr16);
println!(" --------------- Filters: {:#?}", filters);
// Expected pushdown predicates: [state = CA, price > 10, a < 10, b >= 50, f <= 60, city = Boston, city != Braintree, 5 = city]
let predicate = PredicateBuilder::default()
.add_pushdown_exprs(&filters)
.build();
println!(" ------------- Predicates: {:#?}", predicate);
assert_eq!(predicate.exprs.len(), 8);
assert_eq!(predicate.exprs[0], col("state").eq(lit("CA")));
assert_eq!(predicate.exprs[1], col("price").gt(lit(10)));
assert_eq!(predicate.exprs[2], col("a").lt(lit(10)));
assert_eq!(predicate.exprs[3], col("b").gt_eq(lit(50)));
assert_eq!(predicate.exprs[4], col("f").lt_eq(lit(60)));
assert_eq!(predicate.exprs[5], col("city").eq(lit("Boston")));
assert_eq!(predicate.exprs[6], col("city").not_eq(lit("Braintree")));
assert_eq!(predicate.exprs[7], lit(5).eq(col("city")));
}
#[test]
fn predicate_display_ts() {
// TODO make this a doc example?
let p = PredicateBuilder::new().timestamp_range(1, 100).build();
assert_eq!(p.to_string(), "Predicate range: [1 - 100]");
}
#[test]
fn predicate_display_ts_and_expr() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)).and(col("bar").lt(lit(11))))
.build();
assert_eq!(
p.to_string(),
"Predicate range: [1 - 100] exprs: [#foo = Int32(42) AND #bar < Int32(11)]"
);
}
#[test]
fn predicate_display_full() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)))
.field_columns(vec!["f1", "f2"])
.partition_key("the_key")
.build();
assert_eq!(p.to_string(), "Predicate field_columns: {f1, f2} partition_key: 'the_key' range: [1 - 100] exprs: [#foo = Int32(42)]");
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range_low() {
let p = PredicateBuilder::new()
.timestamp_range(MIN_NANO_TIME, 100)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range_high() {
let p = PredicateBuilder::new()
.timestamp_range(0, MAX_NANO_TIME)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_in_range() {
let p = PredicateBuilder::new()
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = PredicateBuilder::new()
.add_expr(col("foo").eq(lit(42)))
.build();
// rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
}

View File

@ -1,648 +0,0 @@
//! This module contains a unified Predicate structure for IOx queries
//! that can select and filter Fields and Tags from the InfluxDB data
//! model as well as for arbitrary other predicates that are expressed
//! by DataFusion's `Expr` type.
use std::{
collections::{BTreeSet, HashSet},
fmt,
};
use data_types::timestamp::{TimestampRange, MAX_NANO_TIME, MIN_NANO_TIME};
use datafusion::{
error::DataFusionError,
logical_plan::{col, lit_timestamp_nano, Column, Expr, Operator},
optimizer::utils,
};
use datafusion_util::{make_range_expr, AndExprBuilder};
use observability_deps::tracing::debug;
use schema::TIME_COLUMN_NAME;
/// This `Predicate` represents the empty predicate (aka that
/// evaluates to true for all rows).
pub const EMPTY_PREDICATE: Predicate = Predicate {
field_columns: None,
exprs: vec![],
range: None,
partition_key: None,
value_expr: vec![],
};
#[derive(Debug, Clone, Copy)]
/// The result of evaluating a predicate on a set of rows
pub enum PredicateMatch {
/// There is at least one row that matches the predicate that has
/// at least one non null value in each field of the predicate
AtLeastOneNonNullField,
/// There are exactly zero rows that match the predicate
Zero,
/// There *may* be rows that match, OR there *may* be no rows that
/// match
Unknown,
}
/// Represents a parsed predicate for evaluation by the InfluxDB IOx
/// query engine.
///
/// Note that the InfluxDB data model (e.g. ParsedLine's)
/// distinguishes between some types of columns (tags and fields), and
/// likewise the semantics of this structure can express some types of
/// restrictions that only apply to certain types of columns.
#[derive(Clone, Debug, Default, PartialEq, PartialOrd)]
pub struct Predicate {
/// Optional field restriction. If present, restricts the results to only
/// tables which have *at least one* of the fields in field_columns.
pub field_columns: Option<BTreeSet<String>>,
/// Optional partition key filter
pub partition_key: Option<String>,
/// Optional timestamp range: only rows within this range are included in
/// results. Other rows are excluded
pub range: Option<TimestampRange>,
/// Optional arbitrary predicates, represented as a list of
/// DataFusion expressions applied as a logical conjunction (aka they
/// are 'AND'ed together). Only rows that evaluate to TRUE for all
/// these expressions should be returned. Other rows are excluded
/// from the results.
pub exprs: Vec<Expr>,
/// Optional arbitrary predicates on the special `_value` column. These
/// expressions are applied to `field_columns` projections in the form of
/// `CASE` statement conditions.
pub value_expr: Vec<BinaryExpr>,
}
impl Predicate {
/// Return true if this predicate has any general purpose predicates
pub fn has_exprs(&self) -> bool {
!self.exprs.is_empty()
}
/// Return a DataFusion `Expr` predicate representing the
/// combination of all predicate (`exprs`) and timestamp
/// restriction in this Predicate. Returns None if there are no
/// `Expr`'s restricting the data
pub fn filter_expr(&self) -> Option<Expr> {
let mut builder =
AndExprBuilder::default().append_opt(self.make_timestamp_predicate_expr());
for expr in &self.exprs {
builder = builder.append_expr(expr.clone());
}
builder.build()
}
/// Return true if the field should be included in results
pub fn should_include_field(&self, field_name: &str) -> bool {
match &self.field_columns {
None => true, // No field restriction on predicate
Some(field_names) => field_names.contains(field_name),
}
}
/// Creates a DataFusion predicate for applying a timestamp range:
///
/// `range.start <= time and time < range.end`
fn make_timestamp_predicate_expr(&self) -> Option<Expr> {
self.range
.map(|range| make_range_expr(range.start(), range.end(), TIME_COLUMN_NAME))
}
/// Returns true if this predicate evaluates to true for all rows
pub fn is_empty(&self) -> bool {
self == &EMPTY_PREDICATE
}
/// Return a negated DF logical expression for the given delete predicates
pub fn negated_expr<S>(delete_predicates: &[S]) -> Option<Expr>
where
S: AsRef<Self>,
{
if delete_predicates.is_empty() {
return None;
}
let mut pred = PredicateBuilder::default().build();
pred.merge_delete_predicates(delete_predicates);
// Make a conjunctive expression of the pred.exprs
let mut val = None;
for e in pred.exprs {
match val {
None => val = Some(e),
Some(expr) => val = Some(expr.and(e)),
}
}
val
}
/// Merge the given delete predicates into this select predicate.
/// Since we want to eliminate data filtered by the delete predicates,
/// they are first converted into their negated form: NOT(delete_predicate)
/// and then merged into this selection predicate
pub fn merge_delete_predicates<S>(&mut self, delete_predicates: &[S])
where
S: AsRef<Self>,
{
// Create a list of disjunctive negated expressions.
// Example: suppose there are two deletes as follows (note that the time_range is stored separately in the
// Predicate, but we need to put it together with the exprs here)
// . Delete_1: WHERE city != "Boston" AND temp = 70 AND time_range in [10, 30]
// . Delete_2: WHERE state = "NY" AND route != "I90" AND time_range in [20, 50]
// The negated list will be NOT(Delete_1), NOT(Delete_2), which means
// NOT(city != "Boston" AND temp = 70 AND time_range in [10, 30]), NOT(state = "NY" AND route != "I90" AND time_range in [20, 50]), which means
// [NOT(city != "Boston") OR NOT(temp = 70) OR NOT(time_range in [10, 30])], [NOT(state = "NY") OR NOT(route != "I90") OR NOT(time_range in [20, 50])]
// Note that "NOT(time_range in [20, 50])", i.e. "NOT(20 <= time <= 50)", is replaced with "time < 20 OR time > 50"
for pred in delete_predicates {
let pred = pred.as_ref();
let mut expr: Option<Expr> = None;
// Time range
if let Some(range) = pred.range {
// time_expr = NOT(start <= time_range <= end)
// Equivalent to: (time < start OR time > end)
let time_expr = col(TIME_COLUMN_NAME)
.lt(lit_timestamp_nano(range.start()))
.or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end())));
match expr {
None => expr = Some(time_expr),
Some(e) => expr = Some(e.or(time_expr)),
}
}
// Exprs
for exp in &pred.exprs {
match expr {
None => expr = Some(exp.clone().not()),
Some(e) => expr = Some(e.or(exp.clone().not())),
}
}
// Push the negated expression of the delete predicate into the select predicate's list of exprs
if let Some(e) = expr {
self.exprs.push(e);
}
}
}
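// Example (sketch): merging a single delete predicate into a select
// predicate. The delete `WHERE city != "Boston" AND 10 <= time <= 30` is
// negated and appended to `self.exprs`, so the select keeps only rows NOT
// matched by the delete. `Arc<Predicate>` is used because it implements
// `AsRef<Predicate>`.
//
//     use std::sync::Arc;
//
//     let delete = Arc::new(
//         PredicateBuilder::new()
//             .timestamp_range(10, 30)
//             .add_expr(col("city").not_eq(lit("Boston")))
//             .build(),
//     );
//
//     let mut select = PredicateBuilder::new().build();
//     select.merge_delete_predicates(&[delete]);
//     // select.exprs now holds one disjunction, roughly:
//     //   time < 10 OR time > 30 OR NOT(city != "Boston")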
/// Removes the timestamp range from this predicate, if the range
/// is for the entire min/max valid range.
///
/// This is used in certain cases to retain compatibility with the
/// existing storage engine
pub(crate) fn clear_timestamp_if_max_range(mut self) -> Self {
self.range = self.range.take().and_then(|range| {
if range.start() <= MIN_NANO_TIME && range.end() >= MAX_NANO_TIME {
None
} else {
Some(range)
}
});
self
}
}
impl fmt::Display for Predicate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn iter_to_str<S>(s: impl IntoIterator<Item = S>) -> String
where
S: ToString,
{
s.into_iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", ")
}
write!(f, "Predicate")?;
if let Some(field_columns) = &self.field_columns {
write!(f, " field_columns: {{{}}}", iter_to_str(field_columns))?;
}
if let Some(partition_key) = &self.partition_key {
write!(f, " partition_key: '{}'", partition_key)?;
}
if let Some(range) = &self.range {
// TODO: could be nice to show this as actual timestamps (not just numbers)?
write!(f, " range: [{} - {}]", range.start(), range.end())?;
}
if !self.exprs.is_empty() {
write!(f, " exprs: [")?;
for (i, expr) in self.exprs.iter().enumerate() {
write!(f, "{}", expr)?;
if i < self.exprs.len() - 1 {
write!(f, ", ")?;
}
}
write!(f, "]")?;
}
Ok(())
}
}
#[derive(Debug, Default)]
/// Structure for building [`Predicate`]s
///
/// Example:
/// ```
/// use predicate::predicate::PredicateBuilder;
/// use datafusion::logical_plan::{col, lit};
///
/// let p = PredicateBuilder::new()
/// .timestamp_range(1, 100)
/// .add_expr(col("foo").eq(lit(42)))
/// .build();
///
/// assert_eq!(
/// p.to_string(),
/// "Predicate range: [1 - 100] exprs: [#foo = Int32(42)]"
/// );
/// ```
pub struct PredicateBuilder {
inner: Predicate,
}
impl From<Predicate> for PredicateBuilder {
fn from(inner: Predicate) -> Self {
Self { inner }
}
}
impl PredicateBuilder {
pub fn new() -> Self {
Self::default()
}
/// Sets the timestamp range
pub fn timestamp_range(mut self, start: i64, end: i64) -> Self {
// Without more thought, redefining the timestamp range would
// lose the old range. Assert that this cannot happen.
assert!(
self.inner.range.is_none(),
"Unexpected re-definition of timestamp range"
);
self.inner.range = Some(TimestampRange::new(start, end));
self
}
/// Sets the optional timestamp range, if any
pub fn timestamp_range_option(mut self, range: Option<TimestampRange>) -> Self {
// Without more thought, redefining the timestamp range would
// lose the old range. Assert that this cannot happen.
assert!(
range.is_none() || self.inner.range.is_none(),
"Unexpected re-definition of timestamp range"
);
self.inner.range = range;
self
}
/// Adds an expression to the list of general purpose predicates
pub fn add_expr(mut self, expr: Expr) -> Self {
self.inner.exprs.push(expr);
self
}
/// Builds a regex matching expression from the provided column name and
/// pattern. Values not matching the regex will be filtered out.
pub fn build_regex_match_expr(self, column: &str, pattern: impl Into<String>) -> Self {
self.regex_match_expr(column, pattern, true)
}
/// Builds a regex "not matching" expression from the provided column name
/// and pattern. Values *matching* the regex will be filtered out.
pub fn build_regex_not_match_expr(self, column: &str, pattern: impl Into<String>) -> Self {
self.regex_match_expr(column, pattern, false)
}
fn regex_match_expr(mut self, column: &str, pattern: impl Into<String>, matches: bool) -> Self {
let expr = crate::regex::regex_match_expr(col(column), pattern.into(), matches);
self.inner.exprs.push(expr);
self
}
/// Sets the field_columns restriction
pub fn field_columns(mut self, columns: Vec<impl Into<String>>) -> Self {
// We need to distinguish predicates like `column_name In
// (foo, bar)` and `column_name = foo and column_name = bar` in order to handle
// this
if self.inner.field_columns.is_some() {
unimplemented!("Complex/Multi field predicates are not yet supported");
}
let column_names = columns
.into_iter()
.map(|s| s.into())
.collect::<BTreeSet<_>>();
self.inner.field_columns = Some(column_names);
self
}
/// Set the partition key restriction
pub fn partition_key(mut self, partition_key: impl Into<String>) -> Self {
assert!(
self.inner.partition_key.is_none(),
"multiple partition key predicates not suported"
);
self.inner.partition_key = Some(partition_key.into());
self
}
/// Create a predicate, consuming this builder
pub fn build(self) -> Predicate {
self.inner
}
/// Adds only the expressions from `filters` that can be pushed down to
/// execution engines.
pub fn add_pushdown_exprs(mut self, filters: &[Expr]) -> Self {
// For each expression of the filters, recursively split it if it is an AND conjunction
// For example, expression (x AND y) will be split into a vector of 2 expressions [x, y]
let mut exprs = vec![];
filters
.iter()
.for_each(|expr| Self::split_members(expr, &mut exprs));
// Only keep single-column, primitive binary expressions
let mut pushdown_exprs: Vec<Expr> = vec![];
let exprs_result = exprs
.into_iter()
.try_for_each::<_, Result<_, DataFusionError>>(|expr| {
let mut columns = HashSet::new();
utils::expr_to_columns(&expr, &mut columns)?;
if columns.len() == 1 && Self::primitive_binary_expr(&expr) {
pushdown_exprs.push(expr);
}
Ok(())
});
match exprs_result {
Ok(()) => {
// Return the builder with only the pushdownable expressions on it.
self.inner.exprs.append(&mut pushdown_exprs);
}
Err(e) => {
debug!("Error, {}, building push-down predicates for filters: {:#?}. No predicates are pushed down", e, filters);
}
}
self
}
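// Example (sketch): only conjuncts of the form `column op constant` survive
// the push-down filter; anything referencing two columns is dropped.
//
//     let filters = vec![col("state").eq(lit("CA")).and(col("a").lt(col("b")))];
//     let p = PredicateBuilder::default()
//         .add_pushdown_exprs(&filters)
//         .build();
//     // p.exprs == [ state = CA ]   ("a < b" touches two columns and is skipped)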
/// Recursively split all "AND" expressions into smaller ones
/// Example: "A AND B AND C" => [A, B, C]
pub fn split_members(predicate: &Expr, predicates: &mut Vec<Expr>) {
match predicate {
Expr::BinaryExpr {
right,
op: Operator::And,
left,
} => {
Self::split_members(left, predicates);
Self::split_members(right, predicates);
}
other => predicates.push(other.clone()),
}
}
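// Example (sketch): `split_members` flattens nested conjunctions but leaves
// disjunctions and other expressions untouched.
//
//     let mut parts = vec![];
//     PredicateBuilder::split_members(
//         &col("a").eq(lit(1)).and(col("b").gt(lit(2))),
//         &mut parts,
//     );
//     // parts == [ a = 1, b > 2 ]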
/// Return true if the given expression is a primitive binary expression of the form `column op constant`,
// where op must be a comparison operator
pub fn primitive_binary_expr(expr: &Expr) -> bool {
match expr {
Expr::BinaryExpr { left, op, right } => {
matches!(
(&**left, &**right),
(Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_))
) && matches!(
op,
Operator::Eq
| Operator::NotEq
| Operator::Lt
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
)
}
_ => false,
}
}
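// Example (sketch): `column op constant` (in either order) passes, anything
// else is rejected.
//
//     assert!(PredicateBuilder::primitive_binary_expr(&col("f").lt_eq(lit(60))));
//     assert!(PredicateBuilder::primitive_binary_expr(&lit(5).eq(col("city"))));
//     assert!(!PredicateBuilder::primitive_binary_expr(&col("city").eq(col("state"))));
//     assert!(!PredicateBuilder::primitive_binary_expr(&col("e").is_null()));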
}
// A representation of the `BinaryExpr` variant of a DataFusion expression.
#[derive(Clone, Debug, PartialEq, PartialOrd)]
pub struct BinaryExpr {
pub left: Column,
pub op: Operator,
pub right: Expr,
}
#[cfg(test)]
mod tests {
use super::*;
use datafusion::logical_plan::{col, lit};
#[test]
fn test_default_predicate_is_empty() {
let p = Predicate::default();
assert!(p.is_empty());
}
#[test]
fn test_non_default_predicate_is_not_empty() {
let p = PredicateBuilder::new().timestamp_range(1, 100).build();
assert!(!p.is_empty());
}
#[test]
fn test_pushdown_predicates() {
let mut filters = vec![];
// state = CA
let expr1 = col("state").eq(lit("CA"));
filters.push(expr1);
// "price > 10"
let expr2 = col("price").gt(lit(10));
filters.push(expr2);
// a < 10 AND b >= 50 --> will be split to [a < 10, b >= 50]
let expr3 = col("a").lt(lit(10)).and(col("b").gt_eq(lit(50)));
filters.push(expr3);
// c != 3 OR d = 8 --> won't be pushed down
let expr4 = col("c").not_eq(lit(3)).or(col("d").eq(lit(8)));
filters.push(expr4);
// e is null --> won't be pushed down
let expr5 = col("e").is_null();
filters.push(expr5);
// f <= 60
let expr6 = col("f").lt_eq(lit(60));
filters.push(expr6);
// g is not null --> won't be pushed down
let expr7 = col("g").is_not_null();
filters.push(expr7);
// h + i --> won't be pushed down
let expr8 = col("h") + col("i");
filters.push(expr8);
// city = Boston
let expr9 = col("city").eq(lit("Boston"));
filters.push(expr9);
// city != Braintree
let expr9 = col("city").not_eq(lit("Braintree"));
filters.push(expr9);
// city != state --> won't be pushed down
let expr10 = col("city").not_eq(col("state"));
filters.push(expr10);
// city = state --> won't be pushed down
let expr11 = col("city").eq(col("state"));
filters.push(expr11);
// city_state = city + state --> won't be pushed down
let expr12 = col("city_state").eq(col("city") + col("state"));
filters.push(expr12);
// city = city + 5 --> won't be pushed down
let expr13 = col("city").eq(col("city") + lit(5));
filters.push(expr13);
// city = city --> won't be pushed down
let expr14 = col("city").eq(col("city"));
filters.push(expr14);
// city + 5 = city --> won't be pushed down
let expr15 = (col("city") + lit(5)).eq(col("city"));
filters.push(expr15);
// 5 = city
let expr16 = lit(5).eq(col("city"));
filters.push(expr16);
println!(" --------------- Filters: {:#?}", filters);
// Expected pushdown predicates: [state = CA, price > 10, a < 10, b >= 50, f <= 60, city = Boston, city != Braintree, 5 = city]
let predicate = PredicateBuilder::default()
.add_pushdown_exprs(&filters)
.build();
println!(" ------------- Predicates: {:#?}", predicate);
assert_eq!(predicate.exprs.len(), 8);
assert_eq!(predicate.exprs[0], col("state").eq(lit("CA")));
assert_eq!(predicate.exprs[1], col("price").gt(lit(10)));
assert_eq!(predicate.exprs[2], col("a").lt(lit(10)));
assert_eq!(predicate.exprs[3], col("b").gt_eq(lit(50)));
assert_eq!(predicate.exprs[4], col("f").lt_eq(lit(60)));
assert_eq!(predicate.exprs[5], col("city").eq(lit("Boston")));
assert_eq!(predicate.exprs[6], col("city").not_eq(lit("Braintree")));
assert_eq!(predicate.exprs[7], lit(5).eq(col("city")));
}
#[test]
fn predicate_display_ts() {
// TODO make this a doc example?
let p = PredicateBuilder::new().timestamp_range(1, 100).build();
assert_eq!(p.to_string(), "Predicate range: [1 - 100]");
}
#[test]
fn predicate_display_ts_and_expr() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)).and(col("bar").lt(lit(11))))
.build();
assert_eq!(
p.to_string(),
"Predicate range: [1 - 100] exprs: [#foo = Int32(42) AND #bar < Int32(11)]"
);
}
#[test]
fn predicate_display_full() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)))
.field_columns(vec!["f1", "f2"])
.partition_key("the_key")
.build();
assert_eq!(p.to_string(), "Predicate field_columns: {f1, f2} partition_key: 'the_key' range: [1 - 100] exprs: [#foo = Int32(42)]");
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range() {
let p = PredicateBuilder::new()
.timestamp_range(1, 100)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range_low() {
let p = PredicateBuilder::new()
.timestamp_range(MIN_NANO_TIME, 100)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_out_of_range_high() {
let p = PredicateBuilder::new()
.timestamp_range(0, MAX_NANO_TIME)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = p.clone();
// no rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
#[test]
fn test_clear_timestamp_if_max_range_in_range() {
let p = PredicateBuilder::new()
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME)
.add_expr(col("foo").eq(lit(42)))
.build();
let expected = PredicateBuilder::new()
.add_expr(col("foo").eq(lit(42)))
.build();
// rewrite
assert_eq!(p.clear_timestamp_if_max_range(), expected);
}
}

View File

@ -1,7 +1,6 @@
//! Interface logic between IOx [`Predicate`] and predicates used by the
//! InfluxDB Storage gRPC API
use crate::predicate::{BinaryExpr, Predicate};
use crate::rewrite;
use crate::{rewrite, BinaryExpr, Predicate};
use datafusion::error::Result as DataFusionResult;
use datafusion::execution::context::ExecutionProps;

View File

@ -18,8 +18,8 @@ use datafusion_util::AsExpr;
use hashbrown::HashSet;
use observability_deps::tracing::{debug, trace};
use predicate::predicate::{BinaryExpr, Predicate, PredicateMatch};
use predicate::rpc_predicate::{InfluxRpcPredicate, FIELD_COLUMN_NAME, MEASUREMENT_COLUMN_NAME};
use predicate::{BinaryExpr, Predicate, PredicateMatch};
use schema::selection::Selection;
use schema::{InfluxColumnType, Schema, TIME_COLUMN_NAME};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
@ -219,6 +219,9 @@ impl InfluxRpcPlanner {
{
debug!(?rpc_predicate, "planning table_names");
// Special case predicates that span the entire valid timestamp range
let rpc_predicate = rpc_predicate.clear_timestamp_if_max_range();
let mut builder = StringSetPlanBuilder::new();
// Mapping between table and chunks that need full plan
@ -617,6 +620,9 @@ impl InfluxRpcPlanner {
{
debug!(?rpc_predicate, "planning field_columns");
// Special case predicates that span the entire valid timestamp range
let rpc_predicate = rpc_predicate.clear_timestamp_if_max_range();
// Algorithm is to run a "select field_cols from table where
// <predicate> type plan for each table in the chunks"
//
@ -1834,7 +1840,7 @@ impl<'a> ExprRewriter for MissingColumnsToNull<'a> {
#[cfg(test)]
mod tests {
use datafusion::logical_plan::lit;
use predicate::predicate::PredicateBuilder;
use predicate::PredicateBuilder;
use schema::builder::SchemaBuilder;
use crate::{
@ -1953,7 +1959,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_rewrite_table_names() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
InfluxRpcPlanner::new()
.table_names(test_db, rpc_predicate)
.expect("creating plan");
@ -1963,7 +1969,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_rewrite_tag_keys() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
InfluxRpcPlanner::new()
.tag_keys(test_db, rpc_predicate)
.expect("creating plan");
@ -1973,7 +1979,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_rewrite_tag_values() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
InfluxRpcPlanner::new()
.tag_values(test_db, "foo", rpc_predicate)
.expect("creating plan");
@ -1983,7 +1989,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_rewrite_field_columns() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
InfluxRpcPlanner::new()
.field_columns(test_db, rpc_predicate)
.expect("creating plan");
@ -1993,7 +1999,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_rewrite_read_filter() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
InfluxRpcPlanner::new()
.read_filter(test_db, rpc_predicate)
.expect("creating plan");
@ -2003,7 +2009,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_read_group() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
let agg = Aggregate::None;
let group_columns = &["foo"];
InfluxRpcPlanner::new()
@ -2015,7 +2021,7 @@ mod tests {
#[tokio::test]
async fn test_predicate_read_window_aggregate() {
run_test::<_, TestDatabase>(|test_db, rpc_predicate| {
run_test(&|test_db, rpc_predicate| {
let agg = Aggregate::First;
let every = WindowDuration::from_months(1, false);
let offset = WindowDuration::from_months(1, false);
@ -2026,17 +2032,15 @@ mod tests {
.await
}
/// Runs func() and checks that predicates are simplified prior to sending them off
async fn run_test<F, D>(f: F)
where
F: FnOnce(&TestDatabase, InfluxRpcPredicate) + Send,
{
let chunk0 = Arc::new(
TestChunk::new("h2o")
.with_id(0)
.with_tag_column("foo")
.with_time_column(),
);
/// Given a `TestDatabase`, plans an InfluxRPC query
/// (e.g. read_filter, read_window_aggregate, etc). The test below
/// ensures that predicates are simplified during query planning.
type PlanRPCFunc = dyn Fn(&TestDatabase, InfluxRpcPredicate) + Send + Sync;
/// Runs func() and checks that predicates are simplified prior to
/// sending them down to the chunks for processing.
async fn run_test(func: &'static PlanRPCFunc) {
// ------------- Test 1 ----------------
// this is what happens with a grpc predicate on a tag
//
@ -2053,22 +2057,74 @@ mod tests {
.add_expr(expr.eq(lit("bar")))
.build();
// verify that the predicate was rewritten to `foo = 'bar'`
let expr = col("foo").eq(lit("bar"));
let expected_predicate = PredicateBuilder::new().add_expr(expr).build();
run_test_with_predicate(&func, silly_predicate, expected_predicate).await;
// ------------- Test 2 ----------------
// Validate that _measurement predicates are translated
//
// https://github.com/influxdata/influxdb_iox/issues/3601
// _measurement = 'foo'
let silly_predicate = PredicateBuilder::new()
.add_expr(col("_measurement").eq(lit("foo")))
.build();
// verify that the predicate was rewritten to `false` as the
// measurement name is `h2o`
let expr = lit(false);
let expected_predicate = PredicateBuilder::new().add_expr(expr).build();
run_test_with_predicate(&func, silly_predicate, expected_predicate).await;
// ------------- Test 3 ----------------
// more complicated _measurement predicates are translated
//
// https://github.com/influxdata/influxdb_iox/issues/3601
// (_measurement = 'foo' or _measurement = 'h2o') AND time > 5
let silly_predicate = PredicateBuilder::new()
.add_expr(
col("_measurement")
.eq(lit("foo"))
.or(col("_measurement").eq(lit("h2o")))
.and(col("time").gt(lit(5))),
)
.build();
// verify that the predicate was rewritten to time > 5
let expr = col("time").gt(lit(5));
let expected_predicate = PredicateBuilder::new().add_expr(expr).build();
run_test_with_predicate(&func, silly_predicate, expected_predicate).await;
}
/// Runs func() with the specified predicate and verifies
/// `expected_predicate` is received by the chunk
async fn run_test_with_predicate(
func: &PlanRPCFunc,
predicate: Predicate,
expected_predicate: Predicate,
) {
let chunk0 = Arc::new(
TestChunk::new("h2o")
.with_id(0)
.with_tag_column("foo")
.with_time_column(),
);
let executor = Arc::new(Executor::new(1));
let test_db = TestDatabase::new(Arc::clone(&executor));
test_db.add_chunk("my_partition_key", Arc::clone(&chunk0));
let rpc_predicate = InfluxRpcPredicate::new(None, silly_predicate);
let rpc_predicate = InfluxRpcPredicate::new(None, predicate);
// run the function
f(&test_db, rpc_predicate);
func(&test_db, rpc_predicate);
let actual_predicate = test_db.get_chunks_predicate();
// verify that the predicate was rewritten to `foo = 'bar'`
let expr = col("foo").eq(lit("bar"));
let expected_predicate = PredicateBuilder::new().add_expr(expr).build();
assert_eq!(
actual_predicate, expected_predicate,
"\nActual: {:?}\nExpected: {:?}",

View File

@ -16,10 +16,7 @@ use data_types::{
use datafusion::physical_plan::SendableRecordBatchStream;
use exec::stringset::StringSet;
use observability_deps::tracing::{debug, trace};
use predicate::{
predicate::{Predicate, PredicateMatch},
rpc_predicate::QueryDatabaseMeta,
};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate, PredicateMatch};
use schema::selection::Selection;
use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME};
@ -109,6 +106,11 @@ impl<'a> Drop for QueryCompletedToken<'a> {
}
}
/// Boxed description of a query that knows how to render to a string
///
/// This avoids storing potentially large strings
pub type QueryText = Box<dyn std::fmt::Display + Send + Sync>;
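// Example (sketch): any cheap `Display` wrapper can serve as a `QueryText`,
// so the full query string is only rendered when actually formatted.
// `LinePreview` is a hypothetical illustration, not a type in this codebase.
//
//     use std::sync::Arc;
//
//     struct LinePreview(Arc<str>);
//
//     impl std::fmt::Display for LinePreview {
//         fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
//             write!(f, "{}", self.0.lines().next().unwrap_or(""))
//         }
//     }
//
//     let text: QueryText = Box::new(LinePreview(Arc::from("SELECT 1;\n-- more")));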
/// A `Database` is the main trait implemented by the IOx subsystems
/// that store actual data.
///
@ -132,7 +134,7 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
fn record_query(
&self,
query_type: impl Into<String>,
query_text: impl Into<String>,
query_text: QueryText,
) -> QueryCompletedToken<'_>;
}

View File

@ -18,7 +18,7 @@ use datafusion::{
},
};
use observability_deps::tracing::{debug, trace};
use predicate::predicate::{Predicate, PredicateBuilder};
use predicate::{Predicate, PredicateBuilder};
use schema::{merge::SchemaMerger, sort::SortKey, Schema};
use crate::{

View File

@ -16,7 +16,7 @@ use schema::selection::Selection;
use schema::Schema;
use crate::QueryChunk;
use predicate::predicate::Predicate;
use predicate::Predicate;
use async_trait::async_trait;

View File

@ -13,7 +13,7 @@ use datafusion::{
physical_optimizer::pruning::{PruningPredicate, PruningStatistics},
};
use observability_deps::tracing::{debug, trace};
use predicate::predicate::Predicate;
use predicate::Predicate;
use schema::Schema;
use crate::{group_by::Aggregate, QueryChunkMeta};
@ -228,7 +228,7 @@ mod test {
use std::{cell::RefCell, sync::Arc};
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::PredicateBuilder;
use schema::merge::SchemaMerger;
use crate::{test::TestChunk, QueryChunk};

View File

@ -4,11 +4,11 @@
//! AKA it is a Mock
use crate::exec::{ExecutionContextProvider, Executor, ExecutorType, IOxExecutionContext};
use crate::QueryCompletedToken;
use crate::{
exec::stringset::{StringSet, StringSetRef},
Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryDatabase,
};
use crate::{QueryCompletedToken, QueryText};
use arrow::array::UInt64Array;
use arrow::{
array::{ArrayRef, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray},
@ -155,7 +155,7 @@ impl QueryDatabase for TestDatabase {
fn record_query(
&self,
_query_type: impl Into<String>,
_query_text: impl Into<String>,
_query_text: QueryText,
) -> QueryCompletedToken<'_> {
QueryCompletedToken::new(|| {})
}

View File

@ -1,7 +1,8 @@
use arrow::datatypes::DataType;
use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME};
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::fieldlist::{Field, FieldList},
frontend::influxrpc::InfluxRpcPlanner,
@ -216,3 +217,64 @@ async fn test_field_name_plan_with_delete() {
)
.await;
}
#[tokio::test]
async fn list_field_columns_max_time() {
let predicate = PredicateBuilder::default()
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME)
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_fields = FieldList {
fields: vec![Field {
name: "value".into(),
data_type: DataType::Float64,
last_timestamp: MAX_NANO_TIME,
}],
};
run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await;
}
#[tokio::test]
async fn list_field_columns_max_i64() {
let predicate = PredicateBuilder::default()
.timestamp_range(i64::MIN, i64::MAX)
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_fields = FieldList {
fields: vec![Field {
name: "value".into(),
data_type: DataType::Float64,
last_timestamp: MAX_NANO_TIME,
}],
};
run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await;
}
#[tokio::test]
async fn list_field_columns_max_time_less_one() {
let predicate = PredicateBuilder::default()
// one less than max timestamp
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME - 1)
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_fields = FieldList { fields: vec![] };
run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await;
}
#[tokio::test]
async fn list_field_columns_max_time_greater_one() {
let predicate = PredicateBuilder::default()
.timestamp_range(MIN_NANO_TIME + 1, MAX_NANO_TIME)
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_fields = FieldList { fields: vec![] };
run_field_columns_test_case(MeasurementWithMaxTime {}, predicate, expected_fields).await;
}

View File

@ -13,8 +13,8 @@ use crate::{
},
};
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::frontend::influxrpc::InfluxRpcPlanner;
/// runs read_filter(predicate) and compares it to the expected

View File

@ -14,8 +14,8 @@ use datafusion::{
logical_plan::{binary_expr, Operator},
prelude::*,
};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{frontend::influxrpc::InfluxRpcPlanner, group_by::Aggregate};
/// runs read_group(predicate) and compares it to the expected
@ -360,8 +360,6 @@ async fn test_grouped_series_set_plan_count_measurement_pred() {
.await;
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
#[tokio::test]
async fn test_grouped_series_set_plan_first() {
let predicate = PredicateBuilder::default()
@ -486,8 +484,6 @@ async fn test_grouped_series_set_plan_last_with_nulls() {
.await;
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
#[tokio::test]
async fn test_grouped_series_set_plan_min() {
let predicate = PredicateBuilder::default()

View File

@ -10,8 +10,8 @@ use async_trait::async_trait;
use data_types::{delete_predicate::DeletePredicate, timestamp::TimestampRange};
use datafusion::prelude::*;
use db::{test_helpers::write_lp, utils::make_db};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
frontend::influxrpc::InfluxRpcPlanner,
group_by::{Aggregate, WindowDuration},
@ -108,8 +108,6 @@ impl DbSetup for MeasurementForWindowAggregate {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
#[tokio::test]
async fn test_read_window_aggregate_nanoseconds() {
let predicate = PredicateBuilder::default()
@ -256,8 +254,6 @@ impl DbSetup for MeasurementForWindowAggregateMonths {
}
}
// NGA todo: add delete DbSetup
#[tokio::test]
async fn test_read_window_aggregate_months() {
let agg = Aggregate::Mean;

View File

@ -1,7 +1,8 @@
//! Tests for the Influx gRPC queries
use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME};
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRpcPlanner,
@ -207,6 +208,48 @@ async fn list_table_names_data_pred_250_300_with_delete_all() {
run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(250, 300), vec![]).await;
}
#[tokio::test]
async fn list_table_names_max_time() {
run_table_names_test_case(
MeasurementWithMaxTime {},
tsp(MIN_NANO_TIME, MAX_NANO_TIME),
vec!["cpu"],
)
.await;
}
#[tokio::test]
async fn list_table_names_max_i64() {
run_table_names_test_case(
MeasurementWithMaxTime {},
// outside valid timestamp range
tsp(i64::MIN, i64::MAX),
vec!["cpu"],
)
.await;
}
#[tokio::test]
async fn list_table_names_time_less_one() {
run_table_names_test_case(
MeasurementWithMaxTime {},
tsp(MIN_NANO_TIME, MAX_NANO_TIME - 1),
vec![],
)
.await;
}
#[tokio::test]
async fn list_table_names_max_time_greater_one() {
run_table_names_test_case(
MeasurementWithMaxTime {},
// one more than max timestamp
tsp(MIN_NANO_TIME + 1, MAX_NANO_TIME),
vec![],
)
.await;
}
// Note when table names supports general purpose predicates, add a
// test here with a `_measurement` predicate
// https://github.com/influxdata/influxdb_iox/issues/762

View File

@ -1,6 +1,7 @@
use data_types::timestamp::{MAX_NANO_TIME, MIN_NANO_TIME};
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRpcPlanner,
@ -186,7 +187,7 @@ async fn list_tag_name_end_to_end_with_delete() {
async fn list_tag_name_max_time() {
test_helpers::maybe_start_logging();
let predicate = PredicateBuilder::default()
.timestamp_range(-9223372036854775806, 9223372036854775806)
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME)
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_tag_keys = vec!["host"];
@ -209,7 +210,7 @@ async fn list_tag_name_max_i64() {
async fn list_tag_name_max_time_less_one() {
test_helpers::maybe_start_logging();
let predicate = PredicateBuilder::default()
.timestamp_range(-9223372036854775806, 9223372036854775805) // one less than max timestamp
.timestamp_range(MIN_NANO_TIME, MAX_NANO_TIME - 1) // one less than max timestamp
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_tag_keys = vec![];
@ -220,7 +221,7 @@ async fn list_tag_name_max_time_less_one() {
async fn list_tag_name_max_time_greater_one() {
test_helpers::maybe_start_logging();
let predicate = PredicateBuilder::default()
.timestamp_range(-9223372036854775805, 9223372036854775806) // one more than min timestamp
.timestamp_range(MIN_NANO_TIME + 1, MAX_NANO_TIME) // one more than min timestamp
.build();
let predicate = InfluxRpcPredicate::new(None, predicate);
let expected_tag_keys = vec![];

View File

@ -1,6 +1,6 @@
use datafusion::logical_plan::{col, lit};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRpcPlanner,

View File

@ -5,8 +5,8 @@ use db::{
utils::{make_db, TestDb},
};
use metric::{Attributes, Metric, U64Counter};
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::{stringset::StringSet, ExecutionContextProvider},
frontend::{influxrpc::InfluxRpcPlanner, sql::SqlQueryPlanner},

View File

@ -1308,8 +1308,6 @@ impl DbSetup for OneMeasurementForAggs {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
pub struct AnotherMeasurementForAggs {}
#[async_trait]
impl DbSetup for AnotherMeasurementForAggs {
@ -1332,8 +1330,6 @@ impl DbSetup for AnotherMeasurementForAggs {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
pub struct TwoMeasurementForAggs {}
#[async_trait]
impl DbSetup for TwoMeasurementForAggs {
@ -1353,8 +1349,6 @@ impl DbSetup for TwoMeasurementForAggs {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
pub struct MeasurementForSelectors {}
#[async_trait]
impl DbSetup for MeasurementForSelectors {
@ -1408,8 +1402,6 @@ impl DbSetup for MeasurementForMax {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
pub struct MeasurementForGroupKeys {}
#[async_trait]
impl DbSetup for MeasurementForGroupKeys {
@ -1432,8 +1424,6 @@ impl DbSetup for MeasurementForGroupKeys {
}
}
// NGA todo: add delete DbSetup after all scenarios are done for 2 chunks
pub struct MeasurementForGroupByField {}
#[async_trait]
impl DbSetup for MeasurementForGroupByField {

View File

@ -104,7 +104,9 @@ impl VariantWriteBuffer {
write_buffer
.store_operation(0, operation)
.await
.context(WriteFailureSnafu)?;
.map_err(|e| Error::WriteFailure {
source: Box::new(e),
})?;
Ok(())
}

View File

@ -91,8 +91,13 @@ where
if self.cache.get_schema(&namespace).is_none() {
trace!(%namespace, "namespace auto-create cache miss");
match self
let mut txn = self
.catalog
.start_transaction()
.await
.map_err(NamespaceCreationError::Create)?;
match txn
.namespaces()
.create(
namespace.as_str(),
@ -103,6 +108,8 @@ where
.await
{
Ok(_) => {
txn.commit().await.map_err(NamespaceCreationError::Create)?;
debug!(%namespace, "created namespace");
}
Err(iox_catalog::interface::Error::NameExists { .. }) => {
@ -110,9 +117,11 @@ where
// namespace, or another thread raced populating the catalog
// and beat this thread to it.
debug!(%namespace, "spurious namespace create failed");
txn.abort().await.map_err(NamespaceCreationError::Create)?;
}
Err(e) => {
error!(error=%e, %namespace, "failed to auto-create namespace");
txn.abort().await.map_err(NamespaceCreationError::Create)?;
return Err(NamespaceCreationError::Create(e));
}
}
@ -190,15 +199,19 @@ mod tests {
// The cache hit should mean the catalog SHOULD NOT see a create request
// for the namespace.
let mut txn = catalog
.start_transaction()
.await
.expect("failed to start UoW");
assert!(
catalog
.namespaces()
txn.namespaces()
.get_by_name(ns.as_str())
.await
.expect("lookup should not error")
.is_none(),
"expected no request to the catalog"
);
txn.abort().await.expect("failed to abort UoW");
// And the DML handler must be called.
assert_matches!(mock_handler.calls().as_slice(), [MockDmlHandlerCall::Write { namespace, .. }] => {
@ -230,12 +243,17 @@ mod tests {
// The cache miss should mean the catalog MUST see a create request for
// the namespace.
let got = catalog
let mut txn = catalog
.start_transaction()
.await
.expect("failed to start UoW");
let got = txn
.namespaces()
.get_by_name(ns.as_str())
.await
.expect("lookup should not error")
.expect("creation request should be sent to catalog");
txn.abort().await.expect("failed to abort UoW");
assert_eq!(
got,

View File

@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{ops::DerefMut, sync::Arc};
use async_trait::async_trait;
use data_types::{delete_predicate::DeletePredicate, DatabaseName};
@ -135,6 +135,12 @@ where
batches: HashMap<String, MutableBatch>,
span_ctx: Option<SpanContext>,
) -> Result<(), Self::WriteError> {
let mut txn = self
.catalog
.start_transaction()
.await
.map_err(SchemaError::NamespaceLookup)?;
// Load the namespace schema from the cache, falling back to pulling it
// from the global catalog (if it exists).
let schema = self.cache.get_schema(&namespace);
@ -143,7 +149,7 @@ where
None => {
// Pull the schema from the global catalog or error if it does
// not exist.
let schema = get_schema_by_name(&namespace, &*self.catalog)
let schema = get_schema_by_name(&namespace, txn.deref_mut())
.await
.map_err(|e| {
warn!(error=%e, %namespace, "failed to retrieve namespace schema");
@ -162,7 +168,7 @@ where
let maybe_new_schema = validate_or_insert_schema(
batches.iter().map(|(k, v)| (k.as_str(), v)),
&schema,
&*self.catalog,
txn.deref_mut(),
)
.await
.map_err(|e| {
@ -171,6 +177,8 @@ where
})?
.map(Arc::new);
txn.commit().await.map_err(SchemaError::NamespaceLookup)?;
trace!(%namespace, "schema validation complete");
// If the schema has been updated, immediately add it to the cache
@ -246,8 +254,12 @@ mod tests {
/// named [`NAMESPACE`].
async fn create_catalog() -> Arc<dyn Catalog> {
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new());
catalog
.namespaces()
let mut txn = catalog
.start_transaction()
.await
.expect("failed to start UoW");
txn.namespaces()
.create(
NAMESPACE,
"inf",
@ -256,6 +268,8 @@ mod tests {
)
.await
.expect("failed to create test namespace");
txn.commit().await.expect("failed to commit UoW");
catalog
}

View File

@ -10,7 +10,7 @@ async-trait = "0.1"
bytes = "1.0"
chrono = { version = "0.4", default-features = false }
cache_loader_async = { version = "0.1.2", features = ["ttl-cache"] }
crc32fast = "1.3.0"
crc32fast = "1.3.2"
data_types = { path = "../data_types" }
db = { path = "../db" }
futures = "0.3"

View File

@ -511,7 +511,7 @@ impl DatabaseStateCatalogLoaded {
};
let write_buffer_consumer = match rules.write_buffer_connection.as_ref() {
Some(connection) => {
let mut consumer = write_buffer_factory
let consumer = write_buffer_factory
.new_config_read(db_name.as_str(), trace_collector.as_ref(), connection)
.await
.context(CreateWriteBufferSnafu)?;
@ -522,12 +522,14 @@ impl DatabaseStateCatalogLoaded {
self.replay_plan.as_ref().as_ref()
};
db.perform_replay(replay_plan, consumer.as_mut())
let streams = db
.perform_replay(replay_plan, Arc::clone(&consumer))
.await
.context(ReplaySnafu)?;
Some(Arc::new(WriteBufferConsumer::new(
consumer,
streams,
Arc::clone(&db),
shared.application.metric_registry().as_ref(),
)))

View File

@ -5,8 +5,8 @@ use std::io::Read;
// current-thread executor
use db::Db;
use flate2::read::GzDecoder;
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::{Executor, ExecutorType},
frontend::influxrpc::InfluxRpcPlanner,

View File

@ -5,8 +5,8 @@ use std::io::Read;
// current-thread executor
use db::Db;
use flate2::read::GzDecoder;
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::{Executor, ExecutorType},
frontend::influxrpc::InfluxRpcPlanner,

View File

@ -5,8 +5,8 @@ use std::io::Read;
// current-thread executor
use db::Db;
use flate2::read::GzDecoder;
use predicate::predicate::PredicateBuilder;
use predicate::rpc_predicate::InfluxRpcPredicate;
use predicate::PredicateBuilder;
use query::{
exec::{Executor, ExecutorType},
frontend::influxrpc::InfluxRpcPlanner,

View File

@ -19,7 +19,7 @@ base64 = { version = "0.13", features = ["std"] }
bitflags = { version = "1" }
byteorder = { version = "1", features = ["std"] }
bytes = { version = "1", features = ["std"] }
chrono = { version = "0.4", default-features = false, features = ["alloc", "clock", "libc", "std", "winapi"] }
chrono = { version = "0.4", features = ["alloc", "clock", "libc", "oldtime", "serde", "std", "time", "winapi"] }
digest = { version = "0.9", default-features = false, features = ["alloc", "std"] }
either = { version = "1", features = ["use_std"] }
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
@ -52,10 +52,10 @@ sha2 = { version = "0.9", features = ["std"] }
smallvec = { version = "1", default-features = false, features = ["union"] }
tokio = { version = "1", features = ["bytes", "fs", "full", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "parking_lot", "process", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "sync", "time", "tokio-macros", "winapi"] }
tokio-stream = { version = "0.1", features = ["fs", "net", "time"] }
tokio-util = { version = "0.6", features = ["codec", "io"] }
tokio-util = { version = "0.6", features = ["codec", "io", "slab", "time"] }
tower = { version = "0.4", features = ["balance", "buffer", "discover", "futures-util", "indexmap", "limit", "load", "log", "make", "rand", "ready-cache", "slab", "timeout", "tokio", "tokio-stream", "tokio-util", "tracing", "util"] }
tracing = { version = "0.1", features = ["attributes", "log", "max_level_trace", "release_max_level_debug", "std", "tracing-attributes"] }
tracing-core = { version = "0.1", features = ["lazy_static", "std"] }
tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] }
tracing-log = { version = "0.1", features = ["log-tracer", "std", "trace-logger"] }
tracing-subscriber = { version = "0.3", features = ["alloc", "ansi", "ansi_term", "env-filter", "fmt", "json", "lazy_static", "matchers", "regex", "registry", "serde", "serde_json", "sharded-slab", "smallvec", "std", "thread_local", "tracing", "tracing-log", "tracing-serde"] }
uuid = { version = "0.8", features = ["getrandom", "std", "v4"] }
@ -66,6 +66,7 @@ base64 = { version = "0.13", features = ["std"] }
bitflags = { version = "1" }
byteorder = { version = "1", features = ["std"] }
bytes = { version = "1", features = ["std"] }
cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] }
digest = { version = "0.9", default-features = false, features = ["alloc", "std"] }
either = { version = "1", features = ["use_std"] }
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
@ -98,7 +99,6 @@ uuid = { version = "0.8", features = ["getrandom", "std", "v4"] }
libc = { version = "0.2", features = ["extra_traits", "std"] }
[target.x86_64-unknown-linux-gnu.build-dependencies]
cc = { version = "1", default-features = false, features = ["jobserver", "parallel"] }
libc = { version = "0.2", features = ["extra_traits", "std"] }
[target.x86_64-apple-darwin.dependencies]
@ -115,10 +115,10 @@ libc = { version = "0.2", features = ["extra_traits", "std"] }
[target.x86_64-pc-windows-msvc.dependencies]
scopeguard = { version = "1", features = ["use_std"] }
winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] }
winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "profileapi", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] }
[target.x86_64-pc-windows-msvc.build-dependencies]
scopeguard = { version = "1", features = ["use_std"] }
winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] }
winapi = { version = "0.3", default-features = false, features = ["basetsd", "cfg", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "impl-debug", "impl-default", "in6addr", "inaddr", "ioapiset", "knownfolders", "libloaderapi", "lmcons", "minschannel", "minwinbase", "minwindef", "mstcpip", "mswsock", "namedpipeapi", "ntdef", "ntsecapi", "ntstatus", "objbase", "processenv", "profileapi", "schannel", "securitybaseapi", "shellapi", "shlobj", "sspi", "std", "stringapiset", "synchapi", "sysinfoapi", "threadpoollegacyapiset", "timezoneapi", "winbase", "wincon", "wincrypt", "windef", "winerror", "winioctl", "winnt", "winreg", "winsock2", "winuser", "ws2def", "ws2ipdef", "ws2tcpip"] }
### END HAKARI SECTION

View File

@ -76,9 +76,17 @@ impl IoxHeaders {
if name.eq_ignore_ascii_case(HEADER_CONTENT_TYPE) {
content_type = match std::str::from_utf8(value.as_ref()) {
Ok(CONTENT_TYPE_PROTOBUF) => Some(ContentType::Protobuf),
Ok(c) => return Err(format!("Unknown message format: {}", c).into()),
Ok(c) => {
return Err(WriteBufferError::invalid_data(format!(
"Unknown message format: {}",
c
)))
}
Err(e) => {
return Err(format!("Error decoding content type header: {}", e).into())
return Err(WriteBufferError::invalid_data(format!(
"Error decoding content type header: {}",
e
)))
}
};
}
@ -95,7 +103,10 @@ impl IoxHeaders {
span_context = match parser.parse(trace_collector, &headers) {
Ok(ctx) => ctx,
Err(e) => {
return Err(format!("Error decoding trace context: {}", e).into())
return Err(WriteBufferError::invalid_data(format!(
"Error decoding trace context: {}",
e
)))
}
};
}
@ -103,15 +114,20 @@ impl IoxHeaders {
}
if name.eq_ignore_ascii_case(HEADER_NAMESPACE) {
namespace = Some(
String::from_utf8(value.as_ref().to_vec())
.map_err(|e| format!("Error decoding namespace header: {}", e))?,
);
namespace = Some(String::from_utf8(value.as_ref().to_vec()).map_err(|e| {
WriteBufferError::invalid_data(format!(
"Error decoding namespace header: {}",
e
))
})?);
}
}
let content_type =
content_type.ok_or_else(|| WriteBufferError::invalid_data("No content type header"))?;
Ok(Self {
content_type: content_type.ok_or_else(|| "No content type header".to_string())?,
content_type,
span_context,
namespace: namespace.unwrap_or_default(),
})
@ -173,8 +189,12 @@ pub fn decode(
match payload {
Payload::Write(write) => {
let tables = decode_database_batch(&write)
.map_err(|e| format!("failed to decode database batch: {}", e))?;
let tables = decode_database_batch(&write).map_err(|e| {
WriteBufferError::invalid_data(format!(
"failed to decode database batch: {}",
e
))
})?;
Ok(DmlOperation::Write(DmlWrite::new(
headers.namespace,
@ -183,7 +203,11 @@ pub fn decode(
)))
}
Payload::Delete(delete) => {
let predicate = delete.predicate.required("predicate")?;
let predicate = delete
.predicate
.required("predicate")
.map_err(WriteBufferError::invalid_data)?;
Ok(DmlOperation::Delete(DmlDelete::new(
headers.namespace,
predicate,
@ -220,7 +244,8 @@ pub fn encode_operation(
let payload = WriteBufferPayload {
payload: Some(payload),
};
Ok(payload.encode(buf).map_err(Box::new)?)
payload.encode(buf).map_err(WriteBufferError::invalid_input)
}
#[cfg(test)]

View File

@ -142,7 +142,7 @@ impl WriteBufferConfigFactory {
db_name: &str,
trace_collector: Option<&Arc<dyn TraceCollector>>,
cfg: &WriteBufferConnection,
) -> Result<Box<dyn WriteBufferReading>, WriteBufferError> {
) -> Result<Arc<dyn WriteBufferReading>, WriteBufferError> {
let reader = match &cfg.type_[..] {
"file" => {
let root = PathBuf::from(&cfg.connection);
@ -153,7 +153,7 @@ impl WriteBufferConfigFactory {
trace_collector,
)
.await?;
Box::new(file_buffer) as _
Arc::new(file_buffer) as _
}
"kafka" => {
let rskafka_buffer = RSKafkaConsumer::new(
@ -164,17 +164,17 @@ impl WriteBufferConfigFactory {
trace_collector.map(Arc::clone),
)
.await?;
Box::new(rskafka_buffer) as _
Arc::new(rskafka_buffer) as _
}
"mock" => match self.get_mock(&cfg.connection)? {
Mock::Normal(state) => {
let mock_buffer =
MockBufferForReading::new(state, cfg.creation_config.as_ref())?;
Box::new(mock_buffer) as _
Arc::new(mock_buffer) as _
}
Mock::AlwaysFailing => {
let mock_buffer = MockBufferForReadingThatAlwaysErrors {};
Box::new(mock_buffer) as _
Arc::new(mock_buffer) as _
}
},
other => {
@ -267,7 +267,7 @@ mod tests {
.new_config_write(db_name.as_str(), None, &cfg)
.await
.unwrap_err();
assert!(err.to_string().starts_with("Unknown mock ID:"));
assert!(err.to_string().contains("Unknown mock ID:"));
}
#[tokio::test]
@ -302,7 +302,7 @@ mod tests {
.new_config_read(db_name.as_str(), None, &cfg)
.await
.unwrap_err();
assert!(err.to_string().starts_with("Unknown mock ID:"));
assert!(err.to_string().contains("Unknown mock ID:"));
}
#[tokio::test]
@ -335,7 +335,7 @@ mod tests {
.new_config_write(db_name.as_str(), None, &cfg)
.await
.unwrap_err();
assert!(err.to_string().starts_with("Unknown mock ID:"));
assert!(err.to_string().contains("Unknown mock ID:"));
}
#[tokio::test]
@ -368,7 +368,7 @@ mod tests {
.new_config_read(db_name.as_str(), None, &cfg)
.await
.unwrap_err();
assert!(err.to_string().starts_with("Unknown mock ID:"));
assert!(err.to_string().contains("Unknown mock ID:"));
}
#[test]

View File

@ -1,3 +1,5 @@
use std::fmt::{Display, Formatter};
use std::io::Error;
use std::{
collections::{BTreeMap, BTreeSet},
fmt::Debug,
@ -5,12 +7,111 @@ use std::{
use async_trait::async_trait;
use dml::{DmlMeta, DmlOperation, DmlWrite};
use futures::{future::BoxFuture, stream::BoxStream};
use futures::stream::BoxStream;
/// Generic boxed error type that is used in this crate.
///
/// The dynamic boxing makes it easier to deal with error from different implementations.
pub type WriteBufferError = Box<dyn std::error::Error + Sync + Send>;
#[derive(Debug)]
pub struct WriteBufferError {
inner: Box<dyn std::error::Error + Sync + Send>,
kind: WriteBufferErrorKind,
}
impl WriteBufferError {
pub fn new(
kind: WriteBufferErrorKind,
e: impl Into<Box<dyn std::error::Error + Sync + Send>>,
) -> Self {
Self {
inner: e.into(),
kind,
}
}
pub fn invalid_data(e: impl Into<Box<dyn std::error::Error + Sync + Send>>) -> Self {
Self::new(WriteBufferErrorKind::InvalidData, e)
}
pub fn invalid_input(e: impl Into<Box<dyn std::error::Error + Sync + Send>>) -> Self {
Self::new(WriteBufferErrorKind::InvalidInput, e)
}
/// Returns the kind of error this was
pub fn kind(&self) -> WriteBufferErrorKind {
self.kind
}
/// Returns the inner error
pub fn inner(&self) -> &dyn std::error::Error {
self.inner.as_ref()
}
}
impl Display for WriteBufferError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "WriteBufferError({:?}): {}", self.kind, self.inner)
}
}
impl std::error::Error for WriteBufferError {}
impl From<std::io::Error> for WriteBufferError {
fn from(e: Error) -> Self {
Self {
inner: Box::new(e),
kind: WriteBufferErrorKind::IO,
}
}
}
impl From<rskafka::client::error::Error> for WriteBufferError {
fn from(e: rskafka::client::error::Error) -> Self {
Self {
inner: Box::new(e),
kind: WriteBufferErrorKind::IO,
}
}
}
impl From<rskafka::client::producer::Error> for WriteBufferError {
fn from(e: rskafka::client::producer::Error) -> Self {
Self {
inner: Box::new(e),
kind: WriteBufferErrorKind::IO,
}
}
}
impl From<String> for WriteBufferError {
fn from(e: String) -> Self {
Self {
inner: e.into(),
kind: WriteBufferErrorKind::Unknown,
}
}
}
impl From<&'static str> for WriteBufferError {
fn from(e: &'static str) -> Self {
Self {
inner: e.into(),
kind: WriteBufferErrorKind::Unknown,
}
}
}
#[derive(Debug, Copy, Clone)]
pub enum WriteBufferErrorKind {
/// This operation failed for an unknown reason
Unknown,
/// This operation was provided with invalid input data
InvalidInput,
/// This operation encountered invalid data
InvalidData,
/// A fatal IO error occurred - non-fatal errors should be retried internally
IO,
}
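// Example (sketch): constructing and classifying an error with the API above;
// the match arms are placeholders for caller-specific handling.
//
//     let err = WriteBufferError::invalid_data("unknown message format");
//     match err.kind() {
//         WriteBufferErrorKind::InvalidData | WriteBufferErrorKind::InvalidInput => {
//             // reject the offending message, do not retry
//         }
//         WriteBufferErrorKind::IO | WriteBufferErrorKind::Unknown => {
//             // surface the error; the caller may retry
//         }
//     }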
/// Writing to a Write Buffer takes a [`DmlWrite`] and returns the [`DmlMeta`] for the
/// payload that was written
@ -44,7 +145,9 @@ pub trait WriteBufferWriting: Sync + Send + Debug + 'static {
lp: &str,
default_time: i64,
) -> Result<DmlMeta, WriteBufferError> {
let tables = mutable_batch_lp::lines_to_batches(lp, default_time).map_err(Box::new)?;
let tables = mutable_batch_lp::lines_to_batches(lp, default_time)
.map_err(WriteBufferError::invalid_input)?;
self.store_operation(
sequencer_id,
&DmlOperation::Write(DmlWrite::new("test_db", tables, Default::default())),
@ -63,47 +166,60 @@ pub trait WriteBufferWriting: Sync + Send + Debug + 'static {
fn type_name(&self) -> &'static str;
}
pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result<u64, WriteBufferError>>;
pub type FetchHighWatermark<'a> = Box<dyn (Fn() -> FetchHighWatermarkFut<'a>) + Send + Sync>;
/// Output stream of [`WriteBufferReading`].
pub struct WriteStream<'a> {
/// Stream that produces entries.
pub stream: BoxStream<'a, Result<DmlOperation, WriteBufferError>>,
/// Get high watermark (= what we believe is the next sequence number to be added).
/// Handles a stream of a specific sequencer.
///
/// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts
/// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1.
pub fetch_high_watermark: FetchHighWatermark<'a>,
}
/// This can be used to consume data via a stream or to seek the stream to a given offset.
#[async_trait]
pub trait WriteBufferStreamHandler: Sync + Send + Debug + 'static {
/// Stream that produces DML operations.
///
/// Note that due to the mutable borrow, it is not possible to have multiple streams from the same
/// [`WriteBufferStreamHandler`] instance at the same time. If all streams are dropped and requested again, the last
/// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that, either
/// create a new [`WriteBufferStreamHandler`] or use [`seek`](Self::seek).
fn stream(&mut self) -> BoxStream<'_, Result<DmlOperation, WriteBufferError>>;
impl<'a> Debug for WriteStream<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EntryStream").finish_non_exhaustive()
}
/// Seek sequencer to given sequence number. The next output of related streams will be an entry with at least
/// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream).
///
/// Note that due to the mutable borrow, it is not possible to seek while streams exists.
async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError>;
}
/// Produce streams (one per sequencer) of [`DmlWrite`]s.
#[async_trait]
pub trait WriteBufferReading: Sync + Send + Debug + 'static {
/// Returns a stream per sequencer.
/// List all known sequencers.
///
/// Note that due to the mutable borrow, it is not possible to have multiple streams from the same
/// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last
/// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either
/// create a new [`WriteBufferReading`] or use [`seek`](Self::seek).
fn streams(&mut self) -> BTreeMap<u32, WriteStream<'_>>;
/// This set is not empty.
fn sequencer_ids(&self) -> BTreeSet<u32>;
/// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least
/// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream).
/// Get stream handler for a dedicated sequencer.
///
/// Note that due to the mutable borrow, it is not possible to seek while streams exists.
async fn seek(
&mut self,
/// Handlers do NOT share any state (e.g. last offsets).
async fn stream_handler(
&self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError>;
) -> Result<Box<dyn WriteBufferStreamHandler>, WriteBufferError>;
/// Get stream handlers for all streams.
async fn stream_handlers(
&self,
) -> Result<BTreeMap<u32, Box<dyn WriteBufferStreamHandler>>, WriteBufferError> {
let mut handlers = BTreeMap::new();
for sequencer_id in self.sequencer_ids() {
handlers.insert(sequencer_id, self.stream_handler(sequencer_id).await?);
}
Ok(handlers)
}
/// Get high watermark (= what we believe is the next sequence number to be added).
///
/// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts
/// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1.
async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result<u64, WriteBufferError>;
/// Return type (like `"mock"` or `"kafka"`) of this reader.
fn type_name(&self) -> &'static str;
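// Example (sketch): consuming one sequencer's stream with the handler API
// above; requires `futures::StreamExt` for `.next()`, and `process` stands in
// for application-specific handling.
//
//     let mut handler = reader.stream_handler(sequencer_id).await?;
//     let mut stream = handler.stream();
//     while let Some(op) = stream.next().await {
//         process(op?);
//     }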
@ -111,16 +227,14 @@ pub trait WriteBufferReading: Sync + Send + Debug + 'static {
pub mod test_utils {
//! Generic tests for all write buffer implementations.
use super::{WriteBufferError, WriteBufferReading, WriteBufferWriting};
use super::{
WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting,
};
use async_trait::async_trait;
use dml::{test_util::assert_write_op_eq, DmlMeta, DmlOperation, DmlWrite};
use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt};
use std::{
collections::{BTreeMap, BTreeSet},
convert::TryFrom,
num::NonZeroU32,
sync::Arc,
time::Duration,
collections::BTreeSet, convert::TryFrom, num::NonZeroU32, sync::Arc, time::Duration,
};
use time::{Time, TimeProvider};
use trace::{ctx::SpanContext, span::Span, RingBufferTraceCollector};
@ -246,40 +360,41 @@ pub mod test_utils {
let entry_3 = "upc user=3 300";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let sequencer_id = set_pop_first(&mut reader.sequencer_ids()).unwrap();
let mut stream_handler = reader.stream_handler(sequencer_id).await.unwrap();
let mut stream = stream_handler.stream();
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty stream is pending
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.poll_next_unpin(&mut cx).is_pending());
// adding content allows us to get results
let w1 = write("namespace", &writer, entry_1, sequencer_id, None).await;
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1);
// stream is pending again
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.poll_next_unpin(&mut cx).is_pending());
// adding more data unblocks the stream
let w2 = write("namespace", &writer, entry_2, sequencer_id, None).await;
let w3 = write("namespace", &writer, entry_3, sequencer_id, None).await;
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2);
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w3);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w3);
// stream is pending again
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.poll_next_unpin(&mut cx).is_pending());
}
/// Tests multiple subsequently created streams from a single reader.
/// Tests multiple subsequently created streams from a single [`WriteBufferStreamHandler`].
///
/// This tests that:
/// - readers remember their offset (and "pending" state) even when streams are dropped
/// - state is not shared between handlers
async fn test_multi_stream_io<T>(adapter: &T)
where
T: TestAdapter,
@ -291,7 +406,7 @@ pub mod test_utils {
let entry_3 = "upc user=3 300";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
@ -301,35 +416,31 @@ pub mod test_utils {
let w3 = write("namespace", &writer, entry_3, 0, None).await;
// creating stream, drop stream, re-create it => still starts at first entry
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, stream) = map_pop_first(&mut streams).unwrap();
let sequencer_id = set_pop_first(&mut reader.sequencer_ids()).unwrap();
let mut stream_handler = reader.stream_handler(sequencer_id).await.unwrap();
let stream = stream_handler.stream();
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1);
let mut stream = stream_handler.stream();
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1);
// re-creating stream after reading remembers offset, but wait a bit to provoke the stream to buffer some
// entries
tokio::time::sleep(Duration::from_millis(10)).await;
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2);
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w3);
let mut stream = stream_handler.stream();
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w3);
// re-creating stream after reading everything makes it pending
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
let mut stream = stream_handler.stream();
assert!(stream.poll_next_unpin(&mut cx).is_pending());
// use a different handler => stream starts from beginning
let mut stream_handler2 = reader.stream_handler(sequencer_id).await.unwrap();
let mut stream2 = stream_handler2.stream();
assert_write_op_eq(&stream2.next().await.unwrap().unwrap(), &w1);
assert!(stream.poll_next_unpin(&mut cx).is_pending());
}
/// Test single reader-writer IO w/ multiple sequencers.
@ -348,37 +459,43 @@ pub mod test_utils {
let entry_3 = "upc user=3 300";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, mut stream_1) = map_pop_first(&mut streams).unwrap();
let (sequencer_id_2, mut stream_2) = map_pop_first(&mut streams).unwrap();
// check that we have two different sequencer IDs
let mut sequencer_ids = reader.sequencer_ids();
assert_eq!(sequencer_ids.len(), 2);
let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap();
let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap();
assert_ne!(sequencer_id_1, sequencer_id_2);
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
let mut stream_handler_1 = reader.stream_handler(sequencer_id_1).await.unwrap();
let mut stream_handler_2 = reader.stream_handler(sequencer_id_2).await.unwrap();
let mut stream_1 = stream_handler_1.stream();
let mut stream_2 = stream_handler_2.stream();
// empty streams are pending
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
// entries arrive at the right target stream
let w1 = write("namespace", &writer, entry_1, sequencer_id_1, None).await;
assert_write_op_eq(&stream_1.stream.next().await.unwrap().unwrap(), &w1);
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert_write_op_eq(&stream_1.next().await.unwrap().unwrap(), &w1);
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
let w2 = write("namespace", &writer, entry_2, sequencer_id_2, None).await;
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert_write_op_eq(&stream_2.stream.next().await.unwrap().unwrap(), &w2);
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert_write_op_eq(&stream_2.next().await.unwrap().unwrap(), &w2);
let w3 = write("namespace", &writer, entry_3, sequencer_id_1, None).await;
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert_write_op_eq(&stream_1.stream.next().await.unwrap().unwrap(), &w3);
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
assert_write_op_eq(&stream_1.next().await.unwrap().unwrap(), &w3);
// streams are pending again
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
}
/// Test multiple writers and multiple readers on multiple sequencers.
@ -400,8 +517,8 @@ pub mod test_utils {
let writer_1 = context.writing(true).await.unwrap();
let writer_2 = context.writing(true).await.unwrap();
let mut reader_1 = context.reading(true).await.unwrap();
let mut reader_2 = context.reading(true).await.unwrap();
let reader_1 = context.reading(true).await.unwrap();
let reader_2 = context.reading(true).await.unwrap();
let mut sequencer_ids_1 = writer_1.sequencer_ids();
let sequencer_ids_2 = writer_2.sequencer_ids();
@ -414,22 +531,15 @@ pub mod test_utils {
let w_west_1 = write("namespace", &writer_1, entry_west_1, sequencer_id_2, None).await;
let w_east_2 = write("namespace", &writer_2, entry_east_2, sequencer_id_1, None).await;
assert_reader_content(
&mut reader_1,
&[
(sequencer_id_1, &[&w_east_1, &w_east_2]),
(sequencer_id_2, &[&w_west_1]),
],
)
.await;
assert_reader_content(
&mut reader_2,
&[
(sequencer_id_1, &[&w_east_1, &w_east_2]),
(sequencer_id_2, &[&w_west_1]),
],
)
.await;
let mut handler_1_1 = reader_1.stream_handler(sequencer_id_1).await.unwrap();
let mut handler_1_2 = reader_1.stream_handler(sequencer_id_2).await.unwrap();
let mut handler_2_1 = reader_2.stream_handler(sequencer_id_1).await.unwrap();
let mut handler_2_2 = reader_2.stream_handler(sequencer_id_2).await.unwrap();
assert_reader_content(&mut handler_1_1, &[&w_east_1, &w_east_2]).await;
assert_reader_content(&mut handler_1_2, &[&w_west_1]).await;
assert_reader_content(&mut handler_2_1, &[&w_east_1, &w_east_2]).await;
assert_reader_content(&mut handler_2_2, &[&w_west_1]).await;
}
/// Test seek implementation of readers.
@ -455,46 +565,47 @@ pub mod test_utils {
let writer = context.writing(true).await.unwrap();
let w_east_1 = write("namespace", &writer, entry_east_1, 0, None).await;
let w_east_2 = write("namespace", &writer, entry_east_2, 0, None).await;
let w_west_1 = write("namespace", &writer, entry_west_1, 1, None).await;
let mut sequencer_ids = writer.sequencer_ids();
let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap();
let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap();
let mut reader_1 = context.reading(true).await.unwrap();
let mut reader_2 = context.reading(true).await.unwrap();
let w_east_1 = write("namespace", &writer, entry_east_1, sequencer_id_1, None).await;
let w_east_2 = write("namespace", &writer, entry_east_2, sequencer_id_1, None).await;
let w_west_1 = write("namespace", &writer, entry_west_1, sequencer_id_2, None).await;
let reader_1 = context.reading(true).await.unwrap();
let reader_2 = context.reading(true).await.unwrap();
let mut handler_1_1_a = reader_1.stream_handler(sequencer_id_1).await.unwrap();
let mut handler_1_2_a = reader_1.stream_handler(sequencer_id_2).await.unwrap();
let mut handler_1_1_b = reader_1.stream_handler(sequencer_id_1).await.unwrap();
let mut handler_1_2_b = reader_1.stream_handler(sequencer_id_2).await.unwrap();
let mut handler_2_1 = reader_2.stream_handler(sequencer_id_1).await.unwrap();
let mut handler_2_2 = reader_2.stream_handler(sequencer_id_2).await.unwrap();
// forward seek
reader_1
.seek(0, w_east_2.meta().sequence().unwrap().number)
handler_1_1_a
.seek(w_east_2.meta().sequence().unwrap().number)
.await
.unwrap();
assert_reader_content(&mut reader_1, &[(0, &[&w_east_2]), (1, &[&w_west_1])]).await;
assert_reader_content(
&mut reader_2,
&[(0, &[&w_east_1, &w_east_2]), (1, &[&w_west_1])],
)
.await;
assert_reader_content(&mut handler_1_1_a, &[&w_east_2]).await;
assert_reader_content(&mut handler_1_2_a, &[&w_west_1]).await;
assert_reader_content(&mut handler_1_1_b, &[&w_east_1, &w_east_2]).await;
assert_reader_content(&mut handler_1_2_b, &[&w_west_1]).await;
assert_reader_content(&mut handler_2_1, &[&w_east_1, &w_east_2]).await;
assert_reader_content(&mut handler_2_2, &[&w_west_1]).await;
// backward seek
reader_1.seek(0, 0).await.unwrap();
assert_reader_content(&mut reader_1, &[(0, &[&w_east_1, &w_east_2]), (1, &[])]).await;
handler_1_1_a.seek(0).await.unwrap();
assert_reader_content(&mut handler_1_1_a, &[&w_east_1, &w_east_2]).await;
// seek to far end and then add data
reader_1.seek(0, 1_000_000).await.unwrap();
handler_1_1_a.seek(1_000_000).await.unwrap();
write("namespace", &writer, entry_east_3, 0, None).await;
let mut streams = reader_1.streams();
assert_eq!(streams.len(), 2);
let (_sequencer_id, mut stream_1) = map_pop_first(&mut streams).unwrap();
let (_sequencer_id, mut stream_2) = map_pop_first(&mut streams).unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
drop(stream_1);
drop(stream_2);
drop(streams);
// seeking unknown sequencer is NOT an error
reader_1.seek(0, 42).await.unwrap();
assert!(handler_1_1_a.stream().poll_next_unpin(&mut cx).is_pending());
assert!(handler_1_2_a.stream().poll_next_unpin(&mut cx).is_pending());
}
/// Test watermark fetching.
@ -513,28 +624,33 @@ pub mod test_utils {
let entry_west_1 = "upc,region=west user=1 200";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, stream_1) = map_pop_first(&mut streams).unwrap();
let (sequencer_id_2, stream_2) = map_pop_first(&mut streams).unwrap();
let mut sequencer_ids = writer.sequencer_ids();
let sequencer_id_1 = set_pop_first(&mut sequencer_ids).unwrap();
let sequencer_id_2 = set_pop_first(&mut sequencer_ids).unwrap();
// start at watermark 0
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0);
assert_eq!(
reader.fetch_high_watermark(sequencer_id_1).await.unwrap(),
0
);
assert_eq!(
reader.fetch_high_watermark(sequencer_id_2).await.unwrap(),
0
);
// high water mark moves
write("namespace", &writer, entry_east_1, sequencer_id_1, None).await;
let w1 = write("namespace", &writer, entry_east_2, sequencer_id_1, None).await;
let w2 = write("namespace", &writer, entry_west_1, sequencer_id_2, None).await;
assert_eq!(
(stream_1.fetch_high_watermark)().await.unwrap(),
reader.fetch_high_watermark(sequencer_id_1).await.unwrap(),
w1.meta().sequence().unwrap().number + 1
);
assert_eq!(
(stream_2.fetch_high_watermark)().await.unwrap(),
reader.fetch_high_watermark(sequencer_id_2).await.unwrap(),
w2.meta().sequence().unwrap().number + 1
);
}
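Because the watermark is defined as the next sequence number to be written, a consumer that tracks the next sequence number it will read can derive its lag directly; a small sketch (the `next_to_read` bookkeeping is an assumption on the caller's side, not part of the API):

// Hedged sketch: operations written to the sequencer but not yet read locally.
async fn lag<R: WriteBufferReading>(
    reader: &R,
    sequencer_id: u32,
    next_to_read: u64,
) -> Result<u64, WriteBufferError> {
    let watermark = reader.fetch_high_watermark(sequencer_id).await?;
    Ok(watermark.saturating_sub(next_to_read))
}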
@ -557,11 +673,11 @@ pub mod test_utils {
let entry = "upc user=1 100";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let mut sequencer_ids = writer.sequencer_ids();
assert_eq!(sequencer_ids.len(), 1);
let sequencer_id = set_pop_first(&mut sequencer_ids).unwrap();
let write = write("namespace", &writer, entry, sequencer_id, None).await;
let reported_ts = write.meta().producer_ts().unwrap();
@ -570,7 +686,8 @@ pub mod test_utils {
time.inc(Duration::from_secs(10));
// check that the timestamp records the ingestion time, not the read time
let sequenced_entry = stream.stream.next().await.unwrap().unwrap();
let mut handler = reader.stream_handler(sequencer_id).await.unwrap();
let sequenced_entry = handler.stream().next().await.unwrap().unwrap();
let ts_entry = sequenced_entry.meta().producer_ts().unwrap();
assert_eq!(ts_entry, t0);
assert_eq!(reported_ts, t0);
@ -603,7 +720,7 @@ pub mod test_utils {
context.writing(false).await.unwrap();
}
/// Test sequencer IDs reporting of writers.
/// Test sequencer IDs reporting of readers and writers.
///
/// This tests that:
/// - all sequencers are reported
@ -618,11 +735,17 @@ pub mod test_utils {
let writer_1 = context.writing(true).await.unwrap();
let writer_2 = context.writing(true).await.unwrap();
let reader_1 = context.reading(true).await.unwrap();
let reader_2 = context.reading(true).await.unwrap();
let sequencer_ids_1 = writer_1.sequencer_ids();
let sequencer_ids_2 = writer_2.sequencer_ids();
assert_eq!(sequencer_ids_1, sequencer_ids_2);
let sequencer_ids_3 = reader_1.sequencer_ids();
let sequencer_ids_4 = reader_2.sequencer_ids();
assert_eq!(sequencer_ids_1.len(), n_sequencers as usize);
assert_eq!(sequencer_ids_1, sequencer_ids_2);
assert_eq!(sequencer_ids_1, sequencer_ids_3);
assert_eq!(sequencer_ids_1, sequencer_ids_4);
}
/// Test that span contexts are propagated through the system.
@ -635,11 +758,13 @@ pub mod test_utils {
let entry = "upc user=1 100";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let mut sequencer_ids = writer.sequencer_ids();
assert_eq!(sequencer_ids.len(), 1);
let sequencer_id = set_pop_first(&mut sequencer_ids).unwrap();
let mut handler = reader.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
// 1: no context
write("namespace", &writer, entry, sequencer_id, None).await;
@ -669,16 +794,16 @@ pub mod test_utils {
.await;
// check write 1
let write_1 = stream.stream.next().await.unwrap().unwrap();
let write_1 = stream.next().await.unwrap().unwrap();
assert!(write_1.meta().span_context().is_none());
// check write 2
let write_2 = stream.stream.next().await.unwrap().unwrap();
let write_2 = stream.next().await.unwrap().unwrap();
let actual_context_1 = write_2.meta().span_context().unwrap();
assert_span_context_eq_or_linked(&span_context_1, actual_context_1, collector.spans());
// check write 3
let write_3 = stream.stream.next().await.unwrap().unwrap();
let write_3 = stream.next().await.unwrap().unwrap();
let actual_context_2 = write_3.meta().span_context().unwrap();
assert_span_context_eq_or_linked(&span_context_2, actual_context_2, collector.spans());
}
@ -719,7 +844,7 @@ pub mod test_utils {
let entry_2 = "upc,region=east user=2 200";
let writer = context.writing(true).await.unwrap();
let mut reader = context.reading(true).await.unwrap();
let reader = context.reading(true).await.unwrap();
let mut sequencer_ids = writer.sequencer_ids();
assert_eq!(sequencer_ids.len(), 1);
@ -728,7 +853,8 @@ pub mod test_utils {
let w1 = write("namespace_1", &writer, entry_2, sequencer_id, None).await;
let w2 = write("namespace_2", &writer, entry_1, sequencer_id, None).await;
assert_reader_content(&mut reader, &[(sequencer_id, &[&w1, &w2])]).await;
let mut handler = reader.stream_handler(sequencer_id).await.unwrap();
assert_reader_content(&mut handler, &[&w1, &w2]).await;
}
/// Dummy test to ensure that flushing somewhat works.
@ -770,57 +896,30 @@ pub mod test_utils {
/// Assert that the content of the reader is as expected.
///
/// This will read `expected.len()` from the reader and then ensures that the stream is pending.
async fn assert_reader_content<R>(reader: &mut R, expected: &[(u32, &[&DmlWrite])])
where
R: WriteBufferReading,
{
// normalize expected values
let expected = {
let mut expected = expected.to_vec();
expected.sort_by_key(|(sequencer_id, _entries)| *sequencer_id);
expected
};
/// This will read `expected_writes.len()` operations from the reader and then ensure that the stream is pending.
async fn assert_reader_content(
actual_stream_handler: &mut Box<dyn WriteBufferStreamHandler>,
expected_writes: &[&DmlWrite],
) {
let actual_stream = actual_stream_handler.stream();
// Ensure content of the streams
let streams = reader.streams();
assert_eq!(streams.len(), expected.len());
for ((actual_sequencer_id, actual_stream), (expected_sequencer_id, expected_writes)) in
streams.into_iter().zip(expected.iter())
{
assert_eq!(actual_sequencer_id, *expected_sequencer_id);
// we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever
let results: Vec<_> = actual_stream
.stream
// we need to limit the stream to `expected_writes.len()` elements, otherwise it might be pending forever
let actual_writes: Vec<_> = actual_stream
.take(expected_writes.len())
.try_collect()
.await
.unwrap();
let actual_writes: Vec<_> = results.iter().collect();
assert_eq!(actual_writes.len(), expected_writes.len());
for (actual, expected) in actual_writes.iter().zip(expected_writes.iter()) {
assert_write_op_eq(actual, expected);
}
}
// Ensure that streams are pending
let streams = reader.streams();
assert_eq!(streams.len(), expected.len());
// Ensure that stream is pending
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
for ((actual_sequencer_id, mut actual_stream), (expected_sequencer_id, _expected_writes)) in
streams.into_iter().zip(expected.iter())
{
assert_eq!(actual_sequencer_id, *expected_sequencer_id);
// empty stream is pending
assert!(actual_stream.stream.poll_next_unpin(&mut cx).is_pending());
}
let mut actual_stream = actual_stream_handler.stream();
assert!(actual_stream.poll_next_unpin(&mut cx).is_pending());
}
/// Asserts that the given span contexts are the same or that `second` links back to `first`.
@ -854,20 +953,6 @@ pub mod test_utils {
assert_eq!(first.parent_span_id, second.parent_span_id);
}
/// Pops first entry from map.
///
/// Helper until <https://github.com/rust-lang/rust/issues/62924> is stable.
pub(crate) fn map_pop_first<K, V>(map: &mut BTreeMap<K, V>) -> Option<(K, V)>
where
K: Clone + Ord,
{
map.keys()
.next()
.cloned()
.map(|k| map.remove_entry(&k))
.flatten()
}
/// Pops first entry from set.
///
/// Helper until <https://github.com/rust-lang/rust/issues/62924> is stable.
View File
@ -119,21 +119,21 @@ use std::{
},
};
use crate::codec::{ContentType, IoxHeaders};
use crate::{
codec::{ContentType, IoxHeaders},
core::WriteBufferStreamHandler,
};
use async_trait::async_trait;
use data_types::{sequence::Sequence, write_buffer::WriteBufferCreationConfig};
use dml::{DmlMeta, DmlOperation};
use futures::{FutureExt, Stream, StreamExt};
use futures::{stream::BoxStream, Stream, StreamExt};
use pin_project::pin_project;
use time::{Time, TimeProvider};
use tokio_util::sync::ReusableBoxFuture;
use trace::TraceCollector;
use uuid::Uuid;
use crate::core::{
FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting, WriteStream,
};
use crate::core::{WriteBufferError, WriteBufferReading, WriteBufferWriting};
/// Header used to declare the creation time of the message.
pub const HEADER_TIME: &str = "last-modified";
@ -260,6 +260,35 @@ impl WriteBufferWriting for FileBufferProducer {
}
}
#[derive(Debug)]
pub struct FileBufferStreamHandler {
sequencer_id: u32,
path: PathBuf,
next_sequence_number: Arc<AtomicU64>,
trace_collector: Option<Arc<dyn TraceCollector>>,
}
#[async_trait]
impl WriteBufferStreamHandler for FileBufferStreamHandler {
fn stream(&mut self) -> BoxStream<'_, Result<DmlOperation, WriteBufferError>> {
let committed = self.path.join("committed");
ConsumerStream::new(
self.sequencer_id,
committed,
Arc::clone(&self.next_sequence_number),
self.trace_collector.clone(),
)
.boxed()
}
async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> {
self.next_sequence_number
.store(sequence_number, Ordering::SeqCst);
Ok(())
}
}
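`seek` here only stores the target into the shared `AtomicU64`; a stream built by `stream()` afterwards picks the stored value up as its read cursor. A minimal sketch of that call pattern (assuming `futures::StreamExt` is in scope and the handler came from the consumer below):

// Hedged sketch: seek, then create the stream, then read the next operation.
async fn resume_at(
    handler: &mut dyn WriteBufferStreamHandler,
    offset: u64,
) -> Result<Option<DmlOperation>, WriteBufferError> {
    handler.seek(offset).await?; // stores `offset` into `next_sequence_number`
    let mut stream = handler.stream(); // ConsumerStream starts at the stored offset
    // Waits for the next operation; it will have a sequence number >= `offset`.
    stream.next().await.transpose()
}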
/// File-based write buffer reader.
#[derive(Debug)]
pub struct FileBufferConsumer {
@ -291,56 +320,39 @@ impl FileBufferConsumer {
#[async_trait]
impl WriteBufferReading for FileBufferConsumer {
fn streams(&mut self) -> BTreeMap<u32, WriteStream<'_>> {
let mut streams = BTreeMap::default();
for (sequencer_id, (sequencer_path, next_sequence_number)) in &self.dirs {
let committed = sequencer_path.join("committed");
let stream = ConsumerStream::new(
*sequencer_id,
committed.clone(),
Arc::clone(next_sequence_number),
self.trace_collector.clone(),
)
.boxed();
let fetch_high_watermark = move || {
let committed = committed.clone();
let fut = async move { watermark(&committed).await };
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.insert(
*sequencer_id,
WriteStream {
stream,
fetch_high_watermark,
},
);
fn sequencer_ids(&self) -> BTreeSet<u32> {
self.dirs.keys().copied().collect()
}
streams
}
async fn seek(
&mut self,
async fn stream_handler(
&self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let path_and_next_sequence_number = self
) -> Result<Box<dyn WriteBufferStreamHandler>, WriteBufferError> {
let (path, _next_sequence_number) = self
.dirs
.get(&sequencer_id)
.ok_or_else::<WriteBufferError, _>(|| {
format!("Unknown sequencer: {}", sequencer_id).into()
})?;
path_and_next_sequence_number
.1
.store(sequence_number, Ordering::SeqCst);
Ok(())
Ok(Box::new(FileBufferStreamHandler {
sequencer_id,
path: path.clone(),
next_sequence_number: Arc::new(AtomicU64::new(0)),
trace_collector: self.trace_collector.clone(),
}))
}
async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result<u64, WriteBufferError> {
let (path, _next_sequence_number) = self
.dirs
.get(&sequencer_id)
.ok_or_else::<WriteBufferError, _>(|| {
format!("Unknown sequencer: {}", sequencer_id).into()
})?;
let committed = path.join("committed");
watermark(&committed).await
}
fn type_name(&self) -> &'static str {
@ -450,7 +462,7 @@ impl ConsumerStream {
}
_ => {
// cannot read file => communicate to user
Err(Box::new(error) as WriteBufferError)
Err(error.into())
}
}
}
@ -466,7 +478,10 @@ impl ConsumerStream {
trace_collector: Option<Arc<dyn TraceCollector>>,
) -> Result<DmlOperation, WriteBufferError> {
let mut headers = [httparse::EMPTY_HEADER; 16];
match httparse::parse_headers(&data, &mut headers)? {
let status =
httparse::parse_headers(&data, &mut headers).map_err(WriteBufferError::invalid_data)?;
match status {
httparse::Status::Complete((offset, headers)) => {
let iox_headers = IoxHeaders::from_headers(
headers.iter().map(|header| (header.name, header.value)),
@ -792,11 +807,12 @@ mod tests {
)
.await;
let mut reader = ctx.reading(true).await.unwrap();
let mut stream = reader.streams().remove(&sequencer_id).unwrap();
let reader = ctx.reading(true).await.unwrap();
let mut handler = reader.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w1);
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w4);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w1);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w4);
}
#[tokio::test]
@ -820,9 +836,10 @@ mod tests {
)
.await;
let mut reader = ctx.reading(true).await.unwrap();
let mut stream = reader.streams().remove(&sequencer_id).unwrap();
let reader = ctx.reading(true).await.unwrap();
let mut handler = reader.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
assert_write_op_eq(&stream.stream.next().await.unwrap().unwrap(), &w2);
assert_write_op_eq(&stream.next().await.unwrap().unwrap(), &w2);
}
}
View File
@ -1,4 +1,4 @@
use std::{collections::BTreeMap, time::Duration};
use std::{collections::BTreeMap, fmt::Display, str::FromStr, time::Duration};
use data_types::write_buffer::WriteBufferCreationConfig;
@ -18,7 +18,7 @@ impl TryFrom<&BTreeMap<String, String>> for ClientConfig {
fn try_from(cfg: &BTreeMap<String, String>) -> Result<Self, Self::Error> {
Ok(Self {
max_message_size: cfg.get("max_message_size").map(|s| s.parse()).transpose()?,
max_message_size: parse_key(cfg, "max_message_size")?,
})
}
}
@ -45,25 +45,16 @@ impl TryFrom<&WriteBufferCreationConfig> for TopicCreationConfig {
fn try_from(cfg: &WriteBufferCreationConfig) -> Result<Self, Self::Error> {
Ok(Self {
num_partitions: i32::try_from(cfg.n_sequencers.get())?,
replication_factor: cfg
.options
.get("replication_factor")
.map(|s| s.parse())
.transpose()?
.unwrap_or(1),
timeout_ms: cfg
.options
.get("timeout_ms")
.map(|s| s.parse())
.transpose()?
.unwrap_or(5_000),
num_partitions: i32::try_from(cfg.n_sequencers.get())
.map_err(WriteBufferError::invalid_input)?,
replication_factor: parse_key(&cfg.options, "replication_factor")?.unwrap_or(1),
timeout_ms: parse_key(&cfg.options, "timeout_ms")?.unwrap_or(5_000),
})
}
}
/// Config for consumers.
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct ConsumerConfig {
/// Will wait for at least `min_batch_size` bytes of data
///
@ -86,18 +77,9 @@ impl TryFrom<&BTreeMap<String, String>> for ConsumerConfig {
fn try_from(cfg: &BTreeMap<String, String>) -> Result<Self, Self::Error> {
Ok(Self {
max_wait_ms: cfg
.get("consumer_max_wait_ms")
.map(|s| s.parse())
.transpose()?,
min_batch_size: cfg
.get("consumer_min_batch_size")
.map(|s| s.parse())
.transpose()?,
max_batch_size: cfg
.get("consumer_max_batch_size")
.map(|s| s.parse())
.transpose()?,
max_wait_ms: parse_key(cfg, "consumer_max_wait_ms")?,
min_batch_size: parse_key(cfg, "consumer_min_batch_size")?,
max_batch_size: parse_key(cfg, "consumer_max_batch_size")?,
})
}
}
@ -120,25 +102,33 @@ impl TryFrom<&BTreeMap<String, String>> for ProducerConfig {
type Error = WriteBufferError;
fn try_from(cfg: &BTreeMap<String, String>) -> Result<Self, Self::Error> {
let linger_ms: Option<u64> = cfg
.get("producer_linger_ms")
.map(|s| s.parse())
.transpose()?;
let linger_ms: Option<u64> = parse_key(cfg, "producer_linger_ms")?;
Ok(Self {
linger: linger_ms.map(Duration::from_millis),
max_batch_size: cfg
.get("producer_max_batch_size")
.map(|s| s.parse())
.transpose()?
.unwrap_or(100 * 1024),
max_batch_size: parse_key(cfg, "producer_max_batch_size")?.unwrap_or(100 * 1024),
})
}
}
fn parse_key<T>(cfg: &BTreeMap<String, String>, key: &str) -> Result<Option<T>, WriteBufferError>
where
T: FromStr,
T::Err: Display,
{
if let Some(s) = cfg.get(key) {
s.parse()
.map(Some)
.map_err(|e| format!("Cannot parse `{key}` from '{s}': {e}").into())
} else {
Ok(None)
}
}
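`parse_key` folds the repeated get/parse/transpose pattern into one place and names the key and raw value in the error. A small usage sketch (the keys shown are hypothetical, not real config options):

// Hedged usage sketch for `parse_key`; the keys here are made up.
fn example(cfg: &BTreeMap<String, String>) -> Result<(), WriteBufferError> {
    // Missing key => Ok(None); present but unparsable => "Cannot parse `...` from '...': ..."
    let timeout_ms: u64 = parse_key(cfg, "example_timeout_ms")?.unwrap_or(5_000);
    let linger: Option<Duration> = parse_key(cfg, "example_linger_ms")?.map(Duration::from_millis);
    let _ = (timeout_ms, linger);
    Ok(())
}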
#[cfg(test)]
mod tests {
use std::{collections::BTreeMap, num::NonZeroU32};
use test_helpers::assert_contains;
use super::*;
@ -164,6 +154,19 @@ mod tests {
assert_eq!(actual, expected);
}
#[test]
fn test_client_config_error() {
let err = ClientConfig::try_from(&BTreeMap::from([(
String::from("max_message_size"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `max_message_size` from 'xyz': invalid digit found in string"
);
}
#[test]
fn test_topic_creation_config_default() {
let actual = TopicCreationConfig::try_from(&WriteBufferCreationConfig {
@ -198,6 +201,29 @@ mod tests {
assert_eq!(actual, expected);
}
#[test]
fn test_topic_creation_config_err() {
let err = TopicCreationConfig::try_from(&WriteBufferCreationConfig {
n_sequencers: NonZeroU32::new(2).unwrap(),
options: BTreeMap::from([(String::from("replication_factor"), String::from("xyz"))]),
})
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `replication_factor` from 'xyz': invalid digit found in string"
);
let err = TopicCreationConfig::try_from(&WriteBufferCreationConfig {
n_sequencers: NonZeroU32::new(2).unwrap(),
options: BTreeMap::from([(String::from("timeout_ms"), String::from("xyz"))]),
})
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `timeout_ms` from 'xyz': invalid digit found in string"
);
}
#[test]
fn test_consumer_config_default() {
let actual = ConsumerConfig::try_from(&BTreeMap::default()).unwrap();
@ -226,6 +252,39 @@ mod tests {
assert_eq!(actual, expected);
}
#[test]
fn test_consumer_config_err() {
let err = ConsumerConfig::try_from(&BTreeMap::from([(
String::from("consumer_max_wait_ms"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `consumer_max_wait_ms` from 'xyz': invalid digit found in string"
);
let err = ConsumerConfig::try_from(&BTreeMap::from([(
String::from("consumer_min_batch_size"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `consumer_min_batch_size` from 'xyz': invalid digit found in string"
);
let err = ConsumerConfig::try_from(&BTreeMap::from([(
String::from("consumer_max_batch_size"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `consumer_max_batch_size` from 'xyz': invalid digit found in string"
);
}
#[test]
fn test_producer_config_default() {
let actual = ProducerConfig::try_from(&BTreeMap::default()).unwrap();
@ -253,4 +312,27 @@ mod tests {
};
assert_eq!(actual, expected);
}
#[test]
fn test_producer_config_err() {
let err = ProducerConfig::try_from(&BTreeMap::from([(
String::from("producer_linger_ms"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `producer_linger_ms` from 'xyz': invalid digit found in string"
);
let err = ProducerConfig::try_from(&BTreeMap::from([(
String::from("producer_max_batch_size"),
String::from("xyz"),
)]))
.unwrap_err();
assert_contains!(
err.to_string(),
"Cannot parse `producer_max_batch_size` from 'xyz': invalid digit found in string"
);
}
}
View File
@ -9,7 +9,7 @@ use std::{
use async_trait::async_trait;
use data_types::{sequence::Sequence, write_buffer::WriteBufferCreationConfig};
use dml::{DmlMeta, DmlOperation};
use futures::{FutureExt, StreamExt};
use futures::{stream::BoxStream, StreamExt};
use rskafka::client::{
consumer::StreamConsumerBuilder,
error::{Error as RSKafkaError, ProtocolError},
@ -22,10 +22,7 @@ use trace::TraceCollector;
use crate::{
codec::IoxHeaders,
core::{
FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting, WriteStream,
},
core::{WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting},
};
use self::{
@ -119,69 +116,22 @@ impl WriteBufferWriting for RSKafkaProducer {
}
#[derive(Debug)]
struct ConsumerPartition {
pub struct RSKafkaStreamHandler {
partition_client: Arc<PartitionClient>,
next_offset: Arc<AtomicI64>,
}
#[derive(Debug)]
pub struct RSKafkaConsumer {
partitions: BTreeMap<u32, ConsumerPartition>,
trace_collector: Option<Arc<dyn TraceCollector>>,
consumer_config: ConsumerConfig,
}
impl RSKafkaConsumer {
pub async fn new(
conn: String,
database_name: String,
connection_config: &BTreeMap<String, String>,
creation_config: Option<&WriteBufferCreationConfig>,
trace_collector: Option<Arc<dyn TraceCollector>>,
) -> Result<Self> {
let partition_clients = setup_topic(
conn,
database_name.clone(),
connection_config,
creation_config,
)
.await?;
let partitions = partition_clients
.into_iter()
.map(|(partition_id, partition_client)| {
let partition_client = Arc::new(partition_client);
let next_offset = Arc::new(AtomicI64::new(0));
(
partition_id,
ConsumerPartition {
partition_client,
next_offset,
},
)
})
.collect();
Ok(Self {
partitions,
trace_collector,
consumer_config: ConsumerConfig::try_from(connection_config)?,
})
}
sequencer_id: u32,
}
#[async_trait]
impl WriteBufferReading for RSKafkaConsumer {
fn streams(&mut self) -> BTreeMap<u32, WriteStream<'_>> {
let mut streams = BTreeMap::new();
for (sequencer_id, partition) in &self.partitions {
impl WriteBufferStreamHandler for RSKafkaStreamHandler {
fn stream(&mut self) -> BoxStream<'_, Result<DmlOperation, WriteBufferError>> {
let trace_collector = self.trace_collector.clone();
let next_offset = Arc::clone(&partition.next_offset);
let next_offset = Arc::clone(&self.next_offset);
let mut stream_builder = StreamConsumerBuilder::new(
Arc::clone(&partition.partition_client),
Arc::clone(&self.partition_client),
next_offset.load(Ordering::SeqCst),
);
if let Some(max_wait_ms) = self.consumer_config.max_wait_ms {
@ -207,12 +157,17 @@ impl WriteBufferReading for RSKafkaConsumer {
IoxHeaders::from_headers(record.record.headers, trace_collector.as_ref())?;
let sequence = Sequence {
id: *sequencer_id,
number: record.offset.try_into()?,
id: self.sequencer_id,
number: record
.offset
.try_into()
.map_err(WriteBufferError::invalid_data)?,
};
let timestamp_millis =
i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000)?;
i64::try_from(record.record.timestamp.unix_timestamp_nanos() / 1_000_000)
.map_err(WriteBufferError::invalid_data)?;
let timestamp = Time::from_timestamp_millis_opt(timestamp_millis)
.ok_or_else::<WriteBufferError, _>(|| {
format!(
@ -228,48 +183,88 @@ impl WriteBufferReading for RSKafkaConsumer {
.ok_or_else::<WriteBufferError, _>(|| "Value missing".to_string().into())?;
crate::codec::decode(&value, headers, sequence, timestamp, kafka_read_size)
});
let stream = stream.boxed();
let partition_client = Arc::clone(&partition.partition_client);
let fetch_high_watermark = move || {
let partition_client = Arc::clone(&partition_client);
let fut = async move {
let watermark = partition_client.get_high_watermark().await?;
u64::try_from(watermark).map_err(|e| Box::new(e) as WriteBufferError)
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.insert(
*sequencer_id,
WriteStream {
stream,
fetch_high_watermark,
},
);
stream.boxed()
}
streams
async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> {
let offset = i64::try_from(sequence_number).map_err(WriteBufferError::invalid_input)?;
self.next_offset.store(offset, Ordering::SeqCst);
Ok(())
}
}
async fn seek(
&mut self,
#[derive(Debug)]
pub struct RSKafkaConsumer {
partition_clients: BTreeMap<u32, Arc<PartitionClient>>,
trace_collector: Option<Arc<dyn TraceCollector>>,
consumer_config: ConsumerConfig,
}
impl RSKafkaConsumer {
pub async fn new(
conn: String,
database_name: String,
connection_config: &BTreeMap<String, String>,
creation_config: Option<&WriteBufferCreationConfig>,
trace_collector: Option<Arc<dyn TraceCollector>>,
) -> Result<Self> {
let partition_clients = setup_topic(
conn,
database_name.clone(),
connection_config,
creation_config,
)
.await?;
let partition_clients = partition_clients
.into_iter()
.map(|(k, v)| (k, Arc::new(v)))
.collect();
Ok(Self {
partition_clients,
trace_collector,
consumer_config: ConsumerConfig::try_from(connection_config)?,
})
}
}
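For orientation, a hedged construction sketch for the consumer side; the broker address and database name are placeholders, and the local `Result` alias is assumed to carry `WriteBufferError`:

// Hedged sketch: build a Kafka-backed consumer and attach to one partition.
async fn connect() -> Result<(), WriteBufferError> {
    let consumer = RSKafkaConsumer::new(
        String::from("localhost:9093"), // placeholder broker connection string
        String::from("my_db"),          // the database name doubles as the topic name
        &BTreeMap::default(),           // connection config (see ConsumerConfig / parse_key)
        None,                           // no topic auto-creation
        None,                           // no trace collector
    )
    .await?;
    let mut handler = consumer.stream_handler(0).await?;
    let _stream = handler.stream();
    Ok(())
}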
#[async_trait]
impl WriteBufferReading for RSKafkaConsumer {
fn sequencer_ids(&self) -> BTreeSet<u32> {
self.partition_clients.keys().copied().collect()
}
async fn stream_handler(
&self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let partition = self
.partitions
.get_mut(&sequencer_id)
) -> Result<Box<dyn WriteBufferStreamHandler>, WriteBufferError> {
let partition_client = self
.partition_clients
.get(&sequencer_id)
.ok_or_else::<WriteBufferError, _>(|| {
format!("Unknown partition: {}", sequencer_id).into()
})?;
let offset = i64::try_from(sequence_number)?;
partition.next_offset.store(offset, Ordering::SeqCst);
Ok(Box::new(RSKafkaStreamHandler {
partition_client: Arc::clone(partition_client),
next_offset: Arc::new(AtomicI64::new(0)),
trace_collector: self.trace_collector.clone(),
consumer_config: self.consumer_config.clone(),
sequencer_id,
}))
}
Ok(())
async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result<u64, WriteBufferError> {
let partition_client = self
.partition_clients
.get(&sequencer_id)
.ok_or_else::<WriteBufferError, _>(|| {
format!("Unknown partition: {}", sequencer_id).into()
})?;
let watermark = partition_client.get_high_watermark().await?;
u64::try_from(watermark).map_err(WriteBufferError::invalid_data)
}
fn type_name(&self) -> &'static str {
@ -298,7 +293,7 @@ async fn setup_topic(
let mut partition_clients = BTreeMap::new();
for partition in topic.partitions {
let c = client.partition_client(&database_name, partition).await?;
let partition = u32::try_from(partition)?;
let partition = u32::try_from(partition).map_err(WriteBufferError::invalid_data)?;
partition_clients.insert(partition, c);
}
return Ok(partition_clients);
@ -340,12 +335,13 @@ mod tests {
use dml::{test_util::assert_write_op_eq, DmlDelete, DmlWrite};
use futures::{stream::FuturesUnordered, TryStreamExt};
use rskafka::{client::partition::Compression, record::Record};
use test_helpers::assert_contains;
use trace::{ctx::SpanContext, RingBufferTraceCollector};
use crate::{
core::test_utils::{
assert_span_context_eq_or_linked, map_pop_first, perform_generic_tests,
random_topic_name, set_pop_first, TestAdapter, TestContext,
assert_span_context_eq_or_linked, perform_generic_tests, random_topic_name,
set_pop_first, TestAdapter, TestContext,
},
maybe_skip_kafka_integration,
};
@ -506,22 +502,18 @@ mod tests {
)
.await;
let mut consumer = ctx.reading(true).await.unwrap();
let consumer = ctx.reading(true).await.unwrap();
let mut handler = consumer.stream_handler(sequencer_id).await.unwrap();
// read broken message from stream
let mut streams = consumer.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let err = stream.stream.next().await.unwrap().unwrap_err();
assert_eq!(err.to_string(), "No content type header");
let mut stream = handler.stream();
let err = stream.next().await.unwrap().unwrap_err();
assert_contains!(err.to_string(), "No content type header");
// re-creating the stream should advance past the broken message
drop(stream);
drop(streams);
let mut streams = consumer.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let op = stream.stream.next().await.unwrap().unwrap();
let mut stream = handler.stream();
let op = stream.next().await.unwrap().unwrap();
assert_write_op_eq(&op, &w);
}
@ -564,17 +556,16 @@ mod tests {
assert_ne!(w2_1.sequence().unwrap(), w1_1.sequence().unwrap());
assert_eq!(w2_1.sequence().unwrap(), w2_2.sequence().unwrap());
let mut consumer = ctx.reading(true).await.unwrap();
let mut streams = consumer.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = map_pop_first(&mut streams).unwrap();
let consumer = ctx.reading(true).await.unwrap();
let mut handler = consumer.stream_handler(sequencer_id).await.unwrap();
let mut stream = handler.stream();
// get output, note that the write operations were fused
let op_w1_12 = stream.stream.next().await.unwrap().unwrap();
let op_d1_1 = stream.stream.next().await.unwrap().unwrap();
let op_d1_2 = stream.stream.next().await.unwrap().unwrap();
let op_w1_34 = stream.stream.next().await.unwrap().unwrap();
let op_w2_12 = stream.stream.next().await.unwrap().unwrap();
let op_w1_12 = stream.next().await.unwrap().unwrap();
let op_d1_1 = stream.next().await.unwrap().unwrap();
let op_d1_2 = stream.next().await.unwrap().unwrap();
let op_w1_34 = stream.next().await.unwrap().unwrap();
let op_w2_12 = stream.next().await.unwrap().unwrap();
// ensure that sequence numbers map as expected
assert_eq!(
View File
@ -6,7 +6,7 @@ use std::{
};
use async_trait::async_trait;
use futures::{stream, FutureExt, StreamExt};
use futures::{stream::BoxStream, StreamExt};
use parking_lot::Mutex;
use data_types::sequence::Sequence;
@ -15,8 +15,7 @@ use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite};
use time::TimeProvider;
use crate::core::{
FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting, WriteStream,
WriteBufferError, WriteBufferReading, WriteBufferStreamHandler, WriteBufferWriting,
};
#[derive(Debug, Default)]
@ -344,18 +343,9 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors {
}
}
/// Sequencer-specific playback state
struct PlaybackState {
/// Index within the entry vector.
vector_index: usize,
/// Offset within the sequencer IDs.
offset: u64,
}
pub struct MockBufferForReading {
shared_state: MockBufferSharedState,
playback_states: Arc<Mutex<BTreeMap<u32, PlaybackState>>>,
n_sequencers: u32,
}
impl MockBufferForReading {
@ -375,21 +365,10 @@ impl MockBufferForReading {
};
entries.len() as u32
};
let playback_states: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| {
(
sequencer_id,
PlaybackState {
vector_index: 0,
offset: 0,
},
)
})
.collect();
Ok(Self {
shared_state: state,
playback_states: Arc::new(Mutex::new(playback_states)),
n_sequencers,
})
}
}
@ -400,39 +379,42 @@ impl std::fmt::Debug for MockBufferForReading {
}
}
/// Sequencer-specific playback state
#[derive(Debug)]
pub struct MockBufferStreamHandler {
/// Shared state.
shared_state: MockBufferSharedState,
/// Own sequencer ID.
sequencer_id: u32,
/// Index within the entry vector.
vector_index: usize,
/// Offset within the sequencer IDs.
offset: u64,
}
#[async_trait]
impl WriteBufferReading for MockBufferForReading {
fn streams(&mut self) -> BTreeMap<u32, WriteStream<'_>> {
let sequencer_ids: Vec<_> = {
let playback_states = self.playback_states.lock();
playback_states.keys().copied().collect()
};
let mut streams = BTreeMap::new();
for sequencer_id in sequencer_ids {
let shared_state = self.shared_state.clone();
let playback_states = Arc::clone(&self.playback_states);
let stream = stream::poll_fn(move |cx| {
let mut guard = shared_state.writes.lock();
impl WriteBufferStreamHandler for MockBufferStreamHandler {
fn stream(&mut self) -> BoxStream<'_, Result<DmlOperation, WriteBufferError>> {
futures::stream::poll_fn(|cx| {
let mut guard = self.shared_state.writes.lock();
let writes = guard.as_mut().unwrap();
let writes_vec = writes.get_mut(&sequencer_id).unwrap();
let mut playback_states = playback_states.lock();
let playback_state = playback_states.get_mut(&sequencer_id).unwrap();
let writes_vec = writes.get_mut(&self.sequencer_id).unwrap();
let entries = &writes_vec.writes;
while entries.len() > playback_state.vector_index {
let write_result = &entries[playback_state.vector_index];
while entries.len() > self.vector_index {
let write_result = &entries[self.vector_index];
// consume entry
playback_state.vector_index += 1;
self.vector_index += 1;
match write_result {
Ok(write) => {
// found an entry => need to check if it is within the offset
let sequence = write.meta().sequence().unwrap();
if sequence.number >= playback_state.offset {
if sequence.number >= self.offset {
// within offset => return entry to caller
return Poll::Ready(Some(Ok(write.clone())));
} else {
@ -451,52 +433,51 @@ impl WriteBufferReading for MockBufferForReading {
writes_vec.register_waker(cx.waker());
Poll::Pending
})
.boxed();
.boxed()
}
let shared_state = self.shared_state.clone();
async fn seek(&mut self, sequence_number: u64) -> Result<(), WriteBufferError> {
self.offset = sequence_number;
let fetch_high_watermark = move || {
let shared_state = shared_state.clone();
// reset position to start since seeking might go backwards
self.vector_index = 0;
let fut = async move {
let guard = shared_state.writes.lock();
Ok(())
}
}
#[async_trait]
impl WriteBufferReading for MockBufferForReading {
fn sequencer_ids(&self) -> BTreeSet<u32> {
(0..self.n_sequencers).into_iter().collect()
}
async fn stream_handler(
&self,
sequencer_id: u32,
) -> Result<Box<dyn WriteBufferStreamHandler>, WriteBufferError> {
if sequencer_id >= self.n_sequencers {
return Err(format!("Unknown sequencer: {}", sequencer_id).into());
}
Ok(Box::new(MockBufferStreamHandler {
shared_state: self.shared_state.clone(),
sequencer_id,
vector_index: 0,
offset: 0,
}))
}
async fn fetch_high_watermark(&self, sequencer_id: u32) -> Result<u64, WriteBufferError> {
let guard = self.shared_state.writes.lock();
let entries = guard.as_ref().unwrap();
let entry_vec = entries.get(&sequencer_id).unwrap();
let entry_vec = entries
.get(&sequencer_id)
.ok_or_else::<WriteBufferError, _>(|| {
format!("Unknown sequencer: {}", sequencer_id).into()
})?;
let watermark = entry_vec.max_seqno.map(|n| n + 1).unwrap_or(0);
Ok(watermark)
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.insert(
sequencer_id,
WriteStream {
stream,
fetch_high_watermark,
},
);
}
streams
}
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let mut playback_states = self.playback_states.lock();
if let Some(playback_state) = playback_states.get_mut(&sequencer_id) {
playback_state.offset = sequence_number;
// reset position to start since seeking might go backwards
playback_state.vector_index = 0;
}
Ok(())
}
fn type_name(&self) -> &'static str {
@ -507,40 +488,42 @@ impl WriteBufferReading for MockBufferForReading {
#[derive(Debug, Default, Clone, Copy)]
pub struct MockBufferForReadingThatAlwaysErrors;
#[derive(Debug, Default, Clone, Copy)]
pub struct MockStreamHandlerThatAlwaysErrors;
#[async_trait]
impl WriteBufferReading for MockBufferForReadingThatAlwaysErrors {
fn streams(&mut self) -> BTreeMap<u32, WriteStream<'_>> {
let stream = stream::poll_fn(|_ctx| {
impl WriteBufferStreamHandler for MockStreamHandlerThatAlwaysErrors {
fn stream(&mut self) -> BoxStream<'_, Result<DmlOperation, WriteBufferError>> {
futures::stream::poll_fn(|_cx| {
Poll::Ready(Some(Err(String::from(
"Something bad happened while reading from stream",
)
.into())))
})
.boxed();
let fetch_high_watermark = move || {
let fut = async move {
Err(String::from("Something bad happened while fetching the high watermark").into())
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
IntoIterator::into_iter([(
0,
WriteStream {
stream,
fetch_high_watermark,
},
)])
.collect()
.boxed()
}
async fn seek(
&mut self,
_sequencer_id: u32,
_sequence_number: u64,
) -> Result<(), WriteBufferError> {
async fn seek(&mut self, _sequence_number: u64) -> Result<(), WriteBufferError> {
Err(String::from("Something bad happened while seeking the stream").into())
}
}
#[async_trait]
impl WriteBufferReading for MockBufferForReadingThatAlwaysErrors {
fn sequencer_ids(&self) -> BTreeSet<u32> {
BTreeSet::from([0])
}
async fn stream_handler(
&self,
_sequencer_id: u32,
) -> Result<Box<dyn WriteBufferStreamHandler>, WriteBufferError> {
Ok(Box::new(MockStreamHandlerThatAlwaysErrors {}))
}
async fn fetch_high_watermark(&self, _sequencer_id: u32) -> Result<u64, WriteBufferError> {
Err(String::from("Something bad happened while fetching the high watermark").into())
}
fn type_name(&self) -> &'static str {
"mock_failing"
@ -552,11 +535,13 @@ mod tests {
use std::convert::TryFrom;
use std::time::Duration;
use futures::StreamExt;
use mutable_batch_lp::lines_to_batches;
use test_helpers::assert_contains;
use time::TimeProvider;
use trace::RingBufferTraceCollector;
use crate::core::test_utils::{map_pop_first, perform_generic_tests, TestAdapter, TestContext};
use crate::core::test_utils::{perform_generic_tests, TestAdapter, TestContext};
use super::*;
@ -739,26 +724,34 @@ mod tests {
#[tokio::test]
async fn test_always_error_read() {
let mut reader = MockBufferForReadingThatAlwaysErrors {};
let reader = MockBufferForReadingThatAlwaysErrors {};
assert_eq!(
reader.seek(0, 0).await.unwrap_err().to_string(),
"Something bad happened while seeking the stream"
);
let mut streams = reader.streams();
let (_id, mut stream) = map_pop_first(&mut streams).unwrap();
assert_eq!(
stream.stream.next().await.unwrap().unwrap_err().to_string(),
"Something bad happened while reading from stream"
);
assert_eq!(
(stream.fetch_high_watermark)()
assert_contains!(
reader
.fetch_high_watermark(0)
.await
.unwrap_err()
.to_string(),
"Something bad happened while fetching the high watermark"
);
let mut stream_handler = reader.stream_handler(0).await.unwrap();
assert_contains!(
stream_handler.seek(0).await.unwrap_err().to_string(),
"Something bad happened while seeking the stream"
);
assert_contains!(
stream_handler
.stream()
.next()
.await
.unwrap()
.unwrap_err()
.to_string(),
"Something bad happened while reading from stream"
);
}
#[tokio::test]
@ -768,7 +761,7 @@ mod tests {
let tables = lines_to_batches("upc user=1 100", 0).unwrap();
let operation = DmlOperation::Write(DmlWrite::new("test_db", tables, Default::default()));
assert_eq!(
assert_contains!(
writer
.store_operation(0, &operation)
.await
@ -823,19 +816,20 @@ mod tests {
state.push_lp(Sequence::new(0, 0), "mem foo=1 10");
let mut read = MockBufferForReading::new(state.clone(), None).unwrap();
let playback_state = Arc::clone(&read.playback_states);
let read = MockBufferForReading::new(state.clone(), None).unwrap();
let barrier = Arc::new(tokio::sync::Barrier::new(2));
let barrier_captured = Arc::clone(&barrier);
let consumer = tokio::spawn(async move {
let mut stream = map_pop_first(&mut read.streams()).unwrap().1.stream;
let mut stream_handler = read.stream_handler(0).await.unwrap();
let mut stream = stream_handler.stream();
stream.next().await.unwrap().unwrap();
barrier_captured.wait().await;
stream.next().await.unwrap().unwrap();
});
// Wait for consumer to read first entry
while playback_state.lock().get(&0).unwrap().vector_index < 1 {
tokio::time::sleep(Duration::from_millis(1)).await;
}
barrier.wait().await;
state.push_lp(Sequence::new(0, 1), "mem foo=2 20");