Merge branch 'main' into savage/hook-up-wal-reference-counter-actor

commit 4572057a11

Cargo.lock

@@ -161,8 +161,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arrow"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2feeebd77b34b0bc88f224e06d01c27da4733997cc4789a4e056196656cdc59a"
dependencies = [
 "ahash 0.8.3",
 "arrow-arith",

@@ -182,8 +183,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7173f5dc49c0ecb5135f52565af33afd3fdc9a12d13bd6f9973e8b96305e4b2e"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -196,8 +198,9 @@ dependencies = [
[[package]]
name = "arrow-array"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63d7ea725f7d1f8bb2cffc53ef538557e95fc802e217d5be25122d402e22f3d0"
dependencies = [
 "ahash 0.8.3",
 "arrow-buffer",

@@ -212,8 +215,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdbe439e077f484e5000b9e1d47b5e4c0d15f2b311a8f5bcc682553d5d67a722"
dependencies = [
 "half 2.3.1",
 "num",

@@ -221,8 +225,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93913cc14875770aa1eef5e310765e855effa352c094cb1c7c00607d0f37b4e1"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -238,8 +243,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef55b67c55ed877e6fe7b923121c19dae5e31ca70249ea2779a17b58fb0fbd9a"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -256,8 +262,9 @@ dependencies = [
[[package]]
name = "arrow-data"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4f4f4a3c54614126a71ab91f6631c9743eb4643d6e9318b74191da9dc6e028b"
dependencies = [
 "arrow-buffer",
 "arrow-schema",

@@ -267,8 +274,9 @@ dependencies = [
[[package]]
name = "arrow-flight"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1128a9f099b4e8dc9a67aed274061f3cc95afd8b7aab98f2b44cb8b7b542b71"
dependencies = [
 "arrow-arith",
 "arrow-array",

@@ -293,8 +301,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d41a3659f984a524ef1c2981d43747b24d8eec78e2425267fcd0ef34ce71cd18"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -306,8 +315,9 @@ dependencies = [
[[package]]
name = "arrow-json"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10b95faa95a378f56ef32d84cc0104ea998c39ef7cd1faaa6b4cebf8ea92846d"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -316,7 +326,7 @@ dependencies = [
 "arrow-schema",
 "chrono",
 "half 2.3.1",
 "indexmap 1.9.3",
 "indexmap 2.0.0",
 "lexical-core",
 "num",
 "serde",

@@ -325,8 +335,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c68549a4284d9f8b39586afb8d5ff8158b8f0286353a4844deb1d11cf1ba1f26"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -339,8 +350,9 @@ dependencies = [
[[package]]
name = "arrow-row"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a75a4a757afc301ce010adadff54d79d66140c4282ed3de565f6ccb716a5cf3"
dependencies = [
 "ahash 0.8.3",
 "arrow-array",

@@ -353,13 +365,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bebcb57eef570b15afbcf2d07d813eb476fde9f6dd69c81004d6476c197e87e"

[[package]]
name = "arrow-select"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6e2943fa433a48921e914417173816af64eef61c0a3d448280e6c40a62df221"
dependencies = [
 "arrow-array",
 "arrow-buffer",

@@ -370,16 +384,18 @@ dependencies = [
[[package]]
name = "arrow-string"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbc92ed638851774f6d7af1ad900b92bc1486746497511868b4298fcbcfa35af"
dependencies = [
 "arrow-array",
 "arrow-buffer",
 "arrow-data",
 "arrow-schema",
 "arrow-select",
 "num",
 "regex",
 "regex-syntax 0.7.2",
 "regex-syntax 0.7.4",
]

[[package]]

@@ -434,9 +450,9 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9"
[[package]]
name = "async-channel"
version = "1.8.0"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf46fee83e5ccffc220104713af3292ff9bc7c64c7de289f66dae8e38d826833"
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
dependencies = [
 "concurrent-queue",
 "event-listener",

@@ -485,9 +501,9 @@ dependencies = [
[[package]]
name = "async-trait"
version = "0.1.70"
version = "0.1.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79fa67157abdfd688a259b6648808757db9347af834624f27ec646da976aee5d"
checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf"
dependencies = [
 "proc-macro2",
 "quote",

@@ -686,7 +702,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
dependencies = [
 "memchr",
 "once_cell",
 "regex-automata",
 "regex-automata 0.1.10",
 "serde",
]

@@ -841,9 +857,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.3.10"
version = "4.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384e169cc618c613d5e3ca6404dda77a8685a63e08660dcc64abaf7da7cb0c7a"
checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d"
dependencies = [
 "clap_builder",
 "clap_derive",

@@ -873,9 +889,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.3.10"
version = "4.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef137bbe35aab78bdb468ccfba75a5f4d8321ae011d34063770780545176af2d"
checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b"
dependencies = [
 "anstream",
 "anstyle",

@@ -1302,16 +1318,6 @@ dependencies = [
 "memchr",
]

[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
 "quote",
 "syn 1.0.109",
]

[[package]]
name = "dashmap"
version = "5.4.0"

@@ -1353,8 +1359,8 @@ dependencies = [
[[package]]
name = "datafusion"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "ahash 0.8.3",
 "arrow",

@@ -1377,8 +1383,8 @@ dependencies = [
 "futures",
 "glob",
 "hashbrown 0.14.0",
 "indexmap 1.9.3",
 "itertools 0.10.5",
 "indexmap 2.0.0",
 "itertools 0.11.0",
 "lazy_static",
 "log",
 "num_cpus",

@@ -1389,7 +1395,7 @@ dependencies = [
 "pin-project-lite",
 "rand",
 "smallvec",
 "sqlparser 0.34.0",
 "sqlparser",
 "tempfile",
 "tokio",
 "tokio-util",

@@ -1401,8 +1407,8 @@ dependencies = [
[[package]]
name = "datafusion-common"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "arrow",
 "arrow-array",

@@ -1410,13 +1416,13 @@ dependencies = [
 "num_cpus",
 "object_store",
 "parquet",
 "sqlparser 0.34.0",
 "sqlparser",
]

[[package]]
name = "datafusion-execution"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "dashmap",
 "datafusion-common",

@@ -1432,22 +1438,22 @@ dependencies = [
[[package]]
name = "datafusion-expr"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "ahash 0.8.3",
 "arrow",
 "datafusion-common",
 "lazy_static",
 "sqlparser 0.34.0",
 "sqlparser",
 "strum 0.25.0",
 "strum_macros 0.25.0",
]

[[package]]
name = "datafusion-optimizer"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "arrow",
 "async-trait",

@@ -1456,21 +1462,22 @@ dependencies = [
 "datafusion-expr",
 "datafusion-physical-expr",
 "hashbrown 0.14.0",
 "itertools 0.10.5",
 "itertools 0.11.0",
 "log",
 "regex-syntax 0.7.2",
 "regex-syntax 0.7.4",
]

[[package]]
name = "datafusion-physical-expr"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "ahash 0.8.3",
 "arrow",
 "arrow-array",
 "arrow-buffer",
 "arrow-schema",
 "base64 0.21.2",
 "blake2",
 "blake3",
 "chrono",

@@ -1479,8 +1486,9 @@ dependencies = [
 "datafusion-row",
 "half 2.3.1",
 "hashbrown 0.14.0",
 "indexmap 1.9.3",
 "itertools 0.10.5",
 "hex",
 "indexmap 2.0.0",
 "itertools 0.11.0",
 "lazy_static",
 "libc",
 "md-5",

@@ -1495,8 +1503,8 @@ dependencies = [
[[package]]
name = "datafusion-proto"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "arrow",
 "chrono",

@@ -1509,8 +1517,8 @@ dependencies = [
[[package]]
name = "datafusion-row"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "arrow",
 "datafusion-common",

@@ -1520,15 +1528,15 @@ dependencies = [
[[package]]
name = "datafusion-sql"
version = "26.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=e0330d6c957c724fcc91b673c6ae10c535d9a33a#e0330d6c957c724fcc91b673c6ae10c535d9a33a"
version = "27.0.0"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=04ecaf7405dbbfd43f43acec972f2435ada5ee81#04ecaf7405dbbfd43f43acec972f2435ada5ee81"
dependencies = [
 "arrow",
 "arrow-schema",
 "datafusion-common",
 "datafusion-expr",
 "log",
 "sqlparser 0.34.0",
 "sqlparser",
]

[[package]]

@@ -1723,7 +1731,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5"
dependencies = [
 "cfg-if",
 "rustix 0.38.2",
 "rustix 0.38.4",
 "windows-sys 0.48.0",
]

@@ -2010,6 +2018,24 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

[[package]]
name = "gossip"
version = "0.1.0"
dependencies = [
 "async-trait",
 "futures",
 "hashbrown 0.14.0",
 "metric",
 "prost",
 "prost-build",
 "test_helpers",
 "thiserror",
 "tokio",
 "tracing",
 "uuid",
 "workspace-hack",
]

[[package]]
name = "grpc-binary-logger"
version = "0.1.0"

@@ -2287,7 +2313,7 @@ checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
 "http",
 "hyper",
 "rustls 0.21.2",
 "rustls 0.21.5",
 "tokio",
 "tokio-rustls 0.24.1",
]

@@ -2341,9 +2367,16 @@ dependencies = [
name = "import_export"
version = "0.1.0"
dependencies = [
 "bytes",
 "data_types",
 "futures-util",
 "generated_types",
 "influxdb_iox_client",
 "iox_catalog",
 "object_store",
 "observability_deps",
 "parquet_file",
 "schema",
 "serde_json",
 "thiserror",
 "tokio",

@@ -2585,7 +2618,7 @@ version = "0.1.0"
dependencies = [
 "generated_types",
 "snafu",
 "sqlparser 0.35.0",
 "sqlparser",
 "workspace-hack",
]

@@ -2642,6 +2675,7 @@ dependencies = [
 "tokio-util",
 "tonic",
 "trace",
 "tracker",
 "uuid",
 "wal",
 "workspace-hack",

@@ -3110,12 +3144,12 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
[[package]]
name = "is-terminal"
version = "0.4.8"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24fddda5af7e54bf7da53067d6e802dbcc381d0a8eef629df528e3ebf68755cb"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
 "hermit-abi",
 "rustix 0.38.2",
 "rustix 0.38.4",
 "windows-sys 0.48.0",
]

@@ -3337,7 +3371,7 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
 "regex-automata",
 "regex-automata 0.1.10",
]

[[package]]

@@ -3565,6 +3599,15 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be"

[[package]]
name = "ntapi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
dependencies = [
 "winapi",
]

[[package]]
name = "nu-ansi-term"
version = "0.46.0"

@@ -3777,15 +3820,6 @@ dependencies = [
 "num-traits",
]

[[package]]
name = "output_vt100"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "628223faebab4e3e40667ee0b2336d34a5b960ff60ea743ddfdbcf7770bcfb66"
dependencies = [
 "winapi",
]

[[package]]
name = "overload"
version = "0.1.1"

@@ -3852,8 +3886,9 @@ dependencies = [
[[package]]
name = "parquet"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
version = "43.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec7267a9607c3f955d4d0ac41b88a67cecc0d8d009173ad3da390699a6cb3750"
dependencies = [
 "ahash 0.8.3",
 "arrow-array",

@@ -4174,7 +4209,7 @@ dependencies = [
 "query_functions",
 "schema",
 "snafu",
 "sqlparser 0.35.0",
 "sqlparser",
 "test_helpers",
 "workspace-hack",
]

@@ -4212,13 +4247,11 @@ dependencies = [
[[package]]
name = "pretty_assertions"
version = "1.3.0"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a25e9bcb20aa780fd0bb16b72403a9064d6b3f22f026946029acb941a50af755"
checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66"
dependencies = [
 "ctor",
 "diff",
 "output_vt100",
 "yansi",
]

@@ -4383,7 +4416,6 @@ dependencies = [
 "tokio-util",
 "tonic",
 "trace",
 "trace_exporters",
 "trace_http",
 "tracker",
 "uuid",

@@ -4401,7 +4433,7 @@ dependencies = [
 "itertools 0.11.0",
 "once_cell",
 "regex",
 "regex-syntax 0.7.2",
 "regex-syntax 0.7.4",
 "schema",
 "snafu",
 "tokio",

@@ -4528,13 +4560,14 @@ dependencies = [
[[package]]
name = "regex"
version = "1.8.4"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax 0.7.2",
 "regex-automata 0.3.2",
 "regex-syntax 0.7.4",
]

[[package]]

@@ -4546,6 +4579,17 @@ dependencies = [
 "regex-syntax 0.6.29",
]

[[package]]
name = "regex-automata"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax 0.7.4",
]

[[package]]
name = "regex-syntax"
version = "0.6.29"

@@ -4554,9 +4598,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.7.2"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"

[[package]]
name = "reqwest"

@@ -4581,7 +4625,7 @@ dependencies = [
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
 "rustls 0.21.2",
 "rustls 0.21.5",
 "rustls-pemfile",
 "serde",
 "serde_json",

@@ -4708,9 +4752,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.38.2"
version = "0.38.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aabcb0461ebd01d6b79945797c27f8529082226cb630a9865a71870ff63532a4"
checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
dependencies = [
 "bitflags 2.3.3",
 "errno",

@@ -4733,13 +4777,13 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.21.2"
version = "0.21.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e32ca28af694bc1bbf399c33a516dbdf1c90090b8ab23c2bc24f834aa2247f5f"
checksum = "79ea77c539259495ce8ca47f53e66ae0330a8819f67e23ac96ca02f50e7b7d36"
dependencies = [
 "log",
 "ring",
 "rustls-webpki",
 "rustls-webpki 0.101.1",
 "sct",
]

@@ -4762,6 +4806,16 @@ dependencies = [
 "untrusted",
]

[[package]]
name = "rustls-webpki"
version = "0.101.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15f36a6828982f422756984e47912a7a51dcbc2a197aa791158f8ca61cd8204e"
dependencies = [
 "ring",
 "untrusted",
]

[[package]]
name = "rustversion"
version = "1.0.12"

@@ -4846,18 +4900,18 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc"
[[package]]
name = "serde"
version = "1.0.166"
version = "1.0.168"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d01b7404f9d441d3ad40e6a636a7782c377d2abdbe4fa2440e2edcc2f4f10db8"
checksum = "d614f89548720367ded108b3c843be93f3a341e22d5674ca0dd5cd57f34926af"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.166"
version = "1.0.168"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dd83d6dde2b6b2d466e14d9d1acce8816dedee94f735eac6395808b3483c6d6"
checksum = "d4fe589678c688e44177da4f27152ee2d190757271dc7f1d5b6b9f68d869d641"
dependencies = [
 "proc-macro2",
 "quote",

@@ -4866,9 +4920,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.100"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f1e14e89be7aa4c4b78bdbdc9eb5bf8517829a600ae8eaa39a6e1d960b5185c"
checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed"
dependencies = [
 "itoa",
 "ryu",

@@ -4954,6 +5008,7 @@ dependencies = [
 "serde_json",
 "service_common",
 "snafu",
 "test_helpers",
 "tokio",
 "tonic",
 "trace",

@@ -5150,15 +5205,15 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.10.0"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

[[package]]
name = "snafu"
version = "0.7.4"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb0656e7e3ffb70f6c39b3c2a86332bb74aa3c679da781642590f3c1118c5045"
checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6"
dependencies = [
 "doc-comment",
 "snafu-derive",

@@ -5166,9 +5221,9 @@ dependencies = [
[[package]]
name = "snafu-derive"
version = "0.7.4"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "475b3bbe5245c26f2d8a6f62d67c1f30eb9fffeccee721c45d162c3ebbdf81b2"
checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf"
dependencies = [
 "heck",
 "proc-macro2",

@@ -5218,16 +5273,6 @@ dependencies = [
 "unicode_categories",
]

[[package]]
name = "sqlparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59"
dependencies = [
 "log",
 "sqlparser_derive",
]

[[package]]
name = "sqlparser"
version = "0.35.0"

@@ -5235,6 +5280,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca597d77c98894be1f965f2e4e2d2a61575d4998088e655476c73715c54b2b43"
dependencies = [
 "log",
 "sqlparser_derive",
]

[[package]]

@@ -5503,6 +5549,21 @@ version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3543ca0810e71767052bdcdd5653f23998b192642a22c5164bfa6581e40a4a2"

[[package]]
name = "sysinfo"
version = "0.29.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "751e810399bba86e9326f5762b7f32ac5a085542df78da6a78d94e07d14d7c11"
dependencies = [
 "cfg-if",
 "core-foundation-sys",
 "libc",
 "ntapi",
 "once_cell",
 "rayon",
 "winapi",
]

[[package]]
name = "tempfile"
version = "3.6.0"

@@ -5578,18 +5639,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.41"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c16a64ba9387ef3fdae4f9c1a7f07a0997fce91985c0336f1ddc1822b3b37802"
checksum = "a35fc5b8971143ca348fa6df4f024d4d55264f3468c71ad1c2f365b0a4d58c42"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "1.0.41"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d14928354b01c4d6a4f0e549069adef399a284e7995c7ccca94e8a07a5346c59"
checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
dependencies = [
 "proc-macro2",
 "quote",

@@ -5742,7 +5803,7 @@ version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081"
dependencies = [
 "rustls 0.21.2",
 "rustls 0.21.5",
 "tokio",
]

@@ -5784,9 +5845,9 @@ dependencies = [
[[package]]
name = "toml"
version = "0.7.5"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ebafdf5ad1220cb59e7d17cf4d2c72015297b75b19a10472f99b89225089240"
checksum = "c17e963a819c331dcacd7ab957d80bc2b9a9c1e71c804826d2f283dd65306542"
dependencies = [
 "serde",
 "serde_spanned",

@@ -5805,9 +5866,9 @@ dependencies = [
[[package]]
name = "toml_edit"
version = "0.19.11"
version = "0.19.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "266f016b7f039eec8a1a80dfe6156b633d208b9fccca5e4db1d6775b0c4e34a7"
checksum = "c500344a19072298cd05a7224b3c0c629348b78692bf48466c5238656e315a78"
dependencies = [
 "indexmap 2.0.0",
 "serde",

@@ -6074,6 +6135,8 @@ dependencies = [
 "observability_deps",
 "parking_lot 0.12.1",
 "pin-project",
 "sysinfo",
 "tempfile",
 "tokio",
 "tokio-util",
 "trace",

@@ -6410,7 +6473,7 @@ version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
dependencies = [
 "rustls-webpki",
 "rustls-webpki 0.100.1",
]

[[package]]

@@ -6662,6 +6725,7 @@ dependencies = [
 "hashbrown 0.14.0",
 "heck",
 "indexmap 1.9.3",
 "indexmap 2.0.0",
 "itertools 0.10.5",
 "libc",
 "lock_api",

@@ -6684,17 +6748,19 @@ dependencies = [
 "rand",
 "rand_core",
 "regex",
 "regex-syntax 0.7.2",
 "regex-automata 0.3.2",
 "regex-syntax 0.7.4",
 "reqwest",
 "ring",
 "rustix 0.38.2",
 "rustls 0.21.2",
 "rustix 0.38.4",
 "rustls 0.21.5",
 "scopeguard",
 "serde",
 "serde_json",
 "sha2",
 "similar",
 "smallvec",
 "sqlparser",
 "sqlx",
 "sqlx-core",
 "sqlx-macros",
Cargo.toml

@@ -17,6 +17,7 @@ members = [
    "flightsql",
    "garbage_collector",
    "generated_types",
    "gossip",
    "grpc-binary-logger-proto",
    "grpc-binary-logger-test-proto",
    "grpc-binary-logger",

@@ -29,9 +30,9 @@ members = [
    "influxdb_tsm",
    "influxdb2_client",
    "influxrpc_parser",
    "ingester_query_grpc",
    "ingester_test_ctx",
    "ingester",
    "ingester_query_grpc",
    "iox_catalog",
    "iox_data_generator",
    "iox_query_influxql",

@@ -82,8 +83,8 @@ members = [
    "trace",
    "tracker",
    "trogging",
    "wal",
    "wal_inspect",
    "wal",
    "workspace-hack",
]
default-members = ["influxdb_iox"]

@@ -118,13 +119,14 @@ edition = "2021"
license = "MIT OR Apache-2.0"

[workspace.dependencies]
arrow = { version = "42.0.0" }
arrow-flight = { version = "42.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "e0330d6c957c724fcc91b673c6ae10c535d9a33a", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "e0330d6c957c724fcc91b673c6ae10c535d9a33a" }
arrow = { version = "43.0.0" }
arrow-flight = { version = "43.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "04ecaf7405dbbfd43f43acec972f2435ada5ee81", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "04ecaf7405dbbfd43f43acec972f2435ada5ee81" }

hashbrown = { version = "0.14.0" }
object_store = { version = "0.6.0" }
parquet = { version = "42.0.0" }
parquet = { version = "43.0.0" }
tonic = { version = "0.9.2", features = ["tls", "tls-webpki-roots"] }
tonic-build = { version = "0.9.2" }
tonic-health = { version = "0.9.2" }

@@ -154,22 +156,3 @@ opt-level = 3

[profile.dev.package.similar]
opt-level = 3

[patch.crates-io]
# TODO remove on upgrade to 43.0.0
# Use https://github.com/apache/arrow-rs/pull/4467 to get the fix for
# https://github.com/apache/arrow-rs/issues/4459
parquet = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-data = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-array = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-select = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-cast = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-ipc = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-row = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-arith = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-string = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/42.0.0_patched" }
@@ -16,7 +16,7 @@ comfy-table = { version = "7.0", default-features = false }
hashbrown = { workspace = true }
num-traits = "0.2"
once_cell = { version = "1.18", features = ["parking_lot"] }
regex = "1.8.4"
regex = "1.9.1"
snafu = "0.7"
uuid = "1"
workspace-hack = { version = "0.1", path = "../workspace-hack" }

@@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true

[dependencies]
async-trait = "0.1.70"
async-trait = "0.1.71"
backoff = { path = "../backoff" }
futures = "0.3"
iox_time = { path = "../iox_time" }

@@ -9,7 +9,7 @@ license.workspace = true
[dependencies]
http = "0.2.9"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
thiserror = "1.0.41"
thiserror = "1.0.43"
tonic = { workspace = true }
tower = "0.4"
workspace-hack = { version = "0.1", path = "../workspace-hack" }

@@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true

[dependencies]
async-trait = "0.1.70"
async-trait = "0.1.71"
backoff = { path = "../backoff" }
bytes = "1.4"
compactor_scheduler = { path = "../compactor_scheduler" }
@@ -7,8 +7,8 @@ use datafusion::{
    execution::context::TaskContext,
    physical_expr::PhysicalSortExpr,
    physical_plan::{
        stream::RecordBatchStreamAdapter, ExecutionPlan, Partitioning, SendableRecordBatchStream,
        Statistics,
        stream::RecordBatchStreamAdapter, DisplayAs, DisplayFormatType, ExecutionPlan,
        Partitioning, SendableRecordBatchStream, Statistics,
    },
};
use schema::SchemaBuilder;

@@ -93,6 +93,16 @@ impl ExecutionPlan for PanicPlan {
    }
}

impl DisplayAs for PanicPlan {
    fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(f, "PanicPlan")
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use data_types::CompactionLevel;
@@ -44,7 +44,9 @@ use super::{
        mock::MockPartitionDoneSink, PartitionDoneSink,
    },
    partition_files_source::{
        catalog::CatalogPartitionFilesSource, rate_limit::QueryRateLimit, PartitionFilesSource,
        catalog::{CatalogPartitionFilesSource, QueryRateLimiter},
        rate_limit::RateLimit,
        PartitionFilesSource,
    },
    partition_filter::{
        and::AndPartitionFilter, greater_matching_files::GreaterMatchingFilesPartitionFilter,

@@ -237,7 +239,7 @@ fn make_partition_files_source(config: &Config) -> Arc<dyn PartitionFilesSource>
    match config.max_partition_fetch_queries_per_second {
        Some(rps) => Arc::new(CatalogPartitionFilesSource::new(
            config.backoff_config.clone(),
            QueryRateLimit::new(Arc::clone(&config.catalog), rps),
            QueryRateLimiter::new(Arc::clone(&config.catalog), RateLimit::new(rps)),
        )),
        None => Arc::new(CatalogPartitionFilesSource::new(
            config.backoff_config.clone(),
@@ -5,10 +5,11 @@ use std::{

use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{ParquetFile, PartitionId};
use data_types::{ParquetFile, PartitionId, TransitionPartitionId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::warn;

use super::{rate_limit::QueryRateLimit, PartitionFilesSource};
use super::{rate_limit::RateLimit, PartitionFilesSource};

#[async_trait]
pub(crate) trait CatalogQuerier: Send + Sync + Debug {

@@ -18,6 +19,39 @@ pub(crate) trait CatalogQuerier: Send + Sync + Debug {
    ) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error>;
}

/// A QueryRateLimiter applies a RateLimit to a CatalogQuerier.
#[derive(Debug)]
pub struct QueryRateLimiter<T> {
    inner: T,
    rate_limit: RateLimit,
}

impl<T> QueryRateLimiter<T> {
    pub fn new(inner: T, rate_limit: RateLimit) -> Self {
        Self { inner, rate_limit }
    }
}

#[async_trait]
impl<T> CatalogQuerier for QueryRateLimiter<T>
where
    T: CatalogQuerier,
{
    async fn get_partitions(
        &self,
        partition_id: PartitionId,
    ) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
        while let Some(d) = self.rate_limit.can_proceed() {
            warn!(%partition_id, "partition fetch rate limited");

            // Don't busy loop - wait the fractions of a second before a retry
            // is allowed.
            tokio::time::sleep(d).await;
        }
        self.inner.get_partitions(partition_id).await
    }
}

#[async_trait]
impl CatalogQuerier for Arc<dyn Catalog> {
    async fn get_partitions(

@@ -27,13 +61,13 @@ impl CatalogQuerier for Arc<dyn Catalog> {
        self.repositories()
            .await
            .parquet_files()
            .list_by_partition_not_to_delete(partition_id)
            .list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
            .await
    }
}

#[derive(Debug)]
pub struct CatalogPartitionFilesSource<T = QueryRateLimit<Arc<dyn Catalog>>> {
pub struct CatalogPartitionFilesSource<T = QueryRateLimiter<Arc<dyn Catalog>>> {
    backoff_config: BackoffConfig,
    catalog: T,
}

@@ -67,3 +101,53 @@ where
            .expect("retry forever")
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::{sync::Mutex, time::Duration};
    use tokio::time::Instant;

    /// A [`CatalogQuerier`] that always returns OK, and counts the number of
    /// calls made.
    #[derive(Debug, Default)]
    struct MockInner(Mutex<usize>);
    #[async_trait]
    impl CatalogQuerier for &MockInner {
        async fn get_partitions(
            &self,
            _partition_id: PartitionId,
        ) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
            *self.0.lock().unwrap() += 1;
            Ok(vec![])
        }
    }

    #[tokio::test]
    async fn test_rate_limit() {
        const ALLOWED_PER_SECOND: usize = 100;

        let inner = MockInner::default();
        let r = QueryRateLimiter::new(&inner, RateLimit::new(ALLOWED_PER_SECOND));

        let start = Instant::now();

        // If there are ALLOWED_PER_SECOND queries allowed per second, then it
        // should take 1 second to issue ALLOWED_PER_SECOND number of queries.
        //
        // Attempt to make 1/10th the number of permissible queries per second,
        // which should take at least 1/10th of a second due to smoothing, so
        // the test does not take so long.
        for _ in 0..(ALLOWED_PER_SECOND / 10) {
            r.get_partitions(PartitionId::new(42)).await.unwrap();
        }

        // It should have taken at least 1/10th of a second
        let duration = Instant::now() - start;
        assert!(duration > Duration::from_millis(ALLOWED_PER_SECOND as u64 / 10));

        // Exactly 1/10th the number of queries should be dispatched to the
        // inner impl.
        assert_eq!(*inner.0.lock().unwrap(), ALLOWED_PER_SECOND / 10);
    }
}
@@ -1,36 +1,30 @@
use std::{sync::Mutex, time::Duration};

use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId};
use observability_deps::tracing::warn;
use tokio::time::Instant;

use super::catalog::CatalogQuerier;

/// A [`CatalogQuerier`] rate limiter that smooths `N` queries over a second.
/// A [`RateLimit`] rate limiter that smooths `N` queries over a second.
#[derive(Debug)]
pub struct QueryRateLimit<T> {
    inner: T,

pub struct RateLimit {
    last_query: Mutex<Instant>,
    min_interval: Duration,
    min_interval: Mutex<Duration>,
}

impl<T> QueryRateLimit<T> {
    pub(crate) fn new(inner: T, rps: usize) -> Self {
impl RateLimit {
    pub(crate) fn new(rps: usize) -> Self {
        Self {
            inner,
            last_query: Mutex::new(Instant::now()),
            min_interval: Duration::from_secs(1) / rps as u32,
            min_interval: Mutex::new(Duration::from_secs(1) / rps as u32),
        }
    }

    fn can_proceed(&self) -> Option<Duration> {
    pub fn can_proceed(&self) -> Option<Duration> {
        let mut last_query = self.last_query.lock().unwrap();
        let now = Instant::now();

        // Has enough time passed since the last query was allowed?
        let next_allowed = last_query.checked_add(self.min_interval).unwrap();
        let next_allowed = last_query
            .checked_add(*self.min_interval.lock().unwrap())
            .unwrap();
        if now < next_allowed {
            return Some(next_allowed - now);
        }

@@ -38,72 +32,8 @@ impl<T> QueryRateLimit<T> {
        *last_query = now;
        None
    }
}

#[async_trait]
impl<T> CatalogQuerier for QueryRateLimit<T>
where
    T: CatalogQuerier,
{
    async fn get_partitions(
        &self,
        partition_id: PartitionId,
    ) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
        while let Some(d) = self.can_proceed() {
            warn!(%partition_id, "partition fetch rate limited");

            // Don't busy loop - wait the fractions of a second before a retry
            // is allowed.
            tokio::time::sleep(d).await;
        }
        self.inner.get_partitions(partition_id).await
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A [`CatalogQuerier`] that always returns OK, and counts the number of
    /// calls made.
    #[derive(Debug, Default)]
    struct MockInner(Mutex<usize>);
    #[async_trait]
    impl CatalogQuerier for &MockInner {
        async fn get_partitions(
            &self,
            _partition_id: PartitionId,
        ) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
            *self.0.lock().unwrap() += 1;
            Ok(vec![])
        }
    }

    #[tokio::test]
    async fn test_rate_limit() {
        const ALLOWED_PER_SECOND: usize = 100;

        let inner = MockInner::default();
        let r = QueryRateLimit::new(&inner, ALLOWED_PER_SECOND);

        let start = Instant::now();

        // If there are ALLOWED_PER_SECOND queries allowed per second, then it
        // should take 1 second to issue ALLOWED_PER_SECOND number of queries.
        //
        // Attempt to make 1/10th the number of permissible queries per second,
        // which should take at least 1/10th of a second due to smoothing, so
        // the test does not take so long.
        for _ in 0..(ALLOWED_PER_SECOND / 10) {
            r.get_partitions(PartitionId::new(42)).await.unwrap();
        }

        // It should have taken at least 1/10th of a second
        let duration = Instant::now() - start;
        assert!(duration > Duration::from_millis(ALLOWED_PER_SECOND as u64 / 10));

        // Exactly 1/10th the number of queries should be dispatched to the
        // inner impl.
        assert_eq!(*inner.0.lock().unwrap(), ALLOWED_PER_SECOND / 10);
    pub fn update_rps(&self, rps: usize) {
        *self.min_interval.lock().unwrap() = Duration::from_secs(1) / rps as u32;
    }
}
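For reference, here is a minimal, self-contained sketch of the smoothing idea used by RateLimit above. It is illustrative only, not the crate's code: it uses std::time instead of tokio::time, and the name RateLimitSketch and the main driver are invented for the example. An allowance of N calls per second becomes a minimum interval of 1s/N between permitted calls, can_proceed returns the remaining wait when a caller is too early, and update_rps changes the pace at runtime.

    use std::sync::Mutex;
    use std::time::{Duration, Instant};

    /// Illustrative re-implementation of the rate-limit idea above: `can_proceed`
    /// returns `None` when a call may go ahead, or the remaining wait otherwise.
    #[derive(Debug)]
    struct RateLimitSketch {
        last_call: Mutex<Instant>,
        min_interval: Mutex<Duration>,
    }

    impl RateLimitSketch {
        fn new(rps: usize) -> Self {
            Self {
                last_call: Mutex::new(Instant::now()),
                // N calls per second -> at least 1s/N between calls.
                min_interval: Mutex::new(Duration::from_secs(1) / rps as u32),
            }
        }

        fn can_proceed(&self) -> Option<Duration> {
            let mut last_call = self.last_call.lock().unwrap();
            let now = Instant::now();
            let next_allowed = *last_call + *self.min_interval.lock().unwrap();
            if now < next_allowed {
                return Some(next_allowed - now); // caller should sleep this long and retry
            }
            *last_call = now;
            None
        }

        fn update_rps(&self, rps: usize) {
            *self.min_interval.lock().unwrap() = Duration::from_secs(1) / rps as u32;
        }
    }

    fn main() {
        let limit = RateLimitSketch::new(100); // 100 calls/s -> 10ms spacing
        let start = Instant::now();
        let mut allowed = 0;
        while allowed < 10 {
            match limit.can_proceed() {
                None => allowed += 1,
                Some(wait) => std::thread::sleep(wait),
            }
        }
        // Ten permitted calls at 100 calls/s take roughly 100ms of pacing.
        println!("10 calls took {:?}", start.elapsed());
        limit.update_rps(10); // from here on: 10 calls/s -> 100ms spacing
    }

A caller such as QueryRateLimiter loops on can_proceed and sleeps for the returned duration, which is exactly the pattern shown in the diff above.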
@@ -2,8 +2,8 @@ use std::{fmt::Display, sync::Arc};

use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{Partition, PartitionId};
use iox_catalog::interface::Catalog;
use data_types::{Partition, PartitionId, TransitionPartitionId};
use iox_catalog::{interface::Catalog, partition_lookup};

use super::PartitionSource;

@@ -33,12 +33,9 @@ impl PartitionSource for CatalogPartitionSource {
    async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition> {
        Backoff::new(&self.backoff_config)
            .retry_all_errors("partition_by_id", || async {
                self.catalog
                    .repositories()
                    .await
                    .partitions()
                    .get_by_id(partition_id)
                    .await
                let mut repos = self.catalog.repositories().await;
                let id = TransitionPartitionId::Deprecated(partition_id);
                partition_lookup(repos.as_mut(), &id).await
            })
            .await
            .expect("retry forever")
@@ -4,6 +4,7 @@ use compactor_scheduler::PartitionsSource;
use data_types::PartitionId;
use futures::{stream::BoxStream, StreamExt};

use super::super::partition_files_source::rate_limit::RateLimit;
use super::PartitionStream;

#[derive(Debug)]

@@ -12,6 +13,7 @@ where
    T: PartitionsSource,
{
    source: Arc<T>,
    limiter: RateLimit,
}

impl<T> EndlessPartititionStream<T>

@@ -21,6 +23,7 @@ where
    pub fn new(source: T) -> Self {
        Self {
            source: Arc::new(source),
            limiter: RateLimit::new(1), // Initial rate is irrelevant, it will be updated before first use.
        }
    }
}

@@ -47,12 +50,25 @@ where
            let source = Arc::clone(&source);
            async move {
                loop {
                    while let Some(d) = self.limiter.can_proceed() {
                        // Throttling because either we don't need to go this fast, or we're at risk
                        // of hitting the catalog too hard, or both.
                        tokio::time::sleep(d).await;
                    }

                    if let Some(p_id) = buffer.pop_front() {
                        return Some((p_id, buffer));
                    }

                    // fetch new data
                    buffer = VecDeque::from(source.fetch().await);

                    // update rate limiter so we can complete the batch in 5m, which is plenty fast.
                    let mut rate = buffer.len() / (5 * 60);
                    if rate < 1 {
                        rate = 1;
                    }
                    self.limiter.update_rps(rate);
                }
            }
        })
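The pacing comment above ("complete the batch in 5m") boils down to: after each catalog fetch, set the per-second rate to the batch size divided by 300 seconds, with a floor of one per second. A small illustrative helper (the name target_rps is hypothetical, not part of this change) makes the arithmetic concrete:

    /// Illustrative helper mirroring the pacing rule in the stream above:
    /// drain `batch_len` partitions over roughly 5 minutes (300 s), never below 1/s.
    fn target_rps(batch_len: usize) -> usize {
        let rate = batch_len / (5 * 60);
        if rate < 1 {
            1
        } else {
            rate
        }
    }

    fn main() {
        assert_eq!(target_rps(0), 1); // empty or tiny batches still poll at 1/s
        assert_eq!(target_rps(150), 1); // 150 partitions -> floor of 1/s
        assert_eq!(target_rps(6_000), 20); // 6000 partitions / 300 s = 20/s
        assert_eq!(target_rps(90_000), 300); // very large batch -> 300/s
        println!("pacing examples hold");
    }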
@@ -52,7 +52,7 @@ async fn compact_partition(
    df_semaphore: Arc<InstrumentedAsyncSemaphore>,
    components: Arc<Components>,
) {
    info!(partition_id = partition_id.get(), "compact partition",);
    info!(partition_id = partition_id.get(), timeout = ?partition_timeout, "compact partition",);
    let scratchpad = components.scratchpad_gen.pad();

    let res = timeout_with_progress_checking(partition_timeout, |transmit_progress_signal| {
@@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true

[dependencies]
async-trait = "0.1.70"
async-trait = "0.1.71"
backoff = { path = "../backoff" }
data_types = { path = "../data_types" }
iox_catalog = { path = "../iox_catalog" }
@@ -5,6 +5,7 @@ use backoff::{Backoff, BackoffConfig};
use data_types::PartitionId;
use iox_catalog::interface::Catalog;
use iox_time::{Time, TimeProvider};
use observability_deps::tracing::info;

use crate::PartitionsSource;

@@ -74,21 +75,34 @@ impl PartitionsSource for CatalogToCompactPartitionsSource {
        // we're going to check the time range we'd like to query for against the end time of the last query.
        let mut last = self.last_maximum_time.lock().unwrap();

        // if the last query ended further back in time than this query starts, we're about to skip something.
        if *last < minimum_time {
            if minimum_time.sub(*last) < self.min_threshold * 3 {
                // the end of the last query says we're skipping less than 3x our configured lookback, so
                // back up and query everything since the last query.
                minimum_time = *last;
            } else {
                // end of the last query says we're skipping a lot. We should limit how far we lookback to avoid
                // returning all partitions, so we'll just backup 3x the configured lookback.
                // this might skip something (until cold compaction), but we need a limit in how far we look back.
                minimum_time = self.time_provider.now() - self.min_threshold * 3;
            }
        // query for partitions with activity since the last query. We shouldn't query for a time range
        // we've already covered. So if the prior query was 2m ago, and the query covered 10m, ending at
        // the time of that query, we just need to query for activity in the last 2m. Asking for more than
        // that creates busy-work that will spam the catalog with more queries to determine no compaction
        // needed. But we also don't want to query so far back in time that we get all partitions, so the
        // lookback is limited to 3x the configured threshold.
        if minimum_time < *last || minimum_time.sub(*last) < self.min_threshold * 3 {
            // the end of the last query is less than 3x our configured lookback, so we can query everything
            // since the last query.
            minimum_time = *last;
        } else {
            // end of the last query says we're skipping a lot. We should limit how far we lookback to avoid
            // returning all partitions, so we'll just backup 3x the configured lookback.
            // this might skip something (until cold compaction), but we need a limit in how far we look back.
            minimum_time = self.time_provider.now() - self.min_threshold * 3;
        }
        maximum_time = self.max_threshold.map(|max| self.time_provider.now() - max);

        info!(
            minimum_time = minimum_time.to_string().as_str(),
            maximum_time = maximum_time
                .map(|mt| mt.to_string())
                .unwrap_or(String::from(""))
                .as_str(),
            last_maximum_time = (*last).to_string().as_str(),
            "Fetching partitions to consider for compaction",
        );

        // save the maximum time used in this query to self.last_maximum_time
        *last = maximum_time.unwrap_or(self.time_provider.now());
    }
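A hedged sketch of the window-selection rule introduced above, using simplified stand-in types (SystemTime/Duration rather than the crate's Time) and a hypothetical query_start helper: resume from the end of the previous query unless that would stretch the lookback past three times the configured threshold, in which case cap it.

    use std::time::{Duration, SystemTime};

    /// Start the next query where the previous one ended, unless the gap is so
    /// large that the lookback must be capped at 3x the threshold.
    fn query_start(
        now: SystemTime,
        last_query_end: SystemTime,
        min_threshold: Duration,
        candidate_start: SystemTime, // normally `now - min_threshold`
    ) -> SystemTime {
        let gap = candidate_start
            .duration_since(last_query_end)
            .unwrap_or(Duration::ZERO); // candidate before last end -> no gap
        if gap < min_threshold * 3 {
            // Resume exactly where the previous query stopped: no overlap, nothing skipped.
            last_query_end
        } else {
            // The gap is large (e.g. the compactor was down for a while); cap the
            // lookback so we don't fetch every partition in the catalog.
            now - min_threshold * 3
        }
    }

    fn main() {
        let now = SystemTime::now();
        let min_threshold = Duration::from_secs(10 * 60); // 10 minutes

        // Last query ended 2 minutes ago: resume from there.
        let last = now - Duration::from_secs(2 * 60);
        assert_eq!(query_start(now, last, min_threshold, now - min_threshold), last);

        // Last query ended 2 hours ago: cap the lookback at 30 minutes.
        let last = now - Duration::from_secs(2 * 60 * 60);
        assert_eq!(
            query_start(now, last, min_threshold, now - min_threshold),
            now - min_threshold * 3
        );
        println!("window selection examples hold");
    }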
@ -113,6 +127,7 @@ mod tests {
|
|||
use data_types::Timestamp;
|
||||
use iox_catalog::mem::MemCatalog;
|
||||
use iox_tests::PartitionBuilder;
|
||||
use iox_time::MockProvider;
|
||||
|
||||
fn partition_ids(ids: &[i64]) -> Vec<PartitionId> {
|
||||
ids.iter().cloned().map(PartitionId::new).collect()
|
||||
|
@ -122,17 +137,18 @@ mod tests {
|
|||
catalog: Arc<MemCatalog>,
|
||||
min_threshold: Duration,
|
||||
max_threshold: Option<Duration>,
|
||||
second_query_delta: Duration, // time between first and second query
|
||||
first_expected_ids: &[i64], // expected values on first fetch, which does a 3x on min_threshold
|
||||
second_expected_ids: &[i64], // expected values on second fetch, which uses min_threshold unmodified
|
||||
) {
|
||||
let time_provider = catalog.time_provider();
|
||||
let time_provider = Arc::new(MockProvider::new(catalog.time_provider().now()));
|
||||
|
||||
let partitions_source = CatalogToCompactPartitionsSource::new(
|
||||
Default::default(),
|
||||
catalog,
|
||||
min_threshold,
|
||||
max_threshold,
|
||||
time_provider,
|
||||
Arc::<iox_time::MockProvider>::clone(&time_provider),
|
||||
);
|
||||
|
||||
let mut actual_partition_ids = partitions_source.fetch().await;
|
||||
|
@ -145,6 +161,7 @@ mod tests {
|
|||
max_threshold {max_threshold:?} failed (first fetch, 3x lookback)",
|
||||
);
|
||||
|
||||
time_provider.inc(second_query_delta);
|
||||
let mut actual_partition_ids = partitions_source.fetch().await;
|
||||
actual_partition_ids.sort();
|
||||
|
||||
|
@ -163,10 +180,15 @@ mod tests {
|
|||
|
||||
let time_three_hour_ago = Timestamp::from(time_provider.hours_ago(3));
|
||||
let time_six_hour_ago = Timestamp::from(time_provider.hours_ago(6));
|
||||
let time_one_min_future = Timestamp::from(time_provider.minutes_into_future(1));
|
||||
|
||||
for (id, time) in [(1, time_three_hour_ago), (2, time_six_hour_ago)]
|
||||
.iter()
|
||||
.cloned()
|
||||
for (id, time) in [
|
||||
(1, time_three_hour_ago),
|
||||
(2, time_six_hour_ago),
|
||||
(3, time_one_min_future),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
{
|
||||
let partition = PartitionBuilder::new(id as i64)
|
||||
.with_new_file_at(time)
|
||||
|
@ -175,13 +197,44 @@ mod tests {
|
|||
}
|
||||
|
||||
let one_minute = Duration::from_secs(60);
|
||||
fetch_test(Arc::clone(&catalog), one_minute, None, &[], &[]).await;
|
||||
let ten_minute = Duration::from_secs(60) * 10;
|
||||
|
||||
// the lack of end time means it gets the future file (3) in the first query, this is an
|
||||
// oddity of a test case that has files with a future timestamp (not a real world concern).
|
||||
// the second query 10m later with a cap of 3m lookback doesn't get it.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
one_minute,
|
||||
None,
|
||||
ten_minute,
|
||||
&[3],
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
|
||||
let four_hours = Duration::from_secs(60 * 60 * 4);
|
||||
fetch_test(Arc::clone(&catalog), four_hours, None, &[1, 2], &[1]).await;
|
||||
// again the future file is included in the first query, just an oddity of the test case.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
four_hours,
|
||||
None,
|
||||
ten_minute,
|
||||
&[1, 2, 3],
|
||||
&[3],
|
||||
)
|
||||
.await;
|
||||
|
||||
let seven_hours = Duration::from_secs(60 * 60 * 7);
|
||||
fetch_test(Arc::clone(&catalog), seven_hours, None, &[1, 2], &[1, 2]).await;
|
||||
// again the future file is included in the first query, just an oddity of the test case.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
seven_hours,
|
||||
None,
|
||||
ten_minute,
|
||||
&[1, 2, 3],
|
||||
&[3],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
@ -192,11 +245,13 @@ mod tests {
|
|||
let time_now = Timestamp::from(time_provider.now());
|
||||
let time_three_hour_ago = Timestamp::from(time_provider.hours_ago(3));
|
||||
let time_six_hour_ago = Timestamp::from(time_provider.hours_ago(6));
|
||||
let time_one_min_future = Timestamp::from(time_provider.minutes_into_future(1));
|
||||
|
||||
for (id, time) in [
|
||||
(1, time_now),
|
||||
(2, time_three_hour_ago),
|
||||
(3, time_six_hour_ago),
|
||||
(4, time_one_min_future),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
@ -209,54 +264,80 @@ mod tests {
|
|||
|
||||
let one_minute = Duration::from_secs(60);
|
||||
let one_hour = Duration::from_secs(60 * 60);
|
||||
let two_hour = Duration::from_secs(60 * 60 * 2);
|
||||
let four_hours = Duration::from_secs(60 * 60 * 4);
|
||||
let seven_hours = Duration::from_secs(60 * 60 * 7);
|
||||
|
||||
// File 3 is all that falls within the 7-4h lookback window. With 1m to the next query,
|
||||
// nothing is found with windows advanced by 1m.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
seven_hours,
|
||||
Some(four_hours),
|
||||
one_minute,
|
||||
&[3],
|
||||
&[3],
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
|
||||
// With a 7-1h lookback window, files 2 and 3 are found. With 2h to the next query, the
|
||||
// window advances to find the two newer files.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
seven_hours,
|
||||
Some(one_hour),
|
||||
two_hour,
|
||||
&[2, 3],
|
||||
&[2, 3],
|
||||
&[1, 4],
|
||||
)
|
||||
.await;
|
||||
|
||||
// With a 7h-1m lookback window, files 2 and 3 are found. With 1m to the next query, the
|
||||
// window advances to find the one newer file.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
seven_hours,
|
||||
Some(one_minute),
|
||||
one_minute,
|
||||
&[2, 3],
|
||||
&[2, 3],
|
||||
&[1],
|
||||
)
|
||||
.await;
|
||||
|
||||
// With a 4h-1h lookback window, files 2 and 3 are found. With 1m to the next query, there's
|
||||
// nothing new in the next window.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
four_hours,
|
||||
Some(one_hour),
|
||||
one_minute,
|
||||
&[2, 3],
|
||||
&[2],
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
|
||||
// With a 4h-1m lookback window, files 2 and 3 are found. With 4h to the next query, the
|
||||
// remaining files are found.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
four_hours,
|
||||
Some(one_minute),
|
||||
four_hours,
|
||||
&[2, 3],
|
||||
&[2],
|
||||
&[1, 4],
|
||||
)
|
||||
.await;
|
||||
|
||||
fetch_test(Arc::clone(&catalog), one_hour, Some(one_minute), &[], &[]).await;
|
||||
// With a 1h-1m lookback window, nothing is found. In the second query 1m later, it finds
// the file created 'now'.
|
||||
fetch_test(
|
||||
Arc::clone(&catalog),
|
||||
one_hour,
|
||||
Some(one_minute),
|
||||
one_minute,
|
||||
&[],
|
||||
&[1],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@ edition.workspace = true
|
|||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1.70"
|
||||
async-trait = "0.1.71"
|
||||
backoff = { path = "../backoff" }
|
||||
compactor = { path = "../compactor" }
|
||||
compactor_scheduler = { path = "../compactor_scheduler" }
|
||||
|
|
|
@ -18,7 +18,7 @@ ordered-float = "3"
|
|||
schema = { path = "../schema" }
|
||||
sha2 = "0.10"
|
||||
sqlx = { version = "0.6", features = ["runtime-tokio-rustls", "postgres", "uuid"] }
|
||||
thiserror = "1.0.41"
|
||||
thiserror = "1.0.43"
|
||||
uuid = { version = "1", features = ["v4"] }
|
||||
workspace-hack = { version = "0.1", path = "../workspace-hack" }
|
||||
percent-encoding = "2.2.0"
|
||||
|
|
|
@ -412,7 +412,7 @@ mod tests {
|
|||
let ts_predicate_expr = make_range_expr(101, 202, "time");
|
||||
let expected_string =
|
||||
"TimestampNanosecond(101, None) <= time AND time < TimestampNanosecond(202, None)";
|
||||
let actual_string = format!("{ts_predicate_expr:?}");
|
||||
let actual_string = format!("{ts_predicate_expr}");
|
||||
|
||||
assert_eq!(actual_string, expected_string);
|
||||
}
|
||||
|
|
|
@ -542,7 +542,7 @@ mod tests {
|
|||
|
||||
async fn list_by_partition_not_to_delete(
|
||||
&mut self,
|
||||
partition_id: PartitionId,
|
||||
partition_id: &TransitionPartitionId,
|
||||
) -> iox_catalog::interface::Result<Vec<ParquetFile>> {
|
||||
self.inner
|
||||
.list_by_partition_not_to_delete(partition_id)
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
[package]
name = "gossip"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true

[dependencies]
async-trait = "0.1.68"
futures = "0.3.28"
hashbrown.workspace = true
metric = { version = "0.1.0", path = "../metric" }
prost = "0.11.9"
thiserror = "1.0.40"
tokio = { version = "1.28.2", features = ["net", "io-util", "time", "rt", "sync", "macros"] }
tracing = "0.1.37"
uuid = { version = "1.3.3", features = ["v4"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

[build-dependencies]
prost-build = "0.11.9"

[dev-dependencies]
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
|
|
@ -0,0 +1,16 @@
|
|||
use std::{error::Error, path::PathBuf};

use prost_build::Config;

fn main() -> Result<(), Box<dyn Error>> {
    let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("proto");
    let proto = root.join("gossip.proto");

    println!("cargo:rerun-if-changed={}", proto.display());

    Config::new()
        .bytes(["."])
        .compile_protos(&[proto], &[root])?;

    Ok(())
}
|
|
@ -0,0 +1,34 @@
|
|||
syntax = "proto3";
|
||||
package influxdata.iox.gossip.v1;
|
||||
option go_package = "github.com/influxdata/iox/gossip/v1";
|
||||
|
||||
// The payload of a single gossip datagram.
|
||||
message Frame {
|
||||
// Per-instance UUID as raw BE bytes.
|
||||
bytes identity = 1;
|
||||
|
||||
// One or more user/control frames packed into a single message.
|
||||
repeated FrameMessage messages = 2;
|
||||
}
|
||||
|
||||
// A single gossip message within a frame.
|
||||
message FrameMessage {
|
||||
// Various user/control message types.
|
||||
oneof payload {
|
||||
Ping ping = 1;
|
||||
Pong pong = 2;
|
||||
|
||||
// User-provided data payload.
|
||||
UserPayload user_data = 3;
|
||||
}
|
||||
}
|
||||
|
||||
message Ping {}
|
||||
message Pong {}
|
||||
|
||||
// An application payload from the caller of the gossip library.
|
||||
message UserPayload {
|
||||
// An opaque user payload - this is handed back to the gossip library user
|
||||
// unmodified.
|
||||
bytes payload = 1;
|
||||
}
|
|
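For orientation, the Rust types prost generates from this schema can be used as sketched below. This is an assumption-laden illustration: it presumes the generated module is re-exported as `proto` (as the crate's `proto.rs` does via `include!`) and that the `bytes(["."])` build option maps `bytes` fields to `prost::bytes::Bytes`; the helper name is hypothetical.

    use prost::bytes::Bytes;
    use prost::Message;

    // Hypothetical helper: build and serialise a Frame carrying one user payload.
    fn encode_user_frame(identity: Bytes, payload: Bytes) -> Vec<u8> {
        let frame = proto::Frame {
            identity,
            messages: vec![proto::FrameMessage {
                payload: Some(proto::frame_message::Payload::UserData(
                    proto::UserPayload { payload },
                )),
            }],
        };
        // The encoded bytes are what travels in a single UDP datagram.
        frame.encode_to_vec()
    }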
@ -0,0 +1,64 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use tokio::{
|
||||
net::{ToSocketAddrs, UdpSocket},
|
||||
sync::mpsc,
|
||||
};
|
||||
|
||||
use crate::{handle::GossipHandle, reactor::Reactor, Dispatcher};
|
||||
|
||||
/// Gossip subsystem configuration and initialisation.
|
||||
#[derive(Debug)]
|
||||
pub struct Builder<T> {
|
||||
seed_addrs: Vec<String>,
|
||||
dispatcher: T,
|
||||
metric: Arc<metric::Registry>,
|
||||
}
|
||||
|
||||
impl<T> Builder<T> {
|
||||
/// Use `seed_addrs` as seed peer addresses, and dispatch any application
|
||||
/// messages to `dispatcher`.
|
||||
///
|
||||
/// Each address in `seed_addrs` is re-resolved periodically and the first
|
||||
/// resolved IP address is used for peer communication.
|
||||
pub fn new(seed_addrs: Vec<String>, dispatcher: T, metric: Arc<metric::Registry>) -> Self {
|
||||
Self {
|
||||
seed_addrs,
|
||||
dispatcher,
|
||||
metric,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Builder<T>
|
||||
where
|
||||
T: Dispatcher + 'static,
|
||||
{
|
||||
/// Initialise the gossip subsystem using `socket` for communication.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This call spawns a tokio task, and as such must be called from within a
|
||||
/// tokio runtime.
|
||||
pub fn build(self, socket: UdpSocket) -> GossipHandle {
|
||||
// Obtain a channel to communicate between the actor, and all handles
|
||||
let (tx, rx) = mpsc::channel(1000);
|
||||
|
||||
// Initialise the reactor
|
||||
let reactor = Reactor::new(self.seed_addrs, socket, self.dispatcher, &self.metric);
|
||||
let identity = reactor.identity().clone();
|
||||
|
||||
// Start the message reactor.
|
||||
tokio::spawn(reactor.run(rx));
|
||||
|
||||
GossipHandle::new(tx, identity)
|
||||
}
|
||||
|
||||
/// Bind to the provided socket address and initialise the gossip subsystem.
|
||||
pub async fn bind<A>(self, bind_addr: A) -> Result<GossipHandle, std::io::Error>
|
||||
where
|
||||
A: ToSocketAddrs + Send,
|
||||
{
|
||||
Ok(self.build(UdpSocket::bind(bind_addr).await?))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
use async_trait::async_trait;
|
||||
use tracing::warn;
|
||||
|
||||
// Re-export the bytes type to ensure upstream users of this crate are
|
||||
// interacting with the same type.
|
||||
pub use prost::bytes::Bytes;
|
||||
|
||||
/// A delegate abstraction through which the gossip subsystem propagates
|
||||
/// application-level messages received from other peers.
|
||||
#[async_trait]
|
||||
pub trait Dispatcher: Send + Sync {
|
||||
/// Invoked when an application-level payload is received from a peer.
|
||||
///
|
||||
/// This call should not block / should complete quickly to avoid blocking
|
||||
/// the gossip reactor loop - if a long-running job must be started within
|
||||
/// this call, consider spawning a separate task.
|
||||
async fn dispatch(&self, payload: Bytes);
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Dispatcher for tokio::sync::mpsc::Sender<Bytes> {
|
||||
async fn dispatch(&self, payload: Bytes) {
|
||||
if let Err(e) = self.send(payload).await {
|
||||
warn!(error=%e, "error dispatching payload to application handler");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A no-op dispatcher.
|
||||
#[cfg(test)]
|
||||
#[async_trait::async_trait]
|
||||
impl Dispatcher for () {
|
||||
async fn dispatch(&self, _payload: crate::Bytes) {}
|
||||
}
|
|
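As a usage illustration (the type below is hypothetical and not part of this diff), an application whose per-payload handling is not trivially fast would typically hand the bytes off to its own task so the reactor loop stays responsive:

    use async_trait::async_trait;
    use gossip::{Bytes, Dispatcher};

    /// Hypothetical application-side dispatcher that offloads processing.
    #[derive(Debug)]
    struct SpawningDispatcher;

    #[async_trait]
    impl Dispatcher for SpawningDispatcher {
        async fn dispatch(&self, payload: Bytes) {
            // Return quickly: move the potentially slow handling onto a
            // separate tokio task instead of blocking the gossip reactor.
            tokio::spawn(async move {
                // ... application-specific processing of `payload` ...
                let _ = payload.len();
            });
        }
    }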
@ -0,0 +1,73 @@
|
|||
use crate::{Bytes, MAX_USER_PAYLOAD_BYTES};
|
||||
use thiserror::Error;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::peers::Identity;
|
||||
|
||||
/// An error indicating a send was attempted with a payload that exceeds
|
||||
/// [`MAX_USER_PAYLOAD_BYTES`].
|
||||
#[derive(Error, Debug)]
|
||||
#[error("max allowed payload size exceeded")]
|
||||
#[allow(missing_copy_implementations)]
|
||||
pub struct PayloadSizeError {}
|
||||
|
||||
/// Requests sent to the [`Reactor`] actor task.
|
||||
///
|
||||
/// [`Reactor`]: crate::reactor::Reactor
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum Request {
|
||||
/// Broadcast the given payload to all known peers.
|
||||
Broadcast(Bytes),
|
||||
|
||||
/// Get a snapshot of the peer identities.
|
||||
GetPeers(oneshot::Sender<Vec<Uuid>>),
|
||||
}
|
||||
|
||||
/// A handle to the gossip subsystem.
|
||||
///
|
||||
/// All resources used by the gossip system will be released once this
|
||||
/// [`GossipHandle`] is dropped. To share the handle, wrap it in an [`Arc`].
|
||||
///
|
||||
/// [`Arc`]: std::sync::Arc
|
||||
#[derive(Debug)]
|
||||
pub struct GossipHandle {
|
||||
tx: mpsc::Sender<Request>,
|
||||
identity: Identity,
|
||||
}
|
||||
|
||||
impl GossipHandle {
|
||||
pub(crate) fn new(tx: mpsc::Sender<Request>, identity: Identity) -> Self {
|
||||
Self { tx, identity }
|
||||
}
|
||||
|
||||
/// Return the randomly generated identity of this gossip instance.
|
||||
pub fn identity(&self) -> Uuid {
|
||||
*self.identity
|
||||
}
|
||||
|
||||
/// Broadcast `payload` to all known peers.
|
||||
///
|
||||
/// This is a best-effort operation - peers are not guaranteed to receive
|
||||
/// this broadcast.
|
||||
pub async fn broadcast<T>(&self, payload: T) -> Result<(), PayloadSizeError>
|
||||
where
|
||||
T: Into<Bytes> + Send,
|
||||
{
|
||||
let payload = payload.into();
|
||||
if payload.len() > MAX_USER_PAYLOAD_BYTES {
|
||||
return Err(PayloadSizeError {});
|
||||
}
|
||||
|
||||
self.tx.send(Request::Broadcast(payload)).await.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Retrieve a snapshot of the connected peer list.
|
||||
pub async fn get_peers(&self) -> Vec<Uuid> {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
self.tx.send(Request::GetPeers(tx)).await.unwrap();
|
||||
rx.await.unwrap()
|
||||
}
|
||||
}
|
|
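Tying the handle to the builder, a minimal end-to-end sketch might look like the following. The seed address, port, and channel capacity are placeholders; the mpsc sender acts as the dispatcher via the impl shown earlier, and the `metric` registry comes from this workspace's metric crate.

    use std::sync::Arc;
    use gossip::{Builder, Bytes};

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        let metrics = Arc::new(metric::Registry::default());
        // Application payloads received from peers arrive on this channel.
        let (tx, mut rx) = tokio::sync::mpsc::channel::<Bytes>(100);

        // Join the cluster via a (placeholder) seed peer and bind a local socket.
        let handle = Builder::new(vec!["seed-0.example.com:7946".to_string()], tx, metrics)
            .bind("0.0.0.0:7946")
            .await?;

        // Best-effort broadcast to all currently known peers.
        handle.broadcast(Bytes::from_static(b"hello")).await?;
        println!("known peers: {:?}", handle.get_peers().await);

        if let Some(payload) = rx.recv().await {
            println!("received {} bytes", payload.len());
        }
        Ok(())
    }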
@ -0,0 +1,98 @@
|
|||
//! A work-in-progress, simple gossip primitive for metadata distribution
|
||||
//! between IOx nodes.
|
||||
//!
|
||||
//! # Transport
|
||||
//!
|
||||
//! Prefer small payloads where possible, and expect loss of some messages -
|
||||
//! this primitive provides *best effort* delivery.
|
||||
//!
|
||||
//! This implementation sends unicast UDP frames between peers, with support for
|
||||
//! both control frames & user payloads. The maximum message size is 65,507
|
||||
//! bytes ([`MAX_USER_PAYLOAD_BYTES`] for application-level payloads), but a
|
||||
//! packet this large is fragmented into smaller (at most MTU-sized) packets and
|
||||
//! is at greater risk of being dropped due to a lost fragment.
|
||||
//!
|
||||
//! # Security
|
||||
//!
|
||||
//! Messages exchanged between peers are unauthenticated and connectionless -
|
||||
//! it's trivial to forge a message appearing to come from a different peer, or
|
||||
//! include malicious payloads.
|
||||
//!
|
||||
//! The security model of this implementation expects the peers to be running in
|
||||
//! a trusted environment, secure from malicious users.
|
||||
|
||||
#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)]
|
||||
#![warn(
|
||||
clippy::clone_on_ref_ptr,
|
||||
clippy::dbg_macro,
|
||||
clippy::explicit_iter_loop,
|
||||
clippy::future_not_send,
|
||||
clippy::todo,
|
||||
clippy::use_self,
|
||||
missing_copy_implementations,
|
||||
missing_debug_implementations,
|
||||
unused_crate_dependencies,
|
||||
missing_docs
|
||||
)]
|
||||
|
||||
mod builder;
|
||||
mod dispatcher;
|
||||
mod handle;
|
||||
mod metric;
|
||||
mod peers;
|
||||
mod proto;
|
||||
mod reactor;
|
||||
pub(crate) mod seed;
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
/// Work around the unused_crate_dependencies false positives for test deps.
|
||||
#[cfg(test)]
|
||||
use test_helpers as _;
|
||||
use workspace_hack as _;
|
||||
|
||||
pub use builder::*;
|
||||
pub use dispatcher::*;
|
||||
pub use handle::*;
|
||||
|
||||
/// The maximum duration of time allotted to performing a DNS resolution against
|
||||
/// a seed/peer address.
|
||||
const RESOLVE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
/// Defines the interval between PING frames sent to all configured seed peers.
|
||||
const SEED_PING_INTERVAL: std::time::Duration = Duration::from_secs(15);
|
||||
|
||||
/// The maximum payload size allowed.
|
||||
///
|
||||
/// Attempting to send a serialised packet (inclusive of control frames/fields)
|
||||
/// in excess of this amount will result in an error.
|
||||
const MAX_FRAME_BYTES: usize = 1024 * 10;
|
||||
|
||||
/// The frame header overhead for user payloads.
|
||||
const USER_PAYLOAD_OVERHEAD: usize = 22;
|
||||
|
||||
/// The maximum allowed byte size of user payloads.
|
||||
///
|
||||
/// Sending payloads of this size is discouraged as it leads to fragmentation of
|
||||
/// the message and increases the chance of the message being undelivered /
|
||||
/// dropped. Smaller is always better for UDP transports!
|
||||
pub const MAX_USER_PAYLOAD_BYTES: usize = MAX_FRAME_BYTES - USER_PAYLOAD_OVERHEAD;
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::assertions_on_constants)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_max_msg_size() {
|
||||
assert!(MAX_FRAME_BYTES < 65_536, "cannot exceed UDP maximum");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_max_user_payload_size() {
|
||||
assert_eq!(
|
||||
MAX_USER_PAYLOAD_BYTES, 10_218,
|
||||
"applications may depend on this value not changing"
|
||||
);
|
||||
}
|
||||
}
|
|
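Because oversized broadcasts are rejected rather than fragmented by the library, a caller with a larger blob has to split it itself. A minimal sketch, assuming the application handles ordering and re-assembly of the chunks on the receiving side (the helper name is hypothetical):

    use gossip::{Bytes, GossipHandle, PayloadSizeError, MAX_USER_PAYLOAD_BYTES};

    /// Broadcast `data` as a sequence of chunks that each fit within the
    /// allowed user payload size. Re-assembly is the application's problem.
    async fn broadcast_chunked(handle: &GossipHandle, data: &[u8]) -> Result<(), PayloadSizeError> {
        for chunk in data.chunks(MAX_USER_PAYLOAD_BYTES) {
            handle.broadcast(Bytes::copy_from_slice(chunk)).await?;
        }
        Ok(())
    }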
@ -0,0 +1,60 @@
|
|||
//! Metric newtype wrappers for type safety.
|
||||
//!
|
||||
//! The metrics are easily confused (they're all counters) so have the compiler
|
||||
//! check the right ones are being used in the right places.
|
||||
|
||||
use metric::U64Counter;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct SentFrames(metric::U64Counter);
|
||||
|
||||
impl SentFrames {
|
||||
pub(crate) fn inc(&self, v: usize) {
|
||||
self.0.inc(v as u64)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ReceivedFrames(metric::U64Counter);
|
||||
|
||||
impl ReceivedFrames {
|
||||
pub(crate) fn inc(&self, v: usize) {
|
||||
self.0.inc(v as u64)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct SentBytes(metric::U64Counter);
|
||||
|
||||
impl SentBytes {
|
||||
pub(crate) fn inc(&self, v: usize) {
|
||||
self.0.inc(v as u64)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ReceivedBytes(metric::U64Counter);
|
||||
|
||||
impl ReceivedBytes {
|
||||
pub(crate) fn inc(&self, v: usize) {
|
||||
self.0.inc(v as u64)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_metrics(
|
||||
metrics: &metric::Registry,
|
||||
) -> (SentFrames, ReceivedFrames, SentBytes, ReceivedBytes) {
|
||||
let metric_frames = metrics.register_metric::<U64Counter>(
|
||||
"gossip_frames",
|
||||
"number of frames sent/received by this node",
|
||||
);
|
||||
let metric_bytes = metrics
|
||||
.register_metric::<U64Counter>("gossip_bytes", "sum of bytes sent/received by this node");
|
||||
|
||||
(
|
||||
SentFrames(metric_frames.recorder(&[("direction", "sent")])),
|
||||
ReceivedFrames(metric_frames.recorder(&[("direction", "received")])),
|
||||
SentBytes(metric_bytes.recorder(&[("direction", "sent")])),
|
||||
ReceivedBytes(metric_bytes.recorder(&[("direction", "received")])),
|
||||
)
|
||||
}
|
|
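The point of the wrappers is that a call site cannot transpose the counters: a recording helper takes the distinct types, so swapping the frame and byte counters is a compile error rather than a silently mislabelled metric. A small crate-internal sketch (the helper name is hypothetical):

    // Record one sent frame of `n` bytes. Passing (bytes, frames) in the
    // wrong order does not compile, unlike two bare U64Counter arguments.
    pub(crate) fn record_send(frames: &SentFrames, bytes: &SentBytes, n: usize) {
        frames.inc(1);
        bytes.inc(n);
    }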
@ -0,0 +1,244 @@
|
|||
use std::{io, net::SocketAddr};
|
||||
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use hashbrown::{hash_map::RawEntryMut, HashMap};
|
||||
use metric::U64Counter;
|
||||
use prost::bytes::Bytes;
|
||||
use tokio::net::UdpSocket;
|
||||
use tracing::{trace, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
metric::{SentBytes, SentFrames},
|
||||
MAX_FRAME_BYTES,
|
||||
};
|
||||
|
||||
/// A unique generated identity containing 128 bits of randomness (V4 UUID).
|
||||
#[derive(Debug, Eq, Clone)]
|
||||
pub(crate) struct Identity(Bytes, Uuid);
|
||||
|
||||
impl std::ops::Deref for Identity {
|
||||
type Target = Uuid;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.1
|
||||
}
|
||||
}
|
||||
|
||||
impl std::hash::Hash for Identity {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
self.0.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Identity {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
debug_assert!((self.1 == other.1) == (self.0 == other.0));
|
||||
self.0 == other.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Identity {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.1.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<Bytes> for Identity {
|
||||
type Error = uuid::Error;
|
||||
|
||||
fn try_from(value: Bytes) -> Result<Self, Self::Error> {
|
||||
let uuid = Uuid::from_slice(&value)?;
|
||||
Ok(Self(value, uuid))
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity {
|
||||
/// Generate a new random identity.
|
||||
pub(crate) fn new() -> Self {
|
||||
let id = Uuid::new_v4();
|
||||
Self(Bytes::from(id.as_bytes().to_vec()), id)
|
||||
}
|
||||
|
||||
pub(crate) fn as_bytes(&self) -> &Bytes {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// A discovered peer within the gossip cluster.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct Peer {
|
||||
identity: Identity,
|
||||
addr: SocketAddr,
|
||||
}
|
||||
|
||||
impl Peer {
|
||||
pub(crate) async fn send(
|
||||
&self,
|
||||
buf: &[u8],
|
||||
socket: &UdpSocket,
|
||||
frames_sent: &SentFrames,
|
||||
bytes_sent: &SentBytes,
|
||||
) -> Result<usize, io::Error> {
|
||||
// If the frame is larger than the allowed maximum, then the receiver
|
||||
// will truncate the frame when reading the socket.
|
||||
//
|
||||
// Never send frames that will be unprocessable.
|
||||
if buf.len() > MAX_FRAME_BYTES {
|
||||
warn!(
|
||||
n_bytes = buf.len(),
|
||||
"not sending oversized packet - receiver would truncate"
|
||||
);
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"max frame size exceeded",
|
||||
));
|
||||
}
|
||||
|
||||
let ret = socket.send_to(buf, self.addr).await;
|
||||
match &ret {
|
||||
Ok(n_bytes) => {
|
||||
frames_sent.inc(1);
|
||||
bytes_sent.inc(*n_bytes);
|
||||
trace!(identity=%self.identity, n_bytes, peer_addr=%self.addr, "send frame")
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(error=%e, identity=%self.identity, peer_addr=%self.addr, "frame send error")
|
||||
}
|
||||
}
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
/// The set of currently active/known peers.
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct PeerList {
|
||||
list: HashMap<Identity, Peer>,
|
||||
|
||||
/// The number of known, believed-to-be-healthy peers.
|
||||
metric_peer_count: metric::U64Counter,
|
||||
}
|
||||
|
||||
impl PeerList {
|
||||
/// Initialise the [`PeerList`] with capacity for `cap` number of [`Peer`]
|
||||
/// instances.
|
||||
pub(crate) fn with_capacity(cap: usize, metrics: &metric::Registry) -> Self {
|
||||
let metric_peer_count = metrics
|
||||
.register_metric::<U64Counter>(
|
||||
"gossip_known_peers",
|
||||
"number of likely healthy peers known to this node",
|
||||
)
|
||||
.recorder(&[]);
|
||||
|
||||
Self {
|
||||
list: HashMap::with_capacity(cap),
|
||||
metric_peer_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the UUIDs of all known peers.
|
||||
pub(crate) fn peer_uuids(&self) -> Vec<Uuid> {
|
||||
self.list.keys().map(|v| **v).collect()
|
||||
}
|
||||
|
||||
/// Upsert a peer identified by `identity` to the peer list, associating it
|
||||
/// with the provided `peer_addr`.
|
||||
pub(crate) fn upsert(&mut self, identity: &Identity, peer_addr: SocketAddr) -> &mut Peer {
|
||||
let p = match self.list.raw_entry_mut().from_key(identity) {
|
||||
RawEntryMut::Vacant(v) => {
|
||||
self.metric_peer_count.inc(1);
|
||||
v.insert(
|
||||
identity.to_owned(),
|
||||
Peer {
|
||||
addr: peer_addr,
|
||||
identity: identity.to_owned(),
|
||||
},
|
||||
)
|
||||
.1
|
||||
}
|
||||
RawEntryMut::Occupied(v) => v.into_mut(),
|
||||
};
|
||||
|
||||
p.addr = peer_addr;
|
||||
p
|
||||
}
|
||||
|
||||
/// Broadcast `buf` to all known peers over `socket`, returning the number
|
||||
/// of bytes sent in total.
|
||||
pub(crate) async fn broadcast(
|
||||
&self,
|
||||
buf: &[u8],
|
||||
socket: &UdpSocket,
|
||||
frames_sent: &SentFrames,
|
||||
bytes_sent: &SentBytes,
|
||||
) -> usize {
|
||||
self.list
|
||||
.values()
|
||||
.map(|v| v.send(buf, socket, frames_sent, bytes_sent))
|
||||
.collect::<FuturesUnordered<_>>()
|
||||
.fold(0, |acc, res| async move {
|
||||
match res {
|
||||
Ok(n) => acc + n,
|
||||
Err(_) => acc,
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::hash_map::DefaultHasher,
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_identity_round_trip() {
|
||||
let a = Identity::new();
|
||||
|
||||
let encoded = a.as_bytes().to_owned();
|
||||
let decoded = Identity::try_from(encoded).unwrap();
|
||||
|
||||
assert_eq!(decoded, a);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_identity_length_mismatch() {
|
||||
let v = Bytes::from_static(&[42, 42, 42, 42]);
|
||||
let _ = Identity::try_from(v).expect_err("short ID should fail");
|
||||
|
||||
let v = Bytes::from_static(&[
|
||||
42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
|
||||
]);
|
||||
let _ = Identity::try_from(v).expect_err("long ID should fail");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_identity_eq() {
|
||||
let v = Identity::new();
|
||||
assert_eq!(v.clone(), v);
|
||||
assert_eq!(hash_identity(&v), hash_identity(&v));
|
||||
|
||||
let other = Identity::new();
|
||||
assert_ne!(v, other);
|
||||
assert_ne!(hash_identity(&other), hash_identity(&v));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_identity_display() {
|
||||
let v = Identity::new();
|
||||
let text = v.to_string();
|
||||
|
||||
let uuid = Uuid::try_parse(&text).expect("display impl should output valid uuids");
|
||||
assert_eq!(*v, uuid);
|
||||
}
|
||||
|
||||
fn hash_identity(v: &Identity) -> u64 {
|
||||
let mut h = DefaultHasher::default();
|
||||
v.hash(&mut h);
|
||||
h.finish()
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
//! Proto definitions of gossip message wire types.

include!(concat!(env!("OUT_DIR"), "/influxdata.iox.gossip.v1.rs"));
|
|
@ -0,0 +1,455 @@
|
|||
use std::{net::SocketAddr, sync::Arc};
|
||||
|
||||
use prost::{bytes::BytesMut, Message};
|
||||
use tokio::{
|
||||
net::UdpSocket,
|
||||
sync::mpsc::{self},
|
||||
};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
metric::*,
|
||||
peers::{Identity, PeerList},
|
||||
proto::{self, frame_message::Payload, FrameMessage},
|
||||
seed::{seed_ping_task, Seed},
|
||||
Dispatcher, Request, MAX_FRAME_BYTES,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Error {
|
||||
NoPayload {
|
||||
peer: Identity,
|
||||
addr: SocketAddr,
|
||||
},
|
||||
|
||||
Deserialise {
|
||||
addr: SocketAddr,
|
||||
source: prost::DecodeError,
|
||||
},
|
||||
|
||||
Identity {
|
||||
addr: SocketAddr,
|
||||
},
|
||||
|
||||
Io(std::io::Error),
|
||||
|
||||
MaxSize(usize),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for Error {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
Self::Io(value)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct AbortOnDrop(tokio::task::JoinHandle<()>);
|
||||
impl Drop for AbortOnDrop {
|
||||
fn drop(&mut self) {
|
||||
self.0.abort()
|
||||
}
|
||||
}
|
||||
|
||||
/// An event loop for gossip frames processing.
|
||||
///
|
||||
/// This actor task is responsible for driving peer discovery, managing the set
|
||||
/// of known peers and exchanging gossip frames between peers.
|
||||
///
|
||||
/// A user interacts with a [`Reactor`] through a [`GossipHandle`].
|
||||
///
|
||||
/// [`GossipHandle`]: crate::GossipHandle
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Reactor<T> {
|
||||
dispatch: T,
|
||||
|
||||
/// The random identity of this gossip instance.
|
||||
identity: Identity,
|
||||
|
||||
/// A cached wire frame, used to generate outgoing messages.
|
||||
cached_frame: proto::Frame,
|
||||
/// A re-used buffer for serialising outgoing messages into.
|
||||
serialisation_buf: Vec<u8>,
|
||||
|
||||
/// The immutable list of seed addresses provided by the user, periodically
|
||||
/// pinged.
|
||||
seed_list: Arc<[Seed]>,
|
||||
/// A task that periodically sends PING frames to all seeds, executing in a
|
||||
/// separate task so that DNS resolution does not block the reactor loop.
|
||||
_seed_ping_task: AbortOnDrop,
|
||||
|
||||
/// The set of active peers this node has communicated with and believes to
|
||||
/// be (recently) healthy.
|
||||
///
|
||||
/// Depending on the perceived availability of the seed nodes, this may
|
||||
/// contain fewer peers than the number of initial seeds.
|
||||
peer_list: PeerList,
|
||||
|
||||
/// The UDP socket used for communication with peers.
|
||||
socket: Arc<UdpSocket>,
|
||||
|
||||
/// The count of frames sent and received.
|
||||
metric_frames_sent: SentFrames,
|
||||
metric_frames_received: ReceivedFrames,
|
||||
|
||||
/// The sum of bytes sent and received.
|
||||
metric_bytes_sent: SentBytes,
|
||||
metric_bytes_received: ReceivedBytes,
|
||||
}
|
||||
|
||||
impl<T> Reactor<T>
|
||||
where
|
||||
T: Dispatcher,
|
||||
{
|
||||
pub(crate) fn new(
|
||||
seed_list: Vec<String>,
|
||||
socket: UdpSocket,
|
||||
dispatch: T,
|
||||
metrics: &metric::Registry,
|
||||
) -> Self {
|
||||
// Generate a unique UUID for this Reactor instance, and cache the wire
|
||||
// representation.
|
||||
let identity = Identity::new();
|
||||
|
||||
let seed_list = seed_list.into_iter().map(Seed::new).collect();
|
||||
let socket = Arc::new(socket);
|
||||
let mut serialisation_buf = Vec::with_capacity(1024);
|
||||
|
||||
// Generate a pre-populated frame header.
|
||||
let mut cached_frame = proto::Frame {
|
||||
identity: identity.as_bytes().clone(),
|
||||
messages: Vec::with_capacity(1),
|
||||
};
|
||||
|
||||
// A ping frame is static over the lifetime of a Reactor instance, so it
|
||||
// can be pre-serialised, cached, and reused for every ping.
|
||||
let cached_ping_frame = {
|
||||
populate_frame(
|
||||
&mut cached_frame,
|
||||
vec![new_payload(Payload::Ping(proto::Ping {}))],
|
||||
&mut serialisation_buf,
|
||||
)
|
||||
.unwrap();
|
||||
serialisation_buf.clone()
|
||||
};
|
||||
|
||||
// Initialise the various metrics with wrappers to help distinguish
|
||||
// between the (very similar) counters.
|
||||
let (metric_frames_sent, metric_frames_received, metric_bytes_sent, metric_bytes_received) =
|
||||
new_metrics(metrics);
|
||||
|
||||
// Spawn a task that periodically pings all known seeds.
|
||||
//
|
||||
// Pinging all seeds announces this node as alive, propagating the
|
||||
// instance UUID, and requesting PONG responses to drive population of
|
||||
// the active peer list.
|
||||
let seed_ping_task = AbortOnDrop(tokio::spawn(seed_ping_task(
|
||||
Arc::clone(&seed_list),
|
||||
Arc::clone(&socket),
|
||||
cached_ping_frame,
|
||||
metric_frames_sent.clone(),
|
||||
metric_bytes_sent.clone(),
|
||||
)));
|
||||
|
||||
Self {
|
||||
dispatch,
|
||||
identity,
|
||||
cached_frame,
|
||||
serialisation_buf,
|
||||
peer_list: PeerList::with_capacity(seed_list.len(), metrics),
|
||||
seed_list,
|
||||
_seed_ping_task: seed_ping_task,
|
||||
socket,
|
||||
metric_frames_sent,
|
||||
metric_frames_received,
|
||||
metric_bytes_sent,
|
||||
metric_bytes_received,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn run(mut self, mut rx: mpsc::Receiver<Request>) {
|
||||
info!(
|
||||
identity = %self.identity,
|
||||
seed_list = ?self.seed_list,
|
||||
"gossip reactor started",
|
||||
);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
msg = self.read() => {
|
||||
match msg {
|
||||
Ok(()) => {},
|
||||
Err(Error::NoPayload { peer, addr }) => {
|
||||
warn!(%peer, %addr, "message contains no payload");
|
||||
continue;
|
||||
}
|
||||
Err(Error::Deserialise { addr, source }) => {
|
||||
warn!(error=%source, %addr, "error deserialising frame");
|
||||
continue;
|
||||
}
|
||||
Err(Error::Identity { addr }) => {
|
||||
warn!(%addr, "invalid identity value in frame");
|
||||
continue;
|
||||
}
|
||||
Err(Error::Io(error)) => {
|
||||
error!(%error, "i/o error");
|
||||
continue;
|
||||
}
|
||||
Err(Error::MaxSize(_)) => {
|
||||
// Logged at source
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
op = rx.recv() => {
|
||||
match op {
|
||||
None => {
|
||||
info!("stopping gossip reactor");
|
||||
return;
|
||||
}
|
||||
Some(Request::GetPeers(tx)) => {
|
||||
let _ = tx.send(self.peer_list.peer_uuids());
|
||||
},
|
||||
Some(Request::Broadcast(payload)) => {
|
||||
// The user is guaranteed MAX_USER_PAYLOAD_BYTES to
|
||||
// be send-able, so send this frame without packing
|
||||
// others with it for simplicity.
|
||||
populate_frame(
|
||||
&mut self.cached_frame,
|
||||
vec![new_payload(Payload::UserData(proto::UserPayload{payload}))],
|
||||
&mut self.serialisation_buf
|
||||
).expect("size validated in handle at enqueue time");
|
||||
|
||||
self.peer_list.broadcast(
|
||||
&self.serialisation_buf,
|
||||
&self.socket,
|
||||
&self.metric_frames_sent,
|
||||
&self.metric_bytes_sent
|
||||
).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a gossip frame from the socket and potentially respond.
|
||||
///
|
||||
/// This method waits for a frame to be made available by the OS, enumerates
/// the contents, batches any responses to those messages and, if non-empty,
/// sends the resulting frame back to the sender of the original frame.
///
/// Frame and byte counts read/sent are recorded against this node's metrics.
|
||||
async fn read(&mut self) -> Result<(), Error> {
|
||||
// Read a frame into buf.
|
||||
let (bytes_read, frame, peer_addr) = read_frame(&self.socket).await?;
|
||||
self.metric_frames_received.inc(1);
|
||||
self.metric_bytes_received.inc(bytes_read as _);
|
||||
|
||||
// Read the peer identity from the frame
|
||||
let identity =
|
||||
Identity::try_from(frame.identity).map_err(|_| Error::Identity { addr: peer_addr })?;
|
||||
|
||||
// Don't process messages from this node.
|
||||
//
|
||||
// It's expected that all N servers will be included in a peer list,
|
||||
// rather than the N-1 peers to this node. By dropping messages from
|
||||
// this node, pings sent by this node will go unprocessed and therefore
|
||||
// this node will not be added to the active peer list.
|
||||
if identity == self.identity {
|
||||
debug!(%identity, %peer_addr, bytes_read, "dropping frame from self");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Find or create the peer in the peer list.
|
||||
let peer = self.peer_list.upsert(&identity, peer_addr);
|
||||
|
||||
let mut out_messages = Vec::with_capacity(1);
|
||||
for msg in frame.messages {
|
||||
// Extract the payload from the frame message
|
||||
let payload = msg.payload.ok_or_else(|| Error::NoPayload {
|
||||
peer: identity.clone(),
|
||||
addr: peer_addr,
|
||||
})?;
|
||||
|
||||
// Handle the frame message from the peer, optionally returning a
|
||||
// response frame.
|
||||
let response = match payload {
|
||||
Payload::Ping(_) => Some(Payload::Pong(proto::Pong {})),
|
||||
Payload::Pong(_) => {
|
||||
debug!(%identity, %peer_addr, "pong");
|
||||
None
|
||||
}
|
||||
Payload::UserData(data) => {
|
||||
self.dispatch.dispatch(data.payload).await;
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(payload) = response {
|
||||
out_messages.push(new_payload(payload));
|
||||
}
|
||||
}
|
||||
|
||||
// Sometimes no message will be returned to the peer - there's no need
|
||||
// to send an empty frame.
|
||||
if out_messages.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Serialise the frame into the serialisation buffer.
|
||||
populate_frame(
|
||||
&mut self.cached_frame,
|
||||
out_messages,
|
||||
&mut self.serialisation_buf,
|
||||
)?;
|
||||
|
||||
peer.send(
|
||||
&self.serialisation_buf,
|
||||
&self.socket,
|
||||
&self.metric_frames_sent,
|
||||
&self.metric_bytes_sent,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return the randomised identity assigned to this instance.
|
||||
pub(crate) fn identity(&self) -> &Identity {
|
||||
&self.identity
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait for a UDP datagram to become ready, and read it entirely into `buf`.
|
||||
async fn recv(socket: &UdpSocket, buf: &mut BytesMut) -> (usize, SocketAddr) {
|
||||
let (n_bytes, addr) = socket
|
||||
.recv_buf_from(buf)
|
||||
.await
|
||||
// These errors come from libc's recvfrom(), or from converting the
// kernel-provided socket structure to rust's SocketAddr - neither should
// ever happen.
|
||||
.expect("invalid recvfrom");
|
||||
|
||||
trace!(%addr, n_bytes, "socket read");
|
||||
(n_bytes, addr)
|
||||
}
|
||||
|
||||
/// Wait for a UDP datagram to arrive, and decode it into a gossip Frame.
|
||||
///
|
||||
/// Clears the contents of `buf` before reading the frame.
|
||||
async fn read_frame(socket: &UdpSocket) -> Result<(usize, proto::Frame, SocketAddr), Error> {
|
||||
// Pre-allocate a buffer large enough to hold the maximum message size.
|
||||
//
|
||||
// Reading data from a UDP socket silently truncates if there's not enough
|
||||
// buffer space to write the full packet payload (tokio doesn't support
|
||||
// MSG_TRUNC-like flags on reads).
|
||||
let mut buf = BytesMut::with_capacity(MAX_FRAME_BYTES);
|
||||
|
||||
let (n_bytes, addr) = recv(socket, &mut buf).await;
|
||||
|
||||
// Decode the frame, re-using byte arrays from the underlying buffer.
|
||||
match proto::Frame::decode(buf.freeze()) {
|
||||
Ok(frame) => {
|
||||
debug!(?frame, %addr, n_bytes, "read frame");
|
||||
Ok((n_bytes, frame, addr))
|
||||
}
|
||||
Err(e) => Err(Error::Deserialise { addr, source: e }),
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a pre-allocated `frame`, clear and populate it with the provided
|
||||
/// `payload` containing a set of [`FrameMessage`], serialising it to `buf`.
|
||||
fn populate_frame(
|
||||
frame: &mut proto::Frame,
|
||||
payload: Vec<FrameMessage>,
|
||||
buf: &mut Vec<u8>,
|
||||
) -> Result<(), Error> {
|
||||
frame.messages = payload;
|
||||
|
||||
// Reading data from a UDP socket silently truncates if there's not enough
|
||||
// buffer space to write the full packet payload. This library will
|
||||
// pre-allocate a buffer of this size to read packets into, therefore all
|
||||
// messages must be shorter than this value.
|
||||
if frame.encoded_len() > MAX_FRAME_BYTES {
|
||||
error!(
|
||||
n_bytes=buf.len(),
|
||||
n_max=%MAX_FRAME_BYTES,
|
||||
"attempted to send frame larger than configured maximum"
|
||||
);
|
||||
return Err(Error::MaxSize(buf.len()));
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
frame.encode(buf).expect("buffer should grow");
|
||||
|
||||
debug_assert!(proto::Frame::decode(crate::Bytes::from(buf.clone())).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Instantiate a new [`FrameMessage`] from the given [`Payload`].
|
||||
fn new_payload(p: Payload) -> proto::FrameMessage {
|
||||
proto::FrameMessage { payload: Some(p) }
|
||||
}
|
||||
|
||||
/// Send a PING message to `socket`.
|
||||
pub(crate) async fn ping(
|
||||
ping_frame: &[u8],
|
||||
socket: &UdpSocket,
|
||||
addr: SocketAddr,
|
||||
sent_frames: &SentFrames,
|
||||
sent_bytes: &SentBytes,
|
||||
) -> usize {
|
||||
match socket.send_to(ping_frame, &addr).await {
|
||||
Ok(n_bytes) => {
|
||||
debug!(addr = %addr, "ping");
|
||||
sent_frames.inc(1);
|
||||
sent_bytes.inc(n_bytes);
|
||||
n_bytes
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
error=%e,
|
||||
addr = %addr,
|
||||
"ping failed"
|
||||
);
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{MAX_USER_PAYLOAD_BYTES, USER_PAYLOAD_OVERHEAD};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_user_frame_overhead() {
|
||||
let identity = Identity::new();
|
||||
|
||||
// Generate a pre-populated frame header.
|
||||
let mut frame = proto::Frame {
|
||||
identity: identity.as_bytes().clone(),
|
||||
messages: vec![],
|
||||
};
|
||||
|
||||
let mut buf = Vec::new();
|
||||
populate_frame(
|
||||
&mut frame,
|
||||
vec![new_payload(Payload::UserData(proto::UserPayload {
|
||||
payload: crate::Bytes::new(), // Empty/0-sized
|
||||
}))],
|
||||
&mut buf,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// The proto type should self-report the same size.
|
||||
assert_eq!(buf.len(), frame.encoded_len());
|
||||
|
||||
// The overhead const should be accurate
|
||||
assert_eq!(buf.len(), USER_PAYLOAD_OVERHEAD);
|
||||
|
||||
// The max user payload size should be accurate.
|
||||
assert_eq!(MAX_FRAME_BYTES - buf.len(), MAX_USER_PAYLOAD_BYTES);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
use std::{future, net::SocketAddr, sync::Arc};
|
||||
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use tokio::{
|
||||
net::{self, UdpSocket},
|
||||
time::{timeout, MissedTickBehavior},
|
||||
};
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::{
|
||||
metric::{SentBytes, SentFrames},
|
||||
reactor::ping,
|
||||
RESOLVE_TIMEOUT, SEED_PING_INTERVAL,
|
||||
};
|
||||
|
||||
/// The user-provided seed peer address.
|
||||
///
|
||||
/// NOTE: the IP/socket address this resolves to may change over the
|
||||
/// lifetime of the peer, so the raw address is retained instead of
|
||||
/// the [`SocketAddr`] to ensure it is constantly re-resolved when the peer
|
||||
/// is unreachable.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Seed(String);
|
||||
|
||||
impl Seed {
|
||||
pub(crate) fn new(addr: String) -> Self {
|
||||
Self(addr)
|
||||
}
|
||||
|
||||
/// Resolve this peer address, returning an error if resolution is not
|
||||
/// complete within [`RESOLVE_TIMEOUT`].
|
||||
pub(crate) async fn resolve(&self) -> Option<SocketAddr> {
|
||||
match timeout(RESOLVE_TIMEOUT, resolve(&self.0)).await {
|
||||
Ok(v) => v,
|
||||
Err(_) => {
|
||||
warn!(addr = %self.0, "timeout resolving seed address");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve `addr`, returning the first IP address, if any.
|
||||
async fn resolve(addr: &str) -> Option<SocketAddr> {
|
||||
match net::lookup_host(addr).await.map(|mut v| v.next()) {
|
||||
Ok(Some(v)) => {
|
||||
debug!(%addr, peer=%v, "resolved peer address");
|
||||
Some(v)
|
||||
}
|
||||
Ok(None) => {
|
||||
warn!(%addr, "resolved peer address contains no IPs");
|
||||
None
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(%addr, error=%e, "failed to resolve peer address");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Block forever, sending `ping_frame` over `socket` to all the entries in
|
||||
/// `seeds`.
|
||||
///
|
||||
/// This method immediately pings all the seeds, and then pings periodically at
|
||||
/// [`SEED_PING_INTERVAL`].
|
||||
pub(super) async fn seed_ping_task(
|
||||
seeds: Arc<[Seed]>,
|
||||
socket: Arc<UdpSocket>,
|
||||
ping_frame: Vec<u8>,
|
||||
sent_frames: SentFrames,
|
||||
sent_bytes: SentBytes,
|
||||
) {
|
||||
let mut interval = tokio::time::interval(SEED_PING_INTERVAL);
|
||||
|
||||
// Do not burden seeds with a burst of PING frames to catch up if ticks are missed.
|
||||
interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
|
||||
// Start the ping loop, with the first iteration starting immediately.
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
let bytes_sent = seeds
|
||||
.iter()
|
||||
.map(|seed| async {
|
||||
if let Some(addr) = seed.resolve().await {
|
||||
ping(&ping_frame, &socket, addr, &sent_frames, &sent_bytes).await
|
||||
} else {
|
||||
0
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>()
|
||||
.fold(0, |acc, x| future::ready(acc + x))
|
||||
.await;
|
||||
|
||||
debug!(bytes_sent, "seed ping sweep complete");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use test_helpers::{maybe_start_logging, timeout::FutureTimeout};
|
||||
use tokio::{net::UdpSocket, sync::mpsc};
|
||||
|
||||
use gossip::*;
|
||||
|
||||
/// Assert that starting up a reactor performs the initial peer discovery
|
||||
/// from a set of seeds, resulting in both peers knowing of one another.
|
||||
#[tokio::test]
|
||||
async fn test_payload_exchange() {
|
||||
maybe_start_logging();
|
||||
|
||||
let metrics = Arc::new(metric::Registry::default());
|
||||
|
||||
// How long to wait for peer discovery to complete.
|
||||
const TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
// Bind a UDP socket to a random port
|
||||
let a_socket = UdpSocket::bind("127.0.0.1:0")
|
||||
.await
|
||||
.expect("failed to bind UDP socket");
|
||||
let a_addr = a_socket.local_addr().expect("failed to read local addr");
|
||||
|
||||
// And a socket for the second reactor
|
||||
let b_socket = UdpSocket::bind("127.0.0.1:0")
|
||||
.await
|
||||
.expect("failed to bind UDP socket");
|
||||
let b_addr = b_socket.local_addr().expect("failed to read local addr");
|
||||
|
||||
// Initialise the dispatchers for the reactors
|
||||
let (a_tx, mut a_rx) = mpsc::channel(5);
|
||||
let (b_tx, mut b_rx) = mpsc::channel(5);
|
||||
|
||||
// Initialise both reactors
|
||||
let addrs = vec![a_addr.to_string(), b_addr.to_string()];
|
||||
let a = Builder::new(addrs.clone(), a_tx, Arc::clone(&metrics)).build(a_socket);
|
||||
let b = Builder::new(addrs, b_tx, Arc::clone(&metrics)).build(b_socket);
|
||||
|
||||
// Wait for peer discovery to occur
|
||||
async {
|
||||
loop {
|
||||
if a.get_peers().await.len() == 1 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
.with_timeout_panic(TIMEOUT)
|
||||
.await;
|
||||
|
||||
// Send the payload through peer A
|
||||
let a_payload = Bytes::from_static(b"bananas");
|
||||
a.broadcast(a_payload.clone()).await.unwrap();
|
||||
|
||||
// Assert it was received by peer B
|
||||
let got = b_rx
|
||||
.recv()
|
||||
.with_timeout_panic(TIMEOUT)
|
||||
.await
|
||||
.expect("reactor stopped");
|
||||
assert_eq!(got, a_payload);
|
||||
|
||||
// Do the reverse - send from B to A
|
||||
let b_payload = Bytes::from_static(b"platanos");
|
||||
b.broadcast(b_payload.clone()).await.unwrap();
|
||||
let got = a_rx
|
||||
.recv()
|
||||
.with_timeout_panic(TIMEOUT)
|
||||
.await
|
||||
.expect("reactor stopped");
|
||||
assert_eq!(got, b_payload);
|
||||
|
||||
// Send another payload through peer A (ensuring scratch buffers are
|
||||
// correctly wiped, etc)
|
||||
let a_payload = Bytes::from_static(b"platanos");
|
||||
a.broadcast(a_payload.clone()).await.unwrap();
|
||||
let got = b_rx
|
||||
.recv()
|
||||
.with_timeout_panic(TIMEOUT)
|
||||
.await
|
||||
.expect("reactor stopped");
|
||||
assert_eq!(got, a_payload);
|
||||
}
|
|
@ -6,11 +6,18 @@ edition.workspace = true
|
|||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bytes = "1.4"
|
||||
data_types = { path = "../data_types" }
|
||||
futures-util = { version = "0.3" }
|
||||
generated_types = { path = "../generated_types" }
|
||||
influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format"] }
|
||||
iox_catalog = { path = "../iox_catalog" }
|
||||
parquet_file = { path = "../parquet_file" }
|
||||
object_store = { workspace=true }
|
||||
observability_deps = { path = "../observability_deps" }
|
||||
serde_json = "1.0.100"
|
||||
thiserror = "1.0.41"
|
||||
schema = { path = "../schema" }
|
||||
serde_json = "1.0.102"
|
||||
thiserror = "1.0.43"
|
||||
tokio = { version = "1.29" }
|
||||
tokio-util = { version = "0.7.8" }
|
||||
workspace-hack = { version = "0.1", path = "../workspace-hack" }
|
||||
|
|
|
@ -1,60 +1,132 @@
|
|||
//! Utilities for importing catalog and data from files
|
||||
//! MORE COMING SOON: <https://github.com/influxdata/influxdb_iox/issues/7744>
|
||||
|
||||
use observability_deps::tracing::{debug, warn};
|
||||
use bytes::Bytes;
|
||||
use data_types::{
|
||||
partition_template::{
|
||||
NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, PARTITION_BY_DAY_PROTO,
|
||||
},
|
||||
ColumnSet, ColumnType, CompactionLevel, Namespace, NamespaceName, NamespaceNameError,
|
||||
ParquetFileParams, Partition, PartitionHashId, Statistics, Table, TableId, Timestamp,
|
||||
};
|
||||
use generated_types::influxdata::iox::catalog::v1 as proto;
|
||||
// ParquetFile as ProtoParquetFile, Partition as ProtoPartition,
|
||||
use iox_catalog::interface::{CasFailure, Catalog, RepoCollection, SoftDeletedRows};
|
||||
use object_store::ObjectStore;
|
||||
use observability_deps::tracing::{debug, info, warn};
|
||||
use parquet_file::{
|
||||
metadata::{DecodedIoxParquetMetaData, IoxMetadata, IoxParquetMetaData},
|
||||
ParquetFilePath,
|
||||
};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
io::Read,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ImportError {
|
||||
pub enum Error {
|
||||
#[error("Reading {path:?}: {e}")]
|
||||
Reading { path: PathBuf, e: std::io::Error },
|
||||
|
||||
#[error("Not a directory: {0:?}")]
|
||||
NotDirectory(PathBuf),
|
||||
|
||||
#[error("Error setting sort key: {0}")]
|
||||
SetSortKey(iox_catalog::interface::Error),
|
||||
|
||||
#[error("Error decoding json in {path:?}: {e}")]
|
||||
Json { path: PathBuf, e: serde_json::Error },
|
||||
|
||||
#[error("Parquet Metadata Not Found in {path:?}")]
|
||||
ParquetMetadataNotFound { path: PathBuf },
|
||||
|
||||
#[error("Invalid Parquet Metadata: {0}")]
|
||||
ParquetMetadata(#[from] parquet_file::metadata::Error),
|
||||
|
||||
#[error("Error creating default partition template override: {0}")]
|
||||
PartitionOveride(#[from] data_types::partition_template::ValidationError),
|
||||
|
||||
#[error("Expected timestamp stats to be i64, but got: {stats:?}")]
|
||||
BadStats { stats: Option<Statistics> },
|
||||
|
||||
#[error("Expected timestamp to have both min and max stats, had min={min:?}, max={max:?}")]
|
||||
NoMinMax { min: Option<i64>, max: Option<i64> },
|
||||
|
||||
#[error("Mismatched sort key. Exported sort key is {exported}, existing is {existing}")]
|
||||
MismatchedSortKey { exported: String, existing: String },
|
||||
|
||||
#[error("Unexpected parquet filename. Expected a name like <id>.<partition_id>.parquet, got {path:?}")]
|
||||
UnexpectedFileName { path: PathBuf },
|
||||
|
||||
#[error("Invalid Namespace: {0}")]
|
||||
NamespaceName(#[from] NamespaceNameError),
|
||||
|
||||
#[error(
|
||||
"Unexpected error: cound not find sort key in catalog export or embedded parquet metadata"
|
||||
)]
|
||||
NoSortKey,
|
||||
|
||||
#[error("Unknown compaction level in encoded metadata: {0}")]
|
||||
UnknownCompactionLevel(Box<dyn std::error::Error + std::marker::Send + Sync>),
|
||||
|
||||
#[error("Catalog error: {0}")]
|
||||
Catalog(#[from] iox_catalog::interface::Error),
|
||||
|
||||
#[error("Object store error: {0}")]
|
||||
ObjectStore(#[from] object_store::Error),
|
||||
}
|
||||
|
||||
impl ImportError {
|
||||
impl Error {
|
||||
fn reading(path: impl Into<PathBuf>, e: std::io::Error) -> Self {
|
||||
let path = path.into();
|
||||
Self::Reading { path, e }
|
||||
}
|
||||
}
|
||||
|
||||
type Result<T, E = ImportError> = std::result::Result<T, E>;
|
||||
type Result<T, E = Error> = std::result::Result<T, E>;
|
||||
|
||||
/// Represents the contents of a directory exported using [`RemoteExporter`]
|
||||
/// Represents the contents of a directory exported using
|
||||
/// [`RemoteExporter`]. This is a partial catalog snapshot.
|
||||
///
|
||||
/// [`RemoteExporter`]: crate::file::RemoteExporter
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ExportedContents {
|
||||
// .parquet files
|
||||
/// .parquet files
|
||||
parquet_files: Vec<PathBuf>,
|
||||
|
||||
// .parquet.json files (json that correspond to the parquet files)
|
||||
/// .parquet.json files (json that correspond to the parquet files)
|
||||
parquet_json_files: Vec<PathBuf>,
|
||||
|
||||
// table .json files
|
||||
/// table .json files
|
||||
table_json_files: Vec<PathBuf>,
|
||||
|
||||
// partition .json files
|
||||
/// partition .json files
|
||||
partition_json_files: Vec<PathBuf>,
|
||||
|
||||
/// Decoded partition metadata, found in the export
|
||||
partition_metadata: Vec<proto::Partition>,
|
||||
|
||||
/// Decoded parquet metadata found in the export
|
||||
/// Key is object_store_id, value is decoded metadata
|
||||
parquet_metadata: Vec<proto::ParquetFile>,
|
||||
}
|
||||
|
||||
impl ExportedContents {
|
||||
/// Read the contents of the directory in `dir_path`, categorizing
|
||||
/// files in that directory.
|
||||
pub fn try_new(dir_path: &Path) -> Result<Self> {
|
||||
info!(?dir_path, "Reading exported catalog contents");
|
||||
|
||||
if !dir_path.is_dir() {
|
||||
return Err(ImportError::NotDirectory(dir_path.into()));
|
||||
return Err(Error::NotDirectory(dir_path.into()));
|
||||
};
|
||||
|
||||
let entries: Vec<_> = dir_path
|
||||
.read_dir()
|
||||
.map_err(|e| ImportError::reading(dir_path, e))?
|
||||
.map_err(|e| Error::reading(dir_path, e))?
|
||||
.flatten()
|
||||
.collect();
|
||||
|
||||
|
@ -92,9 +164,50 @@ impl ExportedContents {
|
|||
}
|
||||
}
|
||||
|
||||
new_self.try_decode_files()?;
|
||||
|
||||
Ok(new_self)
|
||||
}
|
||||
|
||||
/// tries to decode all the metadata files found in the export
|
||||
fn try_decode_files(&mut self) -> Result<()> {
|
||||
debug!("Decoding partition files");
|
||||
|
||||
for path in &self.partition_json_files {
|
||||
debug!(?path, "Reading partition json file");
|
||||
let json = std::fs::read_to_string(path).map_err(|e| Error::Reading {
|
||||
path: path.clone(),
|
||||
e,
|
||||
})?;
|
||||
|
||||
let partition: proto::Partition =
|
||||
serde_json::from_str(&json).map_err(|e| Error::Json {
|
||||
path: path.clone(),
|
||||
e,
|
||||
})?;
|
||||
|
||||
self.partition_metadata.push(partition);
|
||||
}
|
||||
|
||||
for path in &self.parquet_json_files {
|
||||
debug!(?path, "Reading parquet json file");
|
||||
let json = std::fs::read_to_string(path).map_err(|e| Error::Reading {
|
||||
path: path.clone(),
|
||||
e,
|
||||
})?;
|
||||
|
||||
let parquet_file: proto::ParquetFile =
|
||||
serde_json::from_str(&json).map_err(|e| Error::Json {
|
||||
path: path.clone(),
|
||||
e,
|
||||
})?;
|
||||
|
||||
self.parquet_metadata.push(parquet_file);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the name of the i'th entry in `self.parquet_files`, if
|
||||
/// any
|
||||
pub fn parquet_file_name(&self, i: usize) -> Option<Cow<'_, str>> {
|
||||
|
@ -116,6 +229,27 @@ impl ExportedContents {
|
|||
pub fn partition_json_files(&self) -> &[PathBuf] {
|
||||
self.partition_json_files.as_ref()
|
||||
}
|
||||
|
||||
/// Returns partition information retrieved from the exported
|
||||
/// catalog, if any, with the given table id and partition key
|
||||
pub fn partition_metadata(
|
||||
&self,
|
||||
table_id: i64,
|
||||
partition_key: &str,
|
||||
) -> Option<proto::Partition> {
|
||||
self.partition_metadata
|
||||
.iter()
|
||||
.find(|p| p.table_id == table_id && p.key == partition_key)
|
||||
.cloned()
|
||||
}
|
||||
|
||||
/// Returns parquet file metadata, for the given object_store id, if any
|
||||
pub fn parquet_metadata(&self, object_store_id: &str) -> Option<proto::ParquetFile> {
|
||||
self.parquet_metadata
|
||||
.iter()
|
||||
.find(|p| p.object_store_id == object_store_id)
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the name of the file
|
||||
|
@ -124,3 +258,440 @@ fn file_name(p: &Path) -> Cow<'_, str> {
|
|||
.map(|p| p.to_string_lossy())
|
||||
.unwrap_or_else(|| Cow::Borrowed(""))
|
||||
}
|
||||
|
||||
/// Imports the contents of a [`ExportedContents`] into a catalog and
|
||||
/// object_store instance
|
||||
#[derive(Debug)]
|
||||
pub struct RemoteImporter {
|
||||
exported_contents: ExportedContents,
|
||||
catalog: Arc<dyn Catalog>,
|
||||
object_store: Arc<dyn ObjectStore>,
|
||||
}
|
||||
|
||||
impl RemoteImporter {
|
||||
pub fn new(
|
||||
exported_contents: ExportedContents,
|
||||
catalog: Arc<dyn Catalog>,
|
||||
object_store: Arc<dyn ObjectStore>,
|
||||
) -> Self {
|
||||
Self {
|
||||
exported_contents,
|
||||
catalog,
|
||||
object_store,
|
||||
}
|
||||
}
|
||||
|
||||
/// Performs the import, reporting progress via log messages and erroring
|
||||
/// if a failure occurs
|
||||
pub async fn import(&self) -> Result<()> {
|
||||
let parquet_files = self.exported_contents.parquet_files();
|
||||
|
||||
let total_files = parquet_files.len();
|
||||
info!(%total_files, "Begin importing files");
|
||||
for (files_done, file) in parquet_files.iter().enumerate() {
|
||||
self.import_parquet(file).await?;
|
||||
|
||||
// print a log message every 50 files
|
||||
if files_done % 50 == 0 {
|
||||
let pct = (files_done as f64 / total_files as f64 * 100.0).floor();
|
||||
info!(%files_done, %total_files, %pct, "Import running");
|
||||
}
|
||||
}
|
||||
|
||||
info!(%total_files, "Completed importing files");
|
||||
Ok(())
|
||||
}
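The progress message above is easy to get subtly wrong: flooring the completion ratio before scaling by 100 reports 0% for every intermediate file. A minimal standalone sketch of the corrected arithmetic (the `pct` helper below is hypothetical, used only for illustration):

```rust
/// Percent-complete for a progress log line: scale first, then floor,
/// so intermediate progress is visible instead of always reporting 0.
fn pct(files_done: usize, total_files: usize) -> f64 {
    (files_done as f64 / total_files as f64 * 100.0).floor()
}

fn main() {
    assert_eq!(pct(50, 200), 25.0);
    assert_eq!(pct(0, 200), 0.0);
    // Flooring the ratio first would have reported 0% here as well.
    assert_eq!((50_f64 / 200_f64).floor() * 100.0, 0.0);
}
```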
|
||||
|
||||
// tries to import the specified parquet file into the catalog
|
||||
async fn import_parquet(&self, file_path: &Path) -> Result<()> {
|
||||
info!(?file_path, "Beginning Import");
|
||||
|
||||
// step 1: figure out the location to write the parquet file in object store and do so
|
||||
let mut in_file =
|
||||
std::fs::File::open(file_path).map_err(|e| Error::reading(file_path, e))?;
|
||||
|
||||
let mut file_bytes = vec![];
|
||||
in_file
|
||||
.read_to_end(&mut file_bytes)
|
||||
.map_err(|e| Error::reading(file_path, e))?;
|
||||
let bytes = Bytes::from(file_bytes);
|
||||
let file_size_bytes = bytes.len();
|
||||
|
||||
let Some(iox_parquet_metadata) = IoxParquetMetaData::from_file_bytes(bytes.clone())? else {
|
||||
return Err(Error::ParquetMetadataNotFound {
|
||||
path: PathBuf::from(file_path)
|
||||
});
|
||||
};
|
||||
|
||||
let decoded_iox_parquet_metadata = iox_parquet_metadata.decode()?;
|
||||
|
||||
let iox_metadata = decoded_iox_parquet_metadata.read_iox_metadata_new()?;
|
||||
|
||||
debug!(?iox_metadata, "read metadata");
|
||||
|
||||
// step 2: Add the appropriate entry to the catalog
|
||||
let namespace_name = iox_metadata.namespace_name.as_ref();
|
||||
let mut repos = self.catalog.repositories().await;
|
||||
|
||||
let namespace = repos
|
||||
.namespaces()
|
||||
.get_by_name(namespace_name, SoftDeletedRows::ExcludeDeleted)
|
||||
.await?;
|
||||
|
||||
// create it if it doesn't exist
|
||||
let namespace = match namespace {
|
||||
Some(namespace) => {
|
||||
debug!(%namespace_name, "Found existing namespace");
|
||||
namespace
|
||||
}
|
||||
None => {
|
||||
let namespace_name = NamespaceName::try_from(namespace_name)?;
|
||||
let partition_template = None;
|
||||
let retention_period_ns = None;
|
||||
let service_protection_limits = None;
|
||||
|
||||
info!(%namespace_name, "Namespace found, creating new namespace");
|
||||
repos
|
||||
.namespaces()
|
||||
.create(
|
||||
&namespace_name,
|
||||
partition_template,
|
||||
retention_period_ns,
|
||||
service_protection_limits,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
let table = self
|
||||
.table_for_parquet_file(repos.as_mut(), &namespace, &iox_metadata)
|
||||
.await?;
|
||||
let table_id = table.id;
|
||||
debug!(%table_id, "Inserting catalog records into table");
|
||||
|
||||
let partition = self
|
||||
.partition_for_parquet_file(repos.as_mut(), &table, &iox_metadata)
|
||||
.await?;
|
||||
|
||||
// Note that for some reason, the object_store_id that is
|
||||
// actually used in object_storage from the source system is
|
||||
// different than what is stored in the metadata embedded in
|
||||
// the parquet file itself. Thus use the object_store_id
|
||||
// encoded into the parquet file name
|
||||
let object_store_id =
|
||||
object_store_id_from_parquet_filename(file_path).ok_or_else(|| {
|
||||
Error::UnexpectedFileName {
|
||||
path: file_path.into(),
|
||||
}
|
||||
})?;
|
||||
debug!(partition_id=%partition.id, %object_store_id, "Inserting into partition");
|
||||
|
||||
let parquet_metadata = self.exported_contents.parquet_metadata(&object_store_id);
|
||||
|
||||
let parquet_params = self
|
||||
.parquet_file_params(
|
||||
repos.as_mut(),
|
||||
&namespace,
|
||||
&table,
|
||||
&partition,
|
||||
parquet_metadata,
|
||||
&iox_metadata,
|
||||
&decoded_iox_parquet_metadata,
|
||||
file_size_bytes,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let object_store_id = parquet_params.object_store_id;
|
||||
let parquet_file = repos.parquet_files().create(parquet_params).await;
|
||||
|
||||
match parquet_file {
|
||||
Ok(parquet_file) => {
|
||||
debug!(parquet_file_id=?parquet_file.id, " Created parquet file entry {}", parquet_file.id);
|
||||
}
|
||||
Err(iox_catalog::interface::Error::FileExists { .. }) => {
|
||||
warn!(%object_store_id, "parquet file already exists, skipping");
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(Error::Catalog(e));
|
||||
}
|
||||
};
|
||||
|
||||
// Now copy the parquet files into the object store
|
||||
//let partition_id = TransitionPartitionId::Deprecated(partition.id);
|
||||
let transition_partition_id = partition.transition_partition_id();
|
||||
|
||||
let parquet_path = ParquetFilePath::new(
|
||||
namespace.id,
|
||||
table_id,
|
||||
&transition_partition_id,
|
||||
object_store_id,
|
||||
);
|
||||
let object_store_path = parquet_path.object_store_path();
|
||||
debug!(?object_store_path, "copying data to object store");
|
||||
self.object_store.put(&object_store_path, bytes).await?;
|
||||
|
||||
info!(?file_path, %namespace_name, %object_store_path, %transition_partition_id, %table_id, "Successfully imported file");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return the relevant catalog [`Table`] for the specified parquet
|
||||
/// file.
|
||||
///
|
||||
/// If the table does not yet exist, it is created, using any
|
||||
/// available catalog metadata and falling back to what is in the
|
||||
/// iox metadata if needed
|
||||
async fn table_for_parquet_file(
|
||||
&self,
|
||||
repos: &mut dyn RepoCollection,
|
||||
namespace: &Namespace,
|
||||
iox_metadata: &IoxMetadata,
|
||||
) -> Result<Table> {
|
||||
let tables = repos.tables();
|
||||
|
||||
// Note the export format doesn't currently have any table level information
|
||||
let table_name = iox_metadata.table_name.as_ref();
|
||||
|
||||
if let Some(table) = tables
|
||||
.get_by_namespace_and_name(namespace.id, table_name)
|
||||
.await?
|
||||
{
|
||||
return Ok(table);
|
||||
}
|
||||
|
||||
// need to make a new table, create the default partitioning scheme...
|
||||
let partition_template = PARTITION_BY_DAY_PROTO.as_ref().clone();
|
||||
let namespace_template = NamespacePartitionTemplateOverride::try_from(partition_template)?;
|
||||
let custom_table_template = None;
|
||||
let partition_template =
|
||||
TablePartitionTemplateOverride::try_new(custom_table_template, &namespace_template)?;
|
||||
|
||||
let table = tables
|
||||
.create(table_name, partition_template, namespace.id)
|
||||
.await?;
|
||||
Ok(table)
|
||||
}
|
||||
|
||||
/// Return the catalog [`Partition`] into which the specified parquet
|
||||
/// file should be inserted.
|
||||
///
|
||||
/// First attempts to use any available metadata from the
|
||||
/// catalog export, and falls back to what is in the iox
|
||||
/// metadata stored in the parquet file, if needed
|
||||
async fn partition_for_parquet_file(
|
||||
&self,
|
||||
repos: &mut dyn RepoCollection,
|
||||
table: &Table,
|
||||
iox_metadata: &IoxMetadata,
|
||||
) -> Result<Partition> {
|
||||
let partition_key = iox_metadata.partition_key.clone();
|
||||
|
||||
let partition = repos
|
||||
.partitions()
|
||||
.create_or_get(partition_key.clone(), table.id)
|
||||
.await?;
|
||||
|
||||
// Note we use the table_id embedded in the file's metadata
|
||||
// from the source catalog to match the exported catalog (which
|
||||
// is different from the new table we just created in the
|
||||
// target catalog);
|
||||
let proto_partition = self
|
||||
.exported_contents
|
||||
.partition_metadata(iox_metadata.table_id.get(), partition_key.inner());
|
||||
|
||||
let new_sort_key: Vec<&str> = if let Some(proto_partition) = proto_partition.as_ref() {
|
||||
// Use the sort key from the source catalog
|
||||
debug!(array_sort_key=?proto_partition.array_sort_key, "Using sort key from catalog export");
|
||||
proto_partition
|
||||
.array_sort_key
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.collect()
|
||||
} else {
|
||||
warn!("Could not find sort key in catalog metadata export, falling back to embedded metadata");
|
||||
let sort_key = iox_metadata
|
||||
.sort_key
|
||||
.as_ref()
|
||||
.ok_or_else(|| Error::NoSortKey)?;
|
||||
|
||||
sort_key.to_columns().collect()
|
||||
};
|
||||
|
||||
if !partition.sort_key.is_empty() && partition.sort_key != new_sort_key {
|
||||
let exported = new_sort_key.join(",");
|
||||
let existing = partition.sort_key.join(",");
|
||||
return Err(Error::MismatchedSortKey { exported, existing });
|
||||
}
|
||||
|
||||
loop {
|
||||
let res = repos
|
||||
.partitions()
|
||||
.cas_sort_key(
|
||||
&partition.transition_partition_id(),
|
||||
Some(partition.sort_key.clone()),
|
||||
&new_sort_key,
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(partition) => return Ok(partition),
|
||||
Err(CasFailure::ValueMismatch(_)) => {
|
||||
debug!("Value mismatch when setting sort key, retrying...");
|
||||
continue;
|
||||
}
|
||||
Err(CasFailure::QueryError(e)) => return Err(Error::SetSortKey(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a [`ParquetFileParams`] (information needed to insert
|
||||
/// the data into the target catalog).
|
||||
///
|
||||
/// First attempts to use any available metadata from the
|
||||
/// catalog export, and falls back to what is in the iox
|
||||
/// metadata stored in the parquet file, if needed
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn parquet_file_params(
|
||||
&self,
|
||||
repos: &mut dyn RepoCollection,
|
||||
namespace: &Namespace,
|
||||
table: &Table,
|
||||
partition: &Partition,
|
||||
// parquet metadata, if known
|
||||
parquet_metadata: Option<proto::ParquetFile>,
|
||||
iox_metadata: &IoxMetadata,
|
||||
decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData,
|
||||
file_size_bytes: usize,
|
||||
) -> Result<ParquetFileParams> {
|
||||
let object_store_id = iox_metadata.object_store_id;
|
||||
|
||||
// need to make columns in the target catalog
|
||||
let column_set = insert_columns(table.id, decoded_iox_parquet_metadata, repos).await?;
|
||||
|
||||
// Create the partition_hash_id
|
||||
let partition_hash_id = Some(PartitionHashId::new(table.id, &partition.partition_key));
|
||||
|
||||
let params = if let Some(proto_parquet_file) = &parquet_metadata {
|
||||
let compaction_level = proto_parquet_file
|
||||
.compaction_level
|
||||
.try_into()
|
||||
.map_err(Error::UnknownCompactionLevel)?;
|
||||
|
||||
ParquetFileParams {
|
||||
namespace_id: namespace.id,
|
||||
table_id: table.id,
|
||||
partition_hash_id,
|
||||
partition_id: partition.id,
|
||||
object_store_id,
|
||||
min_time: Timestamp::new(proto_parquet_file.min_time),
|
||||
max_time: Timestamp::new(proto_parquet_file.max_time),
|
||||
file_size_bytes: proto_parquet_file.file_size_bytes,
|
||||
row_count: proto_parquet_file.row_count,
|
||||
compaction_level,
|
||||
created_at: Timestamp::new(proto_parquet_file.created_at),
|
||||
column_set,
|
||||
max_l0_created_at: Timestamp::new(proto_parquet_file.max_l0_created_at),
|
||||
}
|
||||
} else {
|
||||
warn!("Could not read parquet file metadata, reconstructing based on encoded metadata");
|
||||
|
||||
let (min_time, max_time) = get_min_max_times(decoded_iox_parquet_metadata)?;
|
||||
let created_at = Timestamp::new(iox_metadata.creation_timestamp.timestamp_nanos());
|
||||
ParquetFileParams {
|
||||
namespace_id: namespace.id,
|
||||
table_id: table.id,
|
||||
partition_hash_id,
|
||||
partition_id: partition.id,
|
||||
object_store_id,
|
||||
min_time,
|
||||
max_time,
|
||||
// use unwrap: if we can't fit the file size or row
|
||||
// counts into i64, something is very wrong and we
|
||||
// should stop immediately (and get an exact stack trace)
|
||||
file_size_bytes: file_size_bytes.try_into().unwrap(),
|
||||
row_count: decoded_iox_parquet_metadata.row_count().try_into().unwrap(),
|
||||
//compaction_level: CompactionLevel::Final,
|
||||
compaction_level: CompactionLevel::Initial,
|
||||
created_at,
|
||||
column_set,
|
||||
max_l0_created_at: created_at,
|
||||
}
|
||||
};
|
||||
debug!(?params, "Created ParquetFileParams");
|
||||
Ok(params)
|
||||
}
|
||||
}
|
||||
/// Returns a `ColumnSet` that represents all the columns specified in
|
||||
/// `decoded_iox_parquet_metadata`.
|
||||
///
|
||||
/// Insert the appropriate column entries in the catalog if they are not
|
||||
/// already present.
|
||||
async fn insert_columns(
|
||||
table_id: TableId,
|
||||
decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData,
|
||||
repos: &mut dyn RepoCollection,
|
||||
) -> Result<ColumnSet> {
|
||||
let schema = decoded_iox_parquet_metadata.read_schema()?;
|
||||
|
||||
let mut column_ids = vec![];
|
||||
|
||||
for (iox_column_type, field) in schema.iter() {
|
||||
let column_name = field.name();
|
||||
let column_type = ColumnType::from(iox_column_type);
|
||||
|
||||
let column = repos
|
||||
.columns()
|
||||
.create_or_get(column_name, table_id, column_type)
|
||||
.await?;
|
||||
column_ids.push(column.id);
|
||||
}
|
||||
|
||||
Ok(ColumnSet::new(column_ids))
|
||||
}
|
||||
|
||||
/// Reads the min and max values of the time column from `decoded_iox_parquet_metadata`
|
||||
fn get_min_max_times(
|
||||
decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData,
|
||||
) -> Result<(Timestamp, Timestamp)> {
|
||||
let schema = decoded_iox_parquet_metadata.read_schema()?;
|
||||
let stats = decoded_iox_parquet_metadata.read_statistics(&schema)?;
|
||||
|
||||
let Some(summary) = stats
|
||||
.iter()
|
||||
.find(|s| s.name == schema::TIME_COLUMN_NAME) else {
|
||||
return Err(Error::BadStats { stats: None });
|
||||
};
|
||||
|
||||
let Statistics::I64(stats) = &summary.stats else {
|
||||
return Err(Error::BadStats { stats: Some(summary.stats.clone()) });
|
||||
};
|
||||
|
||||
let (Some(min), Some(max)) = (stats.min, stats.max) else {
|
||||
return Err(Error::NoMinMax {
|
||||
min: stats.min,
|
||||
max: stats.max,
|
||||
})
|
||||
};
|
||||
|
||||
Ok((Timestamp::new(min), Timestamp::new(max)))
|
||||
}
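As a side note on the `let`-`else` extraction used in `get_min_max_times`, here is a minimal standalone sketch with simplified stand-in types (the `Statistics` enum below is hypothetical, not the real iox type): it pulls i64 min/max out of a column summary and bails with a descriptive error otherwise.

```rust
// Simplified stand-in for the real column statistics type; for illustration only.
#[derive(Debug)]
enum Statistics {
    I64 { min: Option<i64>, max: Option<i64> },
    Unsupported,
}

fn min_max(stats: &Statistics) -> Result<(i64, i64), String> {
    // Bail out unless the column carries i64 statistics.
    let Statistics::I64 { min, max } = stats else {
        return Err(format!("expected i64 statistics, got {stats:?}"));
    };
    // Bail out unless both bounds were actually recorded.
    let (Some(min), Some(max)) = (min, max) else {
        return Err("no min/max recorded".to_string());
    };
    Ok((*min, *max))
}

fn main() {
    let stats = Statistics::I64 { min: Some(123_456), max: Some(789_000) };
    assert_eq!(min_max(&stats), Ok((123_456, 789_000)));
    assert!(min_max(&Statistics::Unsupported).is_err());
}
```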
|
||||
|
||||
/// Given the filename of a stored parquet file, returns the object_store_id
|
||||
///
|
||||
/// For example, `e65790df-3e42-0094-048f-0b69a7ee402c.13180488.parquet`,
|
||||
/// returns `e65790df-3e42-0094-048f-0b69a7ee402c`
|
||||
///
|
||||
/// For some reason the object store id embedded in the parquet file's
|
||||
/// [`IoxMetadata`] and that of the actual file in object storage are
|
||||
/// different, so we need to use the object_store_id actually used in
|
||||
/// the source system, which is embedded in the filename
|
||||
fn object_store_id_from_parquet_filename(path: &Path) -> Option<String> {
|
||||
let stem = path
|
||||
// <uuid>.partition_id.parquet --> <uuid>.partition_id
|
||||
.file_stem()?
|
||||
.to_string_lossy();
|
||||
|
||||
// <uuid>.partition_id --> (<uuid>, partition_id)
|
||||
let (object_store_id, _partition_id) = stem.split_once('.')?;
|
||||
|
||||
Some(object_store_id.to_string())
|
||||
}
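Since the filename convention above carries the authoritative object_store_id, here is a minimal standalone check of that parsing; the helper simply mirrors the function shown above and uses the example filename from its doc comment.

```rust
use std::path::Path;

// Mirrors the helper above: strip the `.parquet` extension, then take
// everything before the first remaining dot as the object_store_id.
fn object_store_id_from_parquet_filename(path: &Path) -> Option<String> {
    // <uuid>.partition_id.parquet --> <uuid>.partition_id
    let stem = path.file_stem()?.to_string_lossy();
    // <uuid>.partition_id --> <uuid>
    let (object_store_id, _partition_id) = stem.split_once('.')?;
    Some(object_store_id.to_string())
}

fn main() {
    let path = Path::new("e65790df-3e42-0094-048f-0b69a7ee402c.13180488.parquet");
    assert_eq!(
        object_store_id_from_parquet_filename(path).as_deref(),
        Some("e65790df-3e42-0094-048f-0b69a7ee402c")
    );
    // A filename without the `.partition_id` component yields None.
    assert_eq!(object_store_id_from_parquet_filename(Path::new("plain.parquet")), None);
}
```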
|
||||
|
|
|
@ -3,4 +3,4 @@ mod export;
|
|||
mod import;
|
||||
|
||||
pub use export::{ExportError, RemoteExporter};
|
||||
pub use import::{ExportedContents, ImportError};
|
||||
pub use import::{Error, ExportedContents, RemoteImporter};
|
||||
|
|
|
@ -10,7 +10,7 @@ bytes = "1.4"
|
|||
futures = { version = "0.3", default-features = false }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0.100"
|
||||
serde_json = "1.0.102"
|
||||
snafu = "0.7"
|
||||
url = "2.4.0"
|
||||
uuid = { version = "1", features = ["v4"] }
|
||||
|
|
|
@ -67,10 +67,10 @@ libc = { version = "0.2" }
|
|||
num_cpus = "1.16.0"
|
||||
once_cell = { version = "1.18", features = ["parking_lot"] }
|
||||
rustyline = { version = "12.0", default-features = false, features = ["with-file-history"]}
|
||||
serde_json = "1.0.100"
|
||||
serde_json = "1.0.102"
|
||||
snafu = "0.7"
|
||||
tempfile = "3.6.0"
|
||||
thiserror = "1.0.41"
|
||||
thiserror = "1.0.43"
|
||||
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
|
||||
tokio = { version = "1.29", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time", "io-std"] }
|
||||
tokio-stream = { version = "0.1", features = ["net"] }
|
||||
|
@ -91,9 +91,9 @@ async-trait = "0.1"
|
|||
mutable_batch = { path = "../mutable_batch" }
|
||||
predicate = { path = "../predicate" }
|
||||
predicates = "3.0.3"
|
||||
pretty_assertions = "1.3.0"
|
||||
pretty_assertions = "1.4.0"
|
||||
proptest = { version = "1.2.0", default-features = false }
|
||||
serde = "1.0.166"
|
||||
serde = "1.0.168"
|
||||
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
|
||||
test_helpers_end_to_end = { path = "../test_helpers_end_to_end" }
|
||||
insta = { version = "1", features = ["yaml"] }
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
//! This module implements the `build_catalog` CLI command
|
||||
use import_export::file::{ExportedContents, RemoteImporter};
|
||||
use iox_catalog::interface::Catalog;
|
||||
use metric::Registry;
|
||||
use object_store::ObjectStore;
|
||||
use observability_deps::tracing::info;
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
#[derive(Debug, Snafu)]
|
||||
pub enum Error {
|
||||
#[snafu(display("Not yet implemented"))]
|
||||
NotYetImplemented,
|
||||
|
||||
#[snafu(display("Catalog error:: {}", source))]
|
||||
#[snafu(context(false))]
|
||||
Catalog {
|
||||
source: iox_catalog::interface::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Object store error:: {}", source))]
|
||||
#[snafu(context(false))]
|
||||
ObjectStore { source: object_store::Error },
|
||||
|
||||
#[snafu(display("Import error:: {}", source))]
|
||||
#[snafu(context(false))]
|
||||
Import { source: import_export::file::Error },
|
||||
|
||||
#[snafu(display("Cannot {} output file '{:?}': {}", operation, path, source))]
|
||||
File {
|
||||
operation: String,
|
||||
path: PathBuf,
|
||||
source: std::io::Error,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Config {
|
||||
/// Directory containing the output of running `influxdb_iox remote store get-table`
|
||||
#[clap(value_parser)]
|
||||
input_dir: PathBuf,
|
||||
|
||||
/// Target data directory in which to create a sqlite catalog and file
|
||||
/// object_store.
|
||||
///
|
||||
/// After a successful rebuild, you can examine the catalog locally using
|
||||
/// `influxdb_iox --data-dir <dir>`.
|
||||
#[clap(value_parser)]
|
||||
pub data_dir: PathBuf,
|
||||
}
|
||||
|
||||
pub async fn command(config: Config) -> Result<(), Error> {
|
||||
let Config {
|
||||
input_dir,
|
||||
data_dir,
|
||||
} = config;
|
||||
|
||||
let exported_contents = ExportedContents::try_new(&input_dir)?;
|
||||
|
||||
// create a catalog / object store
|
||||
let catalog = get_catalog(&data_dir).await?;
|
||||
catalog.setup().await?;
|
||||
|
||||
let object_store = get_object_store(&data_dir)?;
|
||||
|
||||
info!("Initialized catalog, object store, and input path ...");
|
||||
|
||||
let importer = RemoteImporter::new(exported_contents, catalog, object_store);
|
||||
|
||||
info!(
|
||||
?input_dir,
|
||||
?data_dir,
|
||||
"Beginning catalog / object_store build"
|
||||
);
|
||||
|
||||
Ok(importer.import().await?)
|
||||
}
|
||||
|
||||
async fn get_catalog(data_dir: &Path) -> Result<Arc<dyn Catalog>> {
|
||||
std::fs::create_dir_all(data_dir).context(FileSnafu {
|
||||
operation: "create data directory",
|
||||
path: data_dir,
|
||||
})?;
|
||||
|
||||
let file_path = data_dir.join("catalog.sqlite");
|
||||
let metrics = Arc::new(Registry::default());
|
||||
let options = iox_catalog::sqlite::SqliteConnectionOptions {
|
||||
file_path: file_path.display().to_string(),
|
||||
};
|
||||
|
||||
info!(?file_path, "Using sqlite local catalog");
|
||||
let catalog = iox_catalog::sqlite::SqliteCatalog::connect(options, metrics).await?;
|
||||
Ok(Arc::new(catalog))
|
||||
}
|
||||
|
||||
fn get_object_store(data_dir: &Path) -> Result<Arc<dyn ObjectStore>> {
|
||||
let os_dir = data_dir.join("object_store");
|
||||
std::fs::create_dir_all(&os_dir).context(FileSnafu {
|
||||
operation: "create object_store directory",
|
||||
path: &os_dir,
|
||||
})?;
|
||||
|
||||
info!(?os_dir, "Using local object store");
|
||||
let object_store = object_store::local::LocalFileSystem::new_with_prefix(os_dir)?;
|
||||
|
||||
Ok(Arc::new(object_store))
|
||||
}
|
|
@ -2,6 +2,7 @@ use futures::Future;
|
|||
use influxdb_iox_client::connection::Connection;
|
||||
use snafu::prelude::*;
|
||||
|
||||
mod build_catalog;
|
||||
mod parquet_to_lp;
|
||||
mod print_cpu;
|
||||
mod schema;
|
||||
|
@ -14,6 +15,10 @@ pub enum Error {
|
|||
#[snafu(display("Error in schema subcommand: {}", source))]
|
||||
Schema { source: schema::Error },
|
||||
|
||||
#[snafu(context(false))]
|
||||
#[snafu(display("Error in build_catalog subcommand: {}", source))]
|
||||
BuildCatalog { source: build_catalog::Error },
|
||||
|
||||
#[snafu(context(false))]
|
||||
#[snafu(display("Error in parquet_to_lp subcommand: {}", source))]
|
||||
ParquetToLp { source: parquet_to_lp::Error },
|
||||
|
@ -44,6 +49,23 @@ enum Command {
|
|||
/// Interrogate the schema of a namespace
|
||||
Schema(schema::Config),
|
||||
|
||||
// NB: The example formatting below is weird so that Clap makes a nice help text
|
||||
/// Build a local catalog from the output of `remote get-table`.
|
||||
///
|
||||
/// For example:
|
||||
/// ```text
|
||||
/// # download contents of table_name into a directory named 'table_name'
|
||||
/// influxdb_iox remote get-table <namespace> <table_name>
|
||||
///
|
||||
/// # Create a catalog and object_store in /tmp/data_dir
|
||||
/// influxdb_iox debug build-catalog <table_dir> /tmp/data_dir
|
||||
///
|
||||
/// # Start iox using this data directory (you can now query `table_name` locally):
|
||||
/// influxdb_iox --data-dir /tmp/data_dir
|
||||
/// ```
|
||||
#[clap(verbatim_doc_comment)]
|
||||
BuildCatalog(build_catalog::Config),
|
||||
|
||||
/// Convert IOx Parquet files back into line protocol format
|
||||
ParquetToLp(parquet_to_lp::Config),
|
||||
|
||||
|
@ -65,6 +87,7 @@ where
|
|||
let connection = connection().await;
|
||||
schema::command(connection, config).await?
|
||||
}
|
||||
Command::BuildCatalog(config) => build_catalog::command(config).await?,
|
||||
Command::ParquetToLp(config) => parquet_to_lp::command(config).await?,
|
||||
Command::SkippedCompactions(config) => {
|
||||
let connection = connection().await;
|
||||
|
|
|
@ -41,7 +41,10 @@ struct Get {
|
|||
file_name: String,
|
||||
}
|
||||
|
||||
/// Get data for a particular namespace's table into a local directory
|
||||
/// Get data for a particular namespace's table into a local directory.
|
||||
///
|
||||
/// See `influxdb_iox debug build-catalog` to create a local catalog
|
||||
/// from these files.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
struct GetTable {
|
||||
/// The namespace to get the Parquet files for
|
||||
|
|
|
@ -33,7 +33,6 @@ use object_store::DynObjectStore;
|
|||
use observability_deps::tracing::*;
|
||||
use parquet_file::storage::{ParquetStorage, StorageId};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
num::NonZeroUsize,
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
|
@ -436,6 +435,15 @@ impl Config {
|
|||
catalog_dsn.dsn = Some(dsn);
|
||||
};
|
||||
|
||||
// TODO: make num_threads a parameter (other modes have it
|
||||
// configured by a command line)
|
||||
let num_threads =
|
||||
NonZeroUsize::new(num_cpus::get()).unwrap_or_else(|| NonZeroUsize::new(1).unwrap());
|
||||
|
||||
// Target allowing the compactor to use as many as 1/2 the
|
||||
// cores by default, but at least one.
|
||||
let compactor_concurrency = NonZeroUsize::new((num_threads.get() / 2).max(1)).unwrap();
|
||||
|
||||
let ingester_addresses =
|
||||
vec![IngesterAddress::from_str(&ingester_grpc_bind_address.to_string()).unwrap()];
|
||||
|
||||
|
@ -487,15 +495,15 @@ impl Config {
|
|||
// parameters are redundant with ingester's
|
||||
let compactor_config = CompactorConfig {
|
||||
compactor_scheduler_config,
|
||||
compaction_partition_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
compaction_df_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
compaction_partition_scratchpad_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
query_exec_thread_count: Some(NonZeroUsize::new(1).unwrap()),
|
||||
compaction_partition_concurrency: compactor_concurrency,
|
||||
compaction_df_concurrency: compactor_concurrency,
|
||||
compaction_partition_scratchpad_concurrency: compactor_concurrency,
|
||||
query_exec_thread_count: Some(num_threads),
|
||||
exec_mem_pool_bytes,
|
||||
max_desired_file_size_bytes: 30_000,
|
||||
max_desired_file_size_bytes: 100 * 1024 * 1024, // 100 MB
|
||||
percentage_max_file_size: 30,
|
||||
split_percentage: 80,
|
||||
partition_timeout_secs: 0,
|
||||
partition_timeout_secs: 30 * 60, // 30 minutes
|
||||
shadow_mode: false,
|
||||
enable_scratchpad: true,
|
||||
ignore_partition_skip_marker: false,
|
||||
|
@ -519,6 +527,8 @@ impl Config {
|
|||
};
|
||||
|
||||
SpecializedConfig {
|
||||
num_threads,
|
||||
|
||||
router_run_config,
|
||||
querier_run_config,
|
||||
|
||||
|
@ -539,7 +549,10 @@ impl Config {
|
|||
/// Panics if the directory does not exist and cannot be created
|
||||
fn ensure_directory_exists(p: &Path) {
|
||||
if !p.exists() {
|
||||
println!("Creating directory {p:?}");
|
||||
info!(
|
||||
p=%p.display(),
|
||||
"Creating directory",
|
||||
);
|
||||
std::fs::create_dir_all(p).expect("Could not create default directory");
|
||||
}
|
||||
}
|
||||
|
@ -547,6 +560,8 @@ fn ensure_directory_exists(p: &Path) {
|
|||
/// Different run configs for the different services (needed as they
|
||||
/// listen on different ports)
|
||||
struct SpecializedConfig {
|
||||
num_threads: NonZeroUsize,
|
||||
|
||||
router_run_config: RunConfig,
|
||||
querier_run_config: RunConfig,
|
||||
ingester_run_config: RunConfig,
|
||||
|
@ -561,6 +576,7 @@ struct SpecializedConfig {
|
|||
|
||||
pub async fn command(config: Config) -> Result<()> {
|
||||
let SpecializedConfig {
|
||||
num_threads,
|
||||
router_run_config,
|
||||
querier_run_config,
|
||||
ingester_run_config,
|
||||
|
@ -592,20 +608,23 @@ pub async fn command(config: Config) -> Result<()> {
|
|||
// create common state from the router and use it below
|
||||
let common_state = CommonServerState::from_config(router_run_config.clone())?;
|
||||
|
||||
// TODO: make num_threads a parameter (other modes have it
|
||||
// configured by a command line)
|
||||
let num_threads = NonZeroUsize::new(num_cpus::get())
|
||||
.unwrap_or_else(|| NonZeroUsize::new(1).expect("1 is valid"));
|
||||
info!(%num_threads, "Creating shared query executor");
|
||||
|
||||
let parquet_store_real = ParquetStorage::new(Arc::clone(&object_store), StorageId::from("iox"));
|
||||
let parquet_store_scratchpad = ParquetStorage::new(
|
||||
Arc::new(MetricsStore::new(
|
||||
Arc::new(object_store::memory::InMemory::new()),
|
||||
&metrics,
|
||||
"scratchpad",
|
||||
)),
|
||||
StorageId::from("iox_scratchpad"),
|
||||
);
|
||||
let exec = Arc::new(Executor::new_with_config(ExecutorConfig {
|
||||
num_threads,
|
||||
target_query_partitions: num_threads,
|
||||
object_stores: HashMap::from([(
|
||||
parquet_store_real.id(),
|
||||
Arc::clone(parquet_store_real.object_store()),
|
||||
)]),
|
||||
object_stores: [&parquet_store_real, &parquet_store_scratchpad]
|
||||
.into_iter()
|
||||
.map(|store| (store.id(), Arc::clone(store.object_store())))
|
||||
.collect(),
|
||||
metric_registry: Arc::clone(&metrics),
|
||||
mem_pool_size: querier_config.exec_mem_pool_bytes,
|
||||
}));
|
||||
|
@ -633,14 +652,6 @@ pub async fn command(config: Config) -> Result<()> {
|
|||
.expect("failed to start ingester");
|
||||
|
||||
info!("starting compactor");
|
||||
let parquet_store_scratchpad = ParquetStorage::new(
|
||||
Arc::new(MetricsStore::new(
|
||||
Arc::new(object_store::memory::InMemory::new()),
|
||||
&metrics,
|
||||
"scratchpad",
|
||||
)),
|
||||
StorageId::from("iox_scratchpad"),
|
||||
);
|
||||
|
||||
let compactor = create_compactor_server_type(
|
||||
&common_state,
|
||||
|
@ -663,6 +674,10 @@ pub async fn command(config: Config) -> Result<()> {
|
|||
exec,
|
||||
time_provider,
|
||||
querier_config,
|
||||
trace_context_header_name: querier_run_config
|
||||
.tracing_config()
|
||||
.traces_jaeger_trace_context_header_name
|
||||
.clone(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
|
|
@ -115,6 +115,11 @@ pub async fn command(config: Config) -> Result<(), Error> {
|
|||
exec,
|
||||
time_provider,
|
||||
querier_config: config.querier_config,
|
||||
trace_context_header_name: config
|
||||
.run_config
|
||||
.tracing_config()
|
||||
.traces_jaeger_trace_context_header_name
|
||||
.clone(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
|
|
@ -1,5 +1,22 @@
|
|||
//! Tests the `influxdb_iox debug` commands
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_util::assert_batches_sorted_eq;
|
||||
use assert_cmd::Command;
|
||||
use futures::FutureExt;
|
||||
use predicates::prelude::*;
|
||||
use tempfile::TempDir;
|
||||
use test_helpers::timeout::FutureTimeout;
|
||||
use test_helpers_end_to_end::{
|
||||
maybe_skip_integration, run_sql, MiniCluster, ServerFixture, Step, StepTest, StepTestState,
|
||||
TestConfig,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_git_version() {
|
||||
|
@ -27,3 +44,261 @@ async fn test_print_cpu() {
|
|||
"rustc is using the following target options",
|
||||
));
|
||||
}
|
||||
|
||||
/// Tests that we can
|
||||
///
|
||||
/// 1. export a table from one IOx instance into a directory of files
|
||||
/// 2. build a catalog from that directory of files
|
||||
/// 3. start an all-in-one instance from that rebuilt catalog
|
||||
/// 4. run a query successfully
|
||||
#[tokio::test]
|
||||
// Ignore due to https://github.com/influxdata/influxdb_iox/issues/8203
|
||||
#[ignore]
|
||||
async fn build_catalog() {
|
||||
test_helpers::maybe_start_logging();
|
||||
let database_url = maybe_skip_integration!();
|
||||
let table_name = "my_awesome_table";
|
||||
|
||||
let mut cluster = MiniCluster::create_shared(database_url).await;
|
||||
|
||||
let sql = "select tag1, tag2, val from my_awesome_table";
|
||||
let expected = [
|
||||
"+------+------+-----+",
|
||||
"| tag1 | tag2 | val |",
|
||||
"+------+------+-----+",
|
||||
"| C | D | 43 |",
|
||||
"+------+------+-----+",
|
||||
];
|
||||
|
||||
StepTest::new(
|
||||
&mut cluster,
|
||||
vec![
|
||||
// Persist some data
|
||||
Step::RecordNumParquetFiles,
|
||||
Step::WriteLineProtocol(format!("{table_name},tag1=C,tag2=D val=43i 123456")),
|
||||
Step::WaitForPersisted {
|
||||
expected_increase: 1,
|
||||
},
|
||||
Step::Query {
|
||||
sql: sql.to_string(),
|
||||
expected: expected.to_vec(),
|
||||
},
|
||||
Step::Custom(Box::new(move |state: &mut StepTestState| {
|
||||
async move {
|
||||
let router_addr = state.cluster().router().router_grpc_base().to_string();
|
||||
let namespace = state.cluster().namespace().to_string();
|
||||
|
||||
// directory to export files to
|
||||
let export_dir =
|
||||
tempfile::tempdir().expect("could not get temporary directory");
|
||||
|
||||
// call `influxdb_iox remote store get-table <namespace> <table_name>`
|
||||
// to export the table to a temporary directory
|
||||
Command::cargo_bin("influxdb_iox")
|
||||
.unwrap()
|
||||
.current_dir(export_dir.as_ref())
|
||||
.arg("-h")
|
||||
.arg(&router_addr)
|
||||
.arg("remote")
|
||||
.arg("store")
|
||||
.arg("get-table")
|
||||
.arg(&namespace)
|
||||
.arg(table_name)
|
||||
.assert()
|
||||
.success();
|
||||
|
||||
// Data is exported in <export_dir>/table_name
|
||||
let table_dir = export_dir.path().join(table_name);
|
||||
|
||||
// We can build a catalog and start up the server and run a query
|
||||
let restarted = RestartedServer::build_catalog_and_start(&table_dir).await;
|
||||
let batches = restarted
|
||||
.run_sql_until_non_empty(sql, namespace.as_str())
|
||||
.await;
|
||||
assert_batches_sorted_eq!(&expected, &batches);
|
||||
|
||||
// We can also rebuild a catalog from just the parquet files
|
||||
let only_parquet_dir = copy_only_parquet_files(&table_dir);
|
||||
let restarted =
|
||||
RestartedServer::build_catalog_and_start(only_parquet_dir.path()).await;
|
||||
let batches = restarted
|
||||
.run_sql_until_non_empty(sql, namespace.as_str())
|
||||
.await;
|
||||
assert_batches_sorted_eq!(&expected, &batches);
|
||||
}
|
||||
.boxed()
|
||||
})),
|
||||
],
|
||||
)
|
||||
.run()
|
||||
.await
|
||||
}
|
||||
|
||||
/// An all-in-one instance, with data directory `data_dir`
|
||||
struct RestartedServer {
|
||||
all_in_one: ServerFixture,
|
||||
|
||||
/// data_dir is held so the temp dir is only cleaned on drop
|
||||
#[allow(dead_code)]
|
||||
data_dir: TempDir,
|
||||
}
|
||||
|
||||
impl RestartedServer {
|
||||
async fn run_sql(
|
||||
&self,
|
||||
sql: impl Into<String>,
|
||||
namespace: impl Into<String>,
|
||||
) -> Vec<RecordBatch> {
|
||||
let (batches, _schema) = run_sql(
|
||||
sql,
|
||||
namespace,
|
||||
self.all_in_one.querier_grpc_connection(),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.await;
|
||||
|
||||
batches
|
||||
}
|
||||
|
||||
/// Builds a catalog from an export directory and starts an all-in-one
|
||||
/// instance using that directory.
|
||||
async fn build_catalog_and_start(exported_table_dir: &Path) -> Self {
|
||||
// directory to rebuild catalog in
|
||||
let data_dir = tempfile::tempdir().expect("could not get temporary directory");
|
||||
|
||||
println!("Input directory: {exported_table_dir:?}");
|
||||
println!("target_directory: {data_dir:?}");
|
||||
|
||||
// call `influxdb_iox debug build-catalog <table_dir> <new_data_dir>`
|
||||
let cmd = Command::cargo_bin("influxdb_iox")
|
||||
.unwrap()
|
||||
// use -vv to enable logging so we can check the status messages
|
||||
.arg("-vv")
|
||||
.arg("debug")
|
||||
.arg("build-catalog")
|
||||
.arg(exported_table_dir.as_os_str().to_str().unwrap())
|
||||
.arg(data_dir.path().as_os_str().to_str().unwrap())
|
||||
.assert()
|
||||
.success();
|
||||
|
||||
// debug information to track down https://github.com/influxdata/influxdb_iox/issues/8203
|
||||
println!("***** Begin build-catalog STDOUT ****");
|
||||
std::io::stdout()
|
||||
.write_all(&cmd.get_output().stdout)
|
||||
.unwrap();
|
||||
println!("***** Begin build-catalog STDERR ****");
|
||||
std::io::stdout()
|
||||
.write_all(&cmd.get_output().stderr)
|
||||
.unwrap();
|
||||
println!("***** DONE ****");
|
||||
|
||||
cmd.stdout(
|
||||
predicate::str::contains("Beginning catalog / object_store build")
|
||||
.and(predicate::str::contains(
|
||||
"Begin importing files total_files=1",
|
||||
))
|
||||
.and(predicate::str::contains(
|
||||
"Completed importing files total_files=1",
|
||||
)),
|
||||
);
|
||||
|
||||
println!("Completed rebuild in {data_dir:?}");
|
||||
RecursiveDirPrinter::new().print(data_dir.path());
|
||||
|
||||
// now, start up a new server in all-in-one mode
|
||||
// using the newly built data directory
|
||||
let test_config = TestConfig::new_all_in_one_with_data_dir(data_dir.path());
|
||||
let all_in_one = ServerFixture::create(test_config).await;
|
||||
|
||||
Self {
|
||||
all_in_one,
|
||||
data_dir,
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs the SQL query against this server, in a loop until
|
||||
/// results are returned. Panics if the results are not produced
|
||||
/// within 5 seconds
|
||||
async fn run_sql_until_non_empty(&self, sql: &str, namespace: &str) -> Vec<RecordBatch> {
|
||||
let timeout = Duration::from_secs(5);
|
||||
let loop_sleep = Duration::from_millis(500);
|
||||
let fut = async {
|
||||
loop {
|
||||
let batches = self.run_sql(sql, namespace).await;
|
||||
if !batches.is_empty() {
|
||||
return batches;
|
||||
}
|
||||
tokio::time::sleep(loop_sleep).await;
|
||||
}
|
||||
};
|
||||
|
||||
fut.with_timeout(timeout)
|
||||
.await
|
||||
.expect("timed out waiting for non-empty batches in result")
|
||||
}
|
||||
}
|
||||
|
||||
/// Copies only parquet files from the source directory to a new
|
||||
/// temporary directory
|
||||
fn copy_only_parquet_files(src: &Path) -> TempDir {
|
||||
let target_dir = TempDir::new().expect("can't make temp dir");
|
||||
for entry in std::fs::read_dir(src).unwrap() {
|
||||
let entry = entry.unwrap();
|
||||
let src = entry.path();
|
||||
match src.extension() {
|
||||
Some(ext) if ext == "parquet" => {
|
||||
println!("Copying {ext:?} entry: {entry:?}");
|
||||
let dst = target_dir.path().join(src.file_name().unwrap());
|
||||
std::fs::copy(src, &dst).expect("error copying");
|
||||
}
|
||||
Some(ext) => {
|
||||
println!("Skipping {ext:?} entry: {entry:?}");
|
||||
}
|
||||
None => {
|
||||
println!("skipping no ext");
|
||||
}
|
||||
}
|
||||
}
|
||||
target_dir
|
||||
}
|
||||
|
||||
/// Prints out the contents of the directory recursively
|
||||
/// for debugging.
|
||||
///
|
||||
/// ```text
|
||||
/// RecursiveDirPrinter All files rooted at "/tmp/.tmpvf16r0"
|
||||
/// "/tmp/.tmpvf16r0"
|
||||
/// "/tmp/.tmpvf16r0/catalog.sqlite"
|
||||
/// "/tmp/.tmpvf16r0/object_store"
|
||||
/// "/tmp/.tmpvf16r0/object_store/1"
|
||||
/// "/tmp/.tmpvf16r0/object_store/1/1"
|
||||
/// "/tmp/.tmpvf16r0/object_store/1/1/b862a7e9b329ee6a418cde191198eaeb1512753f19b87a81def2ae6c3d0ed237"
|
||||
/// "/tmp/.tmpvf16r0/object_store/1/1/b862a7e9b329ee6a418cde191198eaeb1512753f19b87a81def2ae6c3d0ed237/d78abef6-6859-48eb-aa62-3518097fbb9b.parquet"
|
||||
/// ```
|
||||
struct RecursiveDirPrinter {
|
||||
paths: VecDeque<PathBuf>,
|
||||
}
|
||||
|
||||
impl RecursiveDirPrinter {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
paths: VecDeque::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// print the root and every file and directory beneath it
|
||||
fn print(mut self, root: &Path) {
|
||||
println!("RecursiveDirPrinter All files rooted at {root:?}");
|
||||
self.paths.push_back(PathBuf::from(root));
|
||||
|
||||
while let Some(path) = self.paths.pop_front() {
|
||||
println!("{path:?}");
|
||||
if path.is_dir() {
|
||||
for entry in std::fs::read_dir(path).unwrap() {
|
||||
self.paths.push_front(entry.unwrap().path());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -426,6 +426,33 @@ mod influxql {
|
|||
.await;
|
||||
}
|
||||
|
||||
/// Test TOP/BOTTOM functions, which use window functions to project
|
||||
/// the top or bottom rows in groups.
|
||||
#[tokio::test]
|
||||
async fn top_bottom() {
|
||||
test_helpers::maybe_start_logging();
|
||||
|
||||
TestCase {
|
||||
input: "cases/in/top_bottom.influxql",
|
||||
chunk_stage: ChunkStage::Ingester,
|
||||
}
|
||||
.run()
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Test PERCENTILE functions.
|
||||
#[tokio::test]
|
||||
async fn percentile() {
|
||||
test_helpers::maybe_start_logging();
|
||||
|
||||
TestCase {
|
||||
input: "cases/in/percentile.influxql",
|
||||
chunk_stage: ChunkStage::Ingester,
|
||||
}
|
||||
.run()
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn influxql_metadata() {
|
||||
test_helpers::maybe_start_logging();
|
||||
|
|
|
@ -109,7 +109,7 @@
|
|||
| physical_plan | ProjectionExec: expr=[date_bin(Utf8("1 month"),cpu.time,Utf8("1970-12-31T00:15:00Z"))@0 as month, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
|
||||
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("1 month"),cpu.time,Utf8("1970-12-31T00:15:00Z"))@0 as date_bin(Utf8("1 month"),cpu.time,Utf8("1970-12-31T00:15:00Z"))], aggr=[COUNT(cpu.user)] |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin(Utf8(\"1 month\"),cpu.time,Utf8(\"1970-12-31T00:15:00Z\"))", index: 0 }], 4), input_partitions=4 |
|
||||
| | RepartitionExec: partitioning=Hash([date_bin(Utf8("1 month"),cpu.time,Utf8("1970-12-31T00:15:00Z"))@0], 4), input_partitions=4 |
|
||||
| | AggregateExec: mode=Partial, gby=[date_bin(79228162514264337593543950336, time@0, 31450500000000000) as date_bin(Utf8("1 month"),cpu.time,Utf8("1970-12-31T00:15:00Z"))], aggr=[COUNT(cpu.user)] |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
| plan_type | plan |
|
||||
----------
|
||||
| logical_plan | Projection: date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time) AS minute, COUNT(cpu.user) |
|
||||
| | GapFill: groupBy=[[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)]], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
|
||||
| | GapFill: groupBy=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time), stride=IntervalMonthDayNano("600000000000"), range=Included(Literal(TimestampNanosecond(957528000000000000, None)))..Included(Literal(TimestampNanosecond(957531540000000000, None))) |
|
||||
| | Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("600000000000"), cpu.time) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)]], aggr=[[COUNT(cpu.user)]] |
|
||||
| | TableScan: cpu projection=[time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
|
||||
| physical_plan | ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@0 as minute, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
|
||||
|
@ -32,7 +32,7 @@
|
|||
| | SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@0 ASC] |
|
||||
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@0 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[COUNT(cpu.user)] |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time)", index: 0 }], 4), input_partitions=4 |
|
||||
| | RepartitionExec: partitioning=Hash([date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@0], 4), input_partitions=4 |
|
||||
| | AggregateExec: mode=Partial, gby=[date_bin(600000000000, time@0) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[COUNT(cpu.user)] |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
|
@ -116,7 +116,7 @@ Error during planning: gap-filling query is missing lower time bound
|
|||
| plan_type | plan |
|
||||
----------
|
||||
| logical_plan | Projection: cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time) AS minute, AVG(cpu.user) AS locf(AVG(cpu.user)) |
|
||||
| | GapFill: groupBy=[[cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)]], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
|
||||
| | GapFill: groupBy=[cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time), stride=IntervalMonthDayNano("600000000000"), range=Included(Literal(TimestampNanosecond(957528000000000000, None)))..Included(Literal(TimestampNanosecond(957531540000000000, None))) |
|
||||
| | Aggregate: groupBy=[[cpu.region, date_bin(IntervalMonthDayNano("600000000000"), cpu.time) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)]], aggr=[[AVG(cpu.user)]] |
|
||||
| | TableScan: cpu projection=[region, time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
|
||||
| physical_plan | ProjectionExec: expr=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@1 as minute, AVG(cpu.user)@2 as locf(AVG(cpu.user))] |
|
||||
|
@ -125,7 +125,7 @@ Error during planning: gap-filling query is missing lower time bound
|
|||
| | SortExec: expr=[region@0 ASC,date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@1 ASC] |
|
||||
| | AggregateExec: mode=FinalPartitioned, gby=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@1 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[AVG(cpu.user)], ordering_mode=PartiallyOrdered |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
| | RepartitionExec: partitioning=Hash([Column { name: "region", index: 0 }, Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time)", index: 1 }], 4), input_partitions=1 |
|
||||
| | RepartitionExec: partitioning=Hash([region@0, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)@1], 4), input_partitions=1 |
|
||||
| | AggregateExec: mode=Partial, gby=[region@0 as region, date_bin(600000000000, time@1) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time)], aggr=[AVG(cpu.user)], ordering_mode=PartiallyOrdered |
|
||||
| | CoalesceBatchesExec: target_batch_size=8192 |
|
||||
| | FilterExec: time@1 >= 957528000000000000 AND time@1 <= 957531540000000000 |
|
||||
|
|
|
@ -640,3 +640,15 @@ SELECT min, max, max - min FROM (SELECT MIN(usage_idle), MAX(usage_system) FROM
|
|||
|
||||
-- the predicate in the outer-most query is the narrowest, and therefore pushed through all the children
|
||||
SELECT * FROM (SELECT * FROM (SELECT FIRST(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z') WHERE time >= '2022-10-31T02:00:00Z') WHERE time >= '2022-10-31T02:00:10Z';
|
||||
|
||||
--
|
||||
-- division operator not performing cast
|
||||
--
|
||||
-- https://github.com/influxdata/influxdb_iox/issues/8168
|
||||
--
|
||||
|
||||
-- Raw query
|
||||
SELECT bytes_free / bytes_used AS result FROM disk;
|
||||
|
||||
-- Aggregate query
|
||||
SELECT SUM(bytes_free) / SUM(bytes_used) AS result FROM disk WHERE time >= '2022-10-31T02:00:00Z' AND time <= '2022-10-31T02:00:10Z' GROUP BY time(5s) FILL(null);
|
|
@ -372,7 +372,7 @@ name: m0
|
|||
+---------------------+-----+-------+---------------------+--------------------+--------------------+--------------------+-------------------+-----------+----------------------+-----------+------------------+-------------------+--------------------+-------------------+-------+-------+-------+-------+
|
||||
| time | i64 | abs | sin | cos | tan | acos | atan | atan2 | exp | log | ln | log2 | log10 | sqrt | pow | floor | ceil | round |
|
||||
+---------------------+-----+-------+---------------------+--------------------+--------------------+--------------------+-------------------+-----------+----------------------+-----------+------------------+-------------------+--------------------+-------------------+-------+-------+-------+-------+
|
||||
| 2022-10-31T02:00:00 | 101 | 101.0 | 0.45202578717835057 | 0.8920048697881602 | 0.5067526002248183 | 1.5707963267948966 | 1.560895660206908 | 1.5509969 | 7.307059979368067e43 | 2.2194037 | 4.61512051684126 | 6.658211482751795 | 2.0043213737826426 | 10.04987562112089 | 10201 | 101.0 | 101.0 | 101.0 |
|
||||
| 2022-10-31T02:00:00 | 101 | 101.0 | 0.45202578717835057 | 0.8920048697881602 | 0.5067526002248183 | 1.5608951749237256 | 1.560895660206908 | 1.5509969 | 7.307059979368067e43 | 2.2194037 | 4.61512051684126 | 6.658211482751795 | 2.0043213737826426 | 10.04987562112089 | 10201 | 101.0 | 101.0 | 101.0 |
|
||||
+---------------------+-----+-------+---------------------+--------------------+--------------------+--------------------+-------------------+-----------+----------------------+-----------+------------------+-------------------+--------------------+-------------------+-------+-------+-------+-------+
|
||||
-- InfluxQL: SELECT log(f64) FROM m0 LIMIT 1;
|
||||
Error while planning query: Error during planning: invalid number of arguments for log, expected 2, got 1
|
||||
|
@ -933,7 +933,7 @@ name: physical_plan
|
|||
ProjectionExec: expr=[m0 as iox::measurement, 0 as time, tag0@0 as tag0, COUNT(m0.f64)@1 as count, SUM(m0.f64)@2 as sum, STDDEV(m0.f64)@3 as stddev]
|
||||
AggregateExec: mode=FinalPartitioned, gby=[tag0@0 as tag0], aggr=[COUNT(m0.f64), SUM(m0.f64), STDDEV(m0.f64)]
|
||||
CoalesceBatchesExec: target_batch_size=8192
|
||||
RepartitionExec: partitioning=Hash([Column { name: "tag0", index: 0 }], 4), input_partitions=4
|
||||
RepartitionExec: partitioning=Hash([tag0@0], 4), input_partitions=4
|
||||
RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
|
||||
AggregateExec: mode=Partial, gby=[tag0@1 as tag0], aggr=[COUNT(m0.f64), SUM(m0.f64), STDDEV(m0.f64)]
|
||||
ParquetExec: file_groups={1 group: [[1/1/1/00000000-0000-0000-0000-000000000000.parquet]]}, projection=[f64, tag0]
|
||||
|
@ -942,7 +942,7 @@ name: physical_plan
|
|||
RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=4
|
||||
AggregateExec: mode=FinalPartitioned, gby=[tag0@0 as tag0], aggr=[COUNT(m1.f64), SUM(m1.f64), STDDEV(m1.f64)], ordering_mode=FullyOrdered
|
||||
CoalesceBatchesExec: target_batch_size=8192
|
||||
RepartitionExec: partitioning=Hash([Column { name: "tag0", index: 0 }], 4), input_partitions=1
|
||||
RepartitionExec: partitioning=Hash([tag0@0], 4), input_partitions=1
|
||||
AggregateExec: mode=Partial, gby=[tag0@1 as tag0], aggr=[COUNT(m1.f64), SUM(m1.f64), STDDEV(m1.f64)], ordering_mode=FullyOrdered
|
||||
ParquetExec: file_groups={1 group: [[1/1/1/00000000-0000-0000-0000-000000000001.parquet]]}, projection=[f64, tag0], output_ordering=[tag0@1 ASC]
|
||||
-- InfluxQL: SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1 GROUP BY tag0;
|
||||
|
@ -3064,4 +3064,25 @@ name: cpu
|
|||
| time | first |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:10 | 2.99 |
|
||||
+---------------------+-------+
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT bytes_free / bytes_used AS result FROM disk;
|
||||
name: disk
|
||||
+---------------------+-----------------------+
|
||||
| time | result |
|
||||
+---------------------+-----------------------+
|
||||
| 2022-10-31T02:00:00 | 0.005613224283335911 |
|
||||
| 2022-10-31T02:00:00 | 0.006984786047936768 |
|
||||
| 2022-10-31T02:00:00 | 0.007702971146013462 |
|
||||
| 2022-10-31T02:00:10 | 0.005636096491427584 |
|
||||
| 2022-10-31T02:00:10 | 0.007000528400759146 |
|
||||
| 2022-10-31T02:00:10 | 0.0077149723818756505 |
|
||||
+---------------------+-----------------------+
|
||||
-- InfluxQL: SELECT SUM(bytes_free) / SUM(bytes_used) AS result FROM disk WHERE time >= '2022-10-31T02:00:00Z' AND time <= '2022-10-31T02:00:10Z' GROUP BY time(5s) FILL(null);
|
||||
name: disk
|
||||
+---------------------+----------------------+
|
||||
| time | result |
|
||||
+---------------------+----------------------+
|
||||
| 2022-10-31T02:00:00 | 0.006984786047936768 |
|
||||
| 2022-10-31T02:00:05 | 0.0 |
|
||||
| 2022-10-31T02:00:10 | 0.007000528400759146 |
|
||||
+---------------------+----------------------+
|
|
@ -0,0 +1,19 @@
|
|||
-- Query tests for influxql percentile
|
||||
-- IOX_SETUP: percentile
|
||||
|
||||
--
|
||||
-- Selectors
|
||||
--
|
||||
SELECT PERCENTILE(usage_idle,50) FROM cpu;
|
||||
SELECT cpu,PERCENTILE(usage_idle,66.667),usage_system FROM cpu;
|
||||
SELECT PERCENTILE(usage_idle,33.333) FROM cpu GROUP BY cpu;
|
||||
SELECT PERCENTILE(usage_idle,90),usage_user FROM cpu WHERE cpu='3';
|
||||
-- 0th percentile doesn't return any rows.
|
||||
SELECT PERCENTILE(usage_idle,0) FROM cpu;
|
||||
|
||||
--
|
||||
-- Aggregators
|
||||
--
|
||||
SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu;
|
||||
SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu GROUP BY cpu;
|
||||
SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu WHERE time >= '1970-01-01 00:00:00' AND time < '1970-01-01 03:00:00' GROUP BY time(1h),cpu;
|
|
@ -0,0 +1,152 @@
|
|||
-- Test Setup: percentile
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_idle,50) FROM cpu;
|
||||
name: cpu
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T01:10:09 | 49.7047 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT cpu,PERCENTILE(usage_idle,66.667),usage_system FROM cpu;
|
||||
name: cpu
|
||||
+---------------------+-----+------------+--------------+
|
||||
| time | cpu | percentile | usage_system |
|
||||
+---------------------+-----+------------+--------------+
|
||||
| 1970-01-01T01:39:15 | 0 | 66.1469 | 99.8854 |
|
||||
+---------------------+-----+------------+--------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_idle,33.333) FROM cpu GROUP BY cpu;
|
||||
name: cpu
|
||||
tags: cpu=0
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T01:15:15 | 32.9757 |
|
||||
+---------------------+------------+
|
||||
name: cpu
|
||||
tags: cpu=1
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T02:13:36 | 32.3062 |
|
||||
+---------------------+------------+
|
||||
name: cpu
|
||||
tags: cpu=2
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T01:24:22 | 35.0742 |
|
||||
+---------------------+------------+
|
||||
name: cpu
|
||||
tags: cpu=3
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T01:16:58 | 32.0821 |
|
||||
+---------------------+------------+
|
||||
name: cpu
|
||||
tags: cpu=4
|
||||
+---------------------+------------+
|
||||
| time | percentile |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T00:42:34 | 32.9685 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_idle,90),usage_user FROM cpu WHERE cpu='3';
|
||||
name: cpu
|
||||
+---------------------+------------+------------+
|
||||
| time | percentile | usage_user |
|
||||
+---------------------+------------+------------+
|
||||
| 1970-01-01T00:19:23 | 89.7011 | 34.7815 |
|
||||
+---------------------+------------+------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_idle,0) FROM cpu;
|
||||
+------+------------+
|
||||
| time | percentile |
|
||||
+------+------------+
|
||||
+------+------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu;
|
||||
name: cpu
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 49.2732 | 89.754 | 99.0822 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu GROUP BY cpu;
|
||||
name: cpu
|
||||
tags: cpu=0
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 49.7946 | 90.0001 | 98.8816 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=1
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 50.148 | 89.4109 | 98.8158 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=2
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 49.0258 | 89.7425 | 99.2486 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=3
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 49.2054 | 89.9907 | 99.244 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=4
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 48.1551 | 89.1691 | 98.9134 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
-- InfluxQL: SELECT PERCENTILE(usage_system, 50), PERCENTILE(usage_system, 90), PERCENTILE(usage_system, 99) FROM cpu WHERE time >= '1970-01-01 00:00:00' AND time < '1970-01-01 03:00:00' GROUP BY time(1h),cpu;
|
||||
name: cpu
|
||||
tags: cpu=0
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 49.9884 | 89.7541 | 99.1413 |
|
||||
| 1970-01-01T01:00:00 | 47.7725 | 90.8035 | 98.8471 |
|
||||
| 1970-01-01T02:00:00 | 53.5363 | 90.0001 | 98.444 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=1
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 48.2785 | 88.3004 | 98.7959 |
|
||||
| 1970-01-01T01:00:00 | 51.1512 | 92.2132 | 98.9797 |
|
||||
| 1970-01-01T02:00:00 | 49.6265 | 87.8342 | 98.0481 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=2
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 50.0065 | 89.5125 | 99.109 |
|
||||
| 1970-01-01T01:00:00 | 47.9867 | 89.5532 | 99.4226 |
|
||||
| 1970-01-01T02:00:00 | 49.4459 | 90.439 | 99.2486 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=3
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 46.7256 | 90.7002 | 99.3269 |
|
||||
| 1970-01-01T01:00:00 | 50.7717 | 89.2459 | 98.9579 |
|
||||
| 1970-01-01T02:00:00 | 49.6766 | 89.555 | 98.9499 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
name: cpu
|
||||
tags: cpu=4
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| time | percentile | percentile_1 | percentile_2 |
|
||||
+---------------------+------------+--------------+--------------+
|
||||
| 1970-01-01T00:00:00 | 47.403 | 89.0086 | 98.9134 |
|
||||
| 1970-01-01T01:00:00 | 50.6295 | 89.1826 | 98.9091 |
|
||||
| 1970-01-01T02:00:00 | 46.1348 | 89.2463 | 98.7592 |
|
||||
+---------------------+------------+--------------+--------------+
|
|
@ -0,0 +1,26 @@
|
|||
-- IOX_SETUP: top_bottom
|
||||
|
||||
--
|
||||
-- top
|
||||
--
|
||||
SELECT top(writes, 2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
|
||||
SELECT top(usage_system,3) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 AND cpu = 'cpu0';
|
||||
SELECT top(usage_idle,5), cpu FROM cpu GROUP BY machine;
|
||||
SELECT top(writes,3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT top(writes,2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
SELECT top(usage_system,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT top(usage_system,machine,2),cpu FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT top(usage_system,machine,2),machine FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT top(usage_idle,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 GROUP BY TIME(60s);
|
||||
--
|
||||
-- bottom
|
||||
--
|
||||
SELECT bottom(reads, 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
|
||||
SELECT bottom(usage_system,3) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 AND cpu = 'cpu1';
|
||||
SELECT bottom(usage_idle,5), cpu FROM cpu GROUP BY machine;
|
||||
SELECT bottom(writes,3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT bottom(writes,2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
SELECT bottom(usage_system,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT bottom(usage_system,machine,2),cpu FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT bottom(usage_system,machine,2),machine FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
SELECT bottom(usage_idle,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 GROUP BY TIME(60s);
|
|
@ -0,0 +1,210 @@
|
|||
-- Test Setup: top_bottom
|
||||
-- InfluxQL: SELECT top(writes, 2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | top |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:03:20 | 5593589 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT top(usage_system,3) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 AND cpu = 'cpu0';
|
||||
name: cpu
|
||||
+---------------------+------+
|
||||
| time | top |
|
||||
+---------------------+------+
|
||||
| 1970-01-01T00:02:10 | 89.8 |
|
||||
| 1970-01-01T00:02:50 | 89.8 |
|
||||
| 1970-01-01T00:03:00 | 90.0 |
|
||||
+---------------------+------+
|
||||
-- InfluxQL: SELECT top(usage_idle,5), cpu FROM cpu GROUP BY machine;
|
||||
name: cpu
|
||||
tags: machine=machine1
|
||||
+---------------------+------+------+
|
||||
| time | top | cpu |
|
||||
+---------------------+------+------+
|
||||
| 1970-01-01T00:01:00 | 99.8 | cpu1 |
|
||||
| 1970-01-01T00:01:20 | 99.8 | cpu1 |
|
||||
| 1970-01-01T00:02:00 | 99.9 | cpu1 |
|
||||
| 1970-01-01T00:02:20 | 99.9 | cpu1 |
|
||||
| 1970-01-01T00:02:30 | 99.9 | cpu1 |
|
||||
+---------------------+------+------+
|
||||
name: cpu
|
||||
tags: machine=machine2
|
||||
+---------------------+------+------+
|
||||
| time | top | cpu |
|
||||
+---------------------+------+------+
|
||||
| 1970-01-01T00:01:00 | 89.8 | cpu1 |
|
||||
| 1970-01-01T00:01:20 | 89.8 | cpu1 |
|
||||
| 1970-01-01T00:02:00 | 89.9 | cpu1 |
|
||||
| 1970-01-01T00:02:20 | 89.9 | cpu1 |
|
||||
| 1970-01-01T00:02:30 | 89.9 | cpu1 |
|
||||
+---------------------+------+------+
|
||||
-- InfluxQL: SELECT top(writes,3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | top |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:02:10 | 5592646 |
|
||||
| 1970-01-01T00:02:20 | 5592810 |
|
||||
| 1970-01-01T00:02:30 | 5592997 |
|
||||
| 1970-01-01T00:02:40 | 5593109 |
|
||||
| 1970-01-01T00:02:50 | 5593219 |
|
||||
| 1970-01-01T00:03:00 | 5593438 |
|
||||
| 1970-01-01T00:03:10 | 5593513 |
|
||||
| 1970-01-01T00:03:20 | 5593589 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT top(writes,2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | top |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:02:10 | 5592646 |
|
||||
| 1970-01-01T00:02:20 | 5592810 |
|
||||
| 1970-01-01T00:02:40 | 5593109 |
|
||||
| 1970-01-01T00:02:50 | 5593219 |
|
||||
| 1970-01-01T00:03:10 | 5593513 |
|
||||
| 1970-01-01T00:03:20 | 5593589 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT top(usage_system,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+------+----------+------+
|
||||
| time | top | machine | cpu |
|
||||
+---------------------+------+----------+------+
|
||||
| 1970-01-01T00:02:00 | 99.9 | machine1 | cpu1 |
|
||||
| 1970-01-01T00:03:00 | 90.0 | machine1 | cpu0 |
|
||||
+---------------------+------+----------+------+
|
||||
-- InfluxQL: SELECT top(usage_system,machine,2),cpu FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+------+----------+------+
|
||||
| time | top | machine | cpu |
|
||||
+---------------------+------+----------+------+
|
||||
| 1970-01-01T00:02:00 | 99.9 | machine1 | cpu1 |
|
||||
| 1970-01-01T00:02:00 | 89.9 | machine2 | cpu1 |
|
||||
+---------------------+------+----------+------+
|
||||
-- InfluxQL: SELECT top(usage_system,machine,2),machine FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+------+----------+-----------+
|
||||
| time | top | machine | machine_1 |
|
||||
+---------------------+------+----------+-----------+
|
||||
| 1970-01-01T00:02:00 | 99.9 | machine1 | machine1 |
|
||||
| 1970-01-01T00:02:00 | 89.9 | machine2 | machine2 |
|
||||
+---------------------+------+----------+-----------+
|
||||
-- InfluxQL: SELECT top(usage_idle,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 GROUP BY TIME(60s);
|
||||
name: cpu
|
||||
+---------------------+------+----------+------+
|
||||
| time | top | machine | cpu |
|
||||
+---------------------+------+----------+------+
|
||||
| 1970-01-01T00:01:00 | 99.8 | machine1 | cpu1 |
|
||||
| 1970-01-01T00:01:00 | 89.8 | machine2 | cpu1 |
|
||||
| 1970-01-01T00:02:00 | 99.9 | machine1 | cpu1 |
|
||||
| 1970-01-01T00:02:30 | 90.4 | machine1 | cpu0 |
|
||||
| 1970-01-01T00:03:00 | 99.8 | machine1 | cpu1 |
|
||||
| 1970-01-01T00:03:00 | 90.0 | machine1 | cpu0 |
|
||||
+---------------------+------+----------+------+
|
||||
-- InfluxQL: SELECT bottom(reads, 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | bottom |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:02:10 | 2592646 |
|
||||
| 1970-01-01T00:02:30 | 2592997 |
|
||||
| 1970-01-01T00:02:50 | 2593219 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT bottom(usage_system,3) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 AND cpu = 'cpu1';
|
||||
name: cpu
|
||||
+---------------------+--------+
|
||||
| time | bottom |
|
||||
+---------------------+--------+
|
||||
| 1970-01-01T00:01:00 | 89.8 |
|
||||
| 1970-01-01T00:01:10 | 89.7 |
|
||||
| 1970-01-01T00:02:10 | 89.8 |
|
||||
+---------------------+--------+
|
||||
-- InfluxQL: SELECT bottom(usage_idle,5), cpu FROM cpu GROUP BY machine;
|
||||
name: cpu
|
||||
tags: machine=machine1
|
||||
+---------------------+--------+------+
|
||||
| time | bottom | cpu |
|
||||
+---------------------+--------+------+
|
||||
| 1970-01-01T00:01:10 | 88.6 | cpu0 |
|
||||
| 1970-01-01T00:01:20 | 88.6 | cpu0 |
|
||||
| 1970-01-01T00:01:30 | 83.4 | cpu0 |
|
||||
| 1970-01-01T00:01:40 | 87.7 | cpu0 |
|
||||
| 1970-01-01T00:02:00 | 86.9 | cpu0 |
|
||||
+---------------------+--------+------+
|
||||
name: cpu
|
||||
tags: machine=machine2
|
||||
+---------------------+--------+------+
|
||||
| time | bottom | cpu |
|
||||
+---------------------+--------+------+
|
||||
| 1970-01-01T00:01:10 | 78.6 | cpu0 |
|
||||
| 1970-01-01T00:01:20 | 78.6 | cpu0 |
|
||||
| 1970-01-01T00:01:30 | 73.4 | cpu0 |
|
||||
| 1970-01-01T00:01:40 | 77.7 | cpu0 |
|
||||
| 1970-01-01T00:02:00 | 76.9 | cpu0 |
|
||||
+---------------------+--------+------+
|
||||
-- InfluxQL: SELECT bottom(writes,3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | bottom |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:02:10 | 5592646 |
|
||||
| 1970-01-01T00:02:20 | 5592810 |
|
||||
| 1970-01-01T00:02:30 | 5592997 |
|
||||
| 1970-01-01T00:02:40 | 5593109 |
|
||||
| 1970-01-01T00:02:50 | 5593219 |
|
||||
| 1970-01-01T00:03:00 | 5593438 |
|
||||
| 1970-01-01T00:03:10 | 5593513 |
|
||||
| 1970-01-01T00:03:20 | 5593589 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT bottom(writes,2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
name: diskio
|
||||
+---------------------+---------+
|
||||
| time | bottom |
|
||||
+---------------------+---------+
|
||||
| 1970-01-01T00:02:10 | 5592646 |
|
||||
| 1970-01-01T00:02:20 | 5592810 |
|
||||
| 1970-01-01T00:02:30 | 5592997 |
|
||||
| 1970-01-01T00:02:40 | 5593109 |
|
||||
| 1970-01-01T00:03:00 | 5593438 |
|
||||
| 1970-01-01T00:03:10 | 5593513 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+---------+
|
||||
-- InfluxQL: SELECT bottom(usage_system,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+--------+----------+------+
|
||||
| time | bottom | machine | cpu |
|
||||
+---------------------+--------+----------+------+
|
||||
| 1970-01-01T00:01:30 | 73.4 | machine2 | cpu0 |
|
||||
| 1970-01-01T00:01:30 | 83.4 | machine1 | cpu0 |
|
||||
+---------------------+--------+----------+------+
|
||||
-- InfluxQL: SELECT bottom(usage_system,machine,2),cpu FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+--------+----------+------+
|
||||
| time | bottom | machine | cpu |
|
||||
+---------------------+--------+----------+------+
|
||||
| 1970-01-01T00:01:30 | 73.4 | machine2 | cpu0 |
|
||||
| 1970-01-01T00:01:30 | 83.4 | machine1 | cpu0 |
|
||||
+---------------------+--------+----------+------+
|
||||
-- InfluxQL: SELECT bottom(usage_system,machine,2),machine FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001;
|
||||
name: cpu
|
||||
+---------------------+--------+----------+-----------+
|
||||
| time | bottom | machine | machine_1 |
|
||||
+---------------------+--------+----------+-----------+
|
||||
| 1970-01-01T00:01:30 | 83.4 | machine1 | machine1 |
|
||||
| 1970-01-01T00:01:30 | 73.4 | machine2 | machine2 |
|
||||
+---------------------+--------+----------+-----------+
|
||||
-- InfluxQL: SELECT bottom(usage_idle,machine,cpu,2) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 GROUP BY TIME(60s);
|
||||
name: cpu
|
||||
+---------------------+--------+----------+------+
|
||||
| time | bottom | machine | cpu |
|
||||
+---------------------+--------+----------+------+
|
||||
| 1970-01-01T00:01:30 | 73.4 | machine2 | cpu0 |
|
||||
| 1970-01-01T00:01:30 | 83.4 | machine1 | cpu0 |
|
||||
| 1970-01-01T00:02:00 | 76.9 | machine2 | cpu0 |
|
||||
| 1970-01-01T00:02:00 | 86.9 | machine1 | cpu0 |
|
||||
| 1970-01-01T00:03:10 | 78.8 | machine2 | cpu0 |
|
||||
| 1970-01-01T00:03:10 | 88.8 | machine1 | cpu0 |
|
||||
+---------------------+--------+----------+------+
|
File diff suppressed because it is too large
|
@ -0,0 +1,80 @@
|
|||
# Load into influxdb 1.8:
|
||||
#
|
||||
# curl localhost:8086/write\?db=top_bottom --data-binary "@influxdb_iox/tests/query_tests/data/top_bottom.lp"
|
||||
#
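# As an illustrative cross-check (assuming the 1.x `influx` CLI is available
# and the data has been loaded with the curl command above), an InfluxQL test
# query can be replayed against that database, e.g.:
#
#   influx -database top_bottom -execute 'SELECT top(writes, 2) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001'
#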
|
||||
# Float data, regular intervals, usage_system has gaps
|
||||
#
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=89.5,usage_system=89.5 0000000060000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=79.5,usage_system=79.5 0000000060000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8,usage_system=99.8 0000000060000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8,usage_system=89.8 0000000060000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=88.6,usage_system=88.6 0000000070000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=78.6,usage_system=78.6 0000000070000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.7,usage_system=99.7 0000000070000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.7,usage_system=89.7 0000000070000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=88.6 0000000080000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=78.6 0000000080000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8 0000000080000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8 0000000080000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.7 0000000090000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.7 0000000090000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=83.4,usage_system=83.4 0000000090000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=73.4,usage_system=73.4 0000000090000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.7 0000000100000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.7 0000000100000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=87.7,usage_system=87.7 0000000100000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=77.7,usage_system=77.7 0000000100000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=88.7 0000000110000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=78.7 0000000110000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.3 0000000110000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.3 0000000110000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=86.9 0000000120000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=76.9 0000000120000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.9,usage_system=99.9 0000000120000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.9,usage_system=89.9 0000000120000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=89.8,usage_system=89.8 0000000130000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=79.8,usage_system=79.8 0000000130000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8,usage_system=99.8 0000000130000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8,usage_system=89.8 0000000130000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=89.0 0000000140000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=79.0 0000000140000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.9,usage_system=99.9 0000000140000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.9,usage_system=89.9 0000000140000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=90.4 0000000150000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=80.4 0000000150000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.9 0000000150000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.9 0000000150000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=90.2 0000000160000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=80.2 0000000160000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8 0000000160000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8 0000000160000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8 0000000170000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8 0000000170000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=89.8,usage_system=89.8 0000000170000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=79.8,usage_system=79.8 0000000170000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=90.0,usage_system=90.0 0000000180000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=80.0,usage_system=80.0 0000000180000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8,usage_system=99.8 0000000180000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8,usage_system=89.8 0000000180000000000
|
||||
cpu,cpu=cpu0,machine=machine1 usage_idle=88.8 0000000190000000000
|
||||
cpu,cpu=cpu0,machine=machine2 usage_idle=78.8 0000000190000000000
|
||||
cpu,cpu=cpu1,machine=machine1 usage_idle=99.8,usage_system=99.8 0000000190000000000
|
||||
cpu,cpu=cpu1,machine=machine2 usage_idle=89.8,usage_system=89.8 0000000190000000000
|
||||
|
||||
# integers at regular intervals
|
||||
diskio,name=disk0 reads=2591520i,writes=5591520i 0000000060000000000
|
||||
diskio,name=disk0 writes=5591620i 0000000070000000000
|
||||
diskio,name=disk0 writes=5591729i 0000000080000000000
|
||||
diskio,name=disk0 writes=5592114i 0000000090000000000
|
||||
diskio,name=disk0 writes=5592210i 0000000100000000000
|
||||
diskio,name=disk0 reads=2592366i,writes=5592366i 0000000110000000000
|
||||
diskio,name=disk0 reads=2592576i,writes=5592576i 0000000120000000000
|
||||
diskio,name=disk0 reads=2592646i,writes=5592646i 0000000130000000000
|
||||
diskio,name=disk0 writes=5592810i 0000000140000000000
|
||||
diskio,name=disk0 reads=2592997i,writes=5592997i 0000000150000000000
|
||||
diskio,name=disk0 writes=5593109i 0000000160000000000
|
||||
diskio,name=disk0 reads=2593219i,writes=5593219i 0000000170000000000
|
||||
diskio,name=disk0 reads=2593438i,writes=5593438i 0000000180000000000
|
||||
diskio,name=disk0 writes=5593513i 0000000190000000000
|
||||
diskio,name=disk0 reads=2593589i,writes=5593589i 0000000200000000000
|
||||
diskio,name=disk0 reads=2593735i,writes=5593735i 0000000210000000000
|
|
@ -1378,6 +1378,34 @@ pub static SETUPS: Lazy<HashMap<SetupName, SetupSteps>> = Lazy::new(|| {
|
|||
},
|
||||
],
|
||||
),
|
||||
(
|
||||
// Used for top/bottom function tests for InfluxQL
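// (the InfluxQL test cases opt into this setup via their
// `-- IOX_SETUP: top_bottom` header)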
|
||||
"top_bottom",
|
||||
vec![
|
||||
Step::RecordNumParquetFiles,
|
||||
Step::WriteLineProtocol(
|
||||
include_str!("data/top_bottom.lp").to_string()
|
||||
),
|
||||
Step::Persist,
|
||||
Step::WaitForPersisted {
|
||||
expected_increase: 1,
|
||||
},
|
||||
],
|
||||
),
|
||||
(
|
||||
// Used for percentile function tests for InfluxQL
|
||||
"percentile",
|
||||
vec![
|
||||
Step::RecordNumParquetFiles,
|
||||
Step::WriteLineProtocol(
|
||||
include_str!("data/percentile.lp").to_string()
|
||||
),
|
||||
Step::Persist,
|
||||
Step::WaitForPersisted {
|
||||
expected_increase: 1,
|
||||
},
|
||||
],
|
||||
),
|
||||
(
|
||||
"DuplicateDifferentDomains",
|
||||
(0..2)
|
||||
|
|
|
@ -24,10 +24,10 @@ prost = "0.11"
|
|||
rand = "0.8.3"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
|
||||
schema = { path = "../schema" }
|
||||
serde_json = "1.0.100"
|
||||
serde_json = "1.0.102"
|
||||
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread"] }
|
||||
tokio-stream = "0.1.13"
|
||||
thiserror = "1.0.41"
|
||||
thiserror = "1.0.43"
|
||||
tonic = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
@ -20,7 +20,7 @@ repository = "https://github.com/influxdata/influxdb_iox/tree/main/influxdb_line
|
|||
bytes = "1.4"
|
||||
log = "0.4.19"
|
||||
nom = { version = "7", default-features = false, features = ["std"] }
|
||||
smallvec = { version = "1.10.0", features = ["union"] }
|
||||
smallvec = { version = "1.11.0", features = ["union"] }
|
||||
snafu = "0.7"
|
||||
|
||||
[dev-dependencies] # In alphabetical order
|
||||
|
|
|
@ -7,7 +7,7 @@ license.workspace = true
|
|||
|
||||
[dependencies]
|
||||
sqlparser = "0.35.0"
|
||||
snafu = "0.7.4"
|
||||
snafu = "0.7.5"
|
||||
|
||||
generated_types = { path = "../generated_types" }
|
||||
workspace-hack = { version = "0.1", path = "../workspace-hack" }
|
||||
|
|
|
@ -9,8 +9,8 @@ license.workspace = true
|
|||
arrow = { workspace = true, features = ["prettyprint"] }
|
||||
arrow_util = { version = "0.1.0", path = "../arrow_util" }
|
||||
arrow-flight = { workspace = true }
|
||||
async-channel = "1.8.0"
|
||||
async-trait = "0.1.70"
|
||||
async-channel = "1.9.0"
|
||||
async-trait = "0.1.71"
|
||||
backoff = { version = "0.1.0", path = "../backoff" }
|
||||
bytes = "1.4.0"
|
||||
crossbeam-utils = "0.8.16"
|
||||
|
@ -39,7 +39,8 @@ schema = { version = "0.1.0", path = "../schema" }
|
|||
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
|
||||
sharder = { version = "0.1.0", path = "../sharder" }
|
||||
test_helpers = { path = "../test_helpers", features = ["future_timeout"], optional = true }
|
||||
thiserror = "1.0.41"
|
||||
thiserror = "1.0.43"
|
||||
tracker = { path = "../tracker" }
|
||||
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
|
||||
tokio-util = "0.7.8"
|
||||
tonic = { workspace = true }
|
||||
|
@ -81,3 +82,7 @@ name = "write"
|
|||
harness = false
|
||||
# Require some internal types be made visible for benchmark code.
|
||||
required-features = ["benches"]
|
||||
|
||||
[[bench]]
|
||||
name = "query"
|
||||
harness = false
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use data_types::{NamespaceId, PartitionKey, TableId};
|
||||
use ingester::IngesterRpcInterface;
|
||||
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
|
||||
use ingester_test_ctx::{TestContext, TestContextBuilder};
|
||||
use std::{fmt::Write, sync::Arc, time::Instant};
|
||||
use tokio::sync::Barrier;
|
||||
|
||||
const TEST_NAMESPACE: &str = "bananas";
|
||||
const PARTITION_KEY: &str = "platanos";
|
||||
|
||||
fn generate_table_data(rows: usize, cols: usize) -> String {
|
||||
let mut buf = String::new();
|
||||
for i in 0..rows {
|
||||
write!(&mut buf, "bananas ").unwrap();
|
||||
for j in 0..(cols - 1) {
|
||||
write!(&mut buf, "v{j}={i}{j},").unwrap();
|
||||
}
|
||||
writeln!(&mut buf, "v{cols}={i}{cols} 42").unwrap(); // One timestamp -> one partition
|
||||
}
|
||||
|
||||
buf
|
||||
}
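// For illustration only: `generate_table_data(2, 3)` yields line protocol
// roughly like
//
//   bananas v0=00,v1=01,v3=03 42
//   bananas v0=10,v1=11,v3=13 42
//
// i.e. every row shares the timestamp 42 and therefore lands in a single
// partition, matching the comment above.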
|
||||
|
||||
/// Return an initialised and pre-warmed ingester instance backed by a catalog
|
||||
/// correctly populated to accept writes of `lp`.
|
||||
async fn init(
|
||||
lp: impl AsRef<str>,
|
||||
) -> (TestContext<impl IngesterRpcInterface>, NamespaceId, TableId) {
|
||||
let lp = lp.as_ref();
|
||||
|
||||
let mut ctx = TestContextBuilder::default()
|
||||
// Don't stop ingest during benchmarks
|
||||
.with_max_persist_queue_depth(10_000_000)
|
||||
.with_persist_hot_partition_cost(10_000_000_000)
|
||||
.build()
|
||||
.await;
|
||||
|
||||
// Ensure the namespace exists in the catalog.
|
||||
let ns = ctx.ensure_namespace(TEST_NAMESPACE, None).await;
|
||||
|
||||
// Write the test data
|
||||
ctx.write_lp(TEST_NAMESPACE, lp, PartitionKey::from(PARTITION_KEY), 42)
|
||||
.await;
|
||||
|
||||
let table_id = ctx.table_id(TEST_NAMESPACE, "bananas").await;
|
||||
|
||||
(ctx, ns.id, table_id)
|
||||
}
|
||||
|
||||
fn bench_query(c: &mut Criterion) {
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to initialise tokio runtime for benchmark");
|
||||
|
||||
for (rows, cols) in [(100_000, 10), (100_000, 100), (100_000, 200)] {
|
||||
run_projection_bench("no projection", rows, cols, vec![], &runtime, c);
|
||||
run_projection_bench(
|
||||
"project 1 column",
|
||||
rows,
|
||||
cols,
|
||||
vec!["time".to_string()],
|
||||
&runtime,
|
||||
c,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn run_projection_bench(
|
||||
name: &str,
|
||||
rows: usize,
|
||||
cols: usize,
|
||||
projection: Vec<String>,
|
||||
runtime: &tokio::runtime::Runtime,
|
||||
c: &mut Criterion,
|
||||
) {
|
||||
let lp = generate_table_data(rows, cols);
|
||||
let (ctx, namespace_id, table_id) = runtime.block_on(init(lp));
|
||||
|
||||
let mut group = c.benchmark_group("projection");
|
||||
group.throughput(Throughput::Elements(1)); // Queries per second
|
||||
group.bench_function(
|
||||
BenchmarkId::new(name, format!("rows_{rows}_cols{cols}")),
|
||||
|b| {
|
||||
let ctx = &ctx;
|
||||
let projection = &projection;
|
||||
b.to_async(runtime).iter(|| async move {
|
||||
ctx.query(IngesterQueryRequest {
|
||||
namespace_id: namespace_id.get(),
|
||||
table_id: table_id.get(),
|
||||
columns: projection.clone(),
|
||||
predicate: None,
|
||||
})
|
||||
.await
|
||||
.expect("query request failed");
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/// Number of queries to send per reader, per iteration.
|
||||
const CONCURRENT_QUERY_BATCH_SIZE: usize = 20;
|
||||
|
||||
// Benchmark scalability of the read path as more readers are added when
|
||||
// querying partitions with varying amounts of data.
|
||||
//
|
||||
// The ingester "process" is running in the same threadpool as the benchmark
|
||||
// loop, so this isn't super clean.
|
||||
fn bench_query_concurrent(c: &mut Criterion) {
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to initialise tokio runtime for benchmark");
|
||||
|
||||
for readers in [1, 10, 100] {
|
||||
for buf_size_lines in [1000, 50_000] {
|
||||
run_concurrent_bench(readers, buf_size_lines, &runtime, c);
|
||||
}
|
||||
}
|
||||
|
||||
run_concurrent_bench(1, 100_000, &runtime, c);
|
||||
run_concurrent_bench(10, 100_000, &runtime, c);
|
||||
}
|
||||
|
||||
async fn do_queries(ctx: &TestContext<impl IngesterRpcInterface>, query: &IngesterQueryRequest) {
|
||||
for _ in 0..CONCURRENT_QUERY_BATCH_SIZE {
|
||||
ctx.query(query.clone())
|
||||
.await
|
||||
.expect("query request failed");
|
||||
}
|
||||
}
|
||||
|
||||
fn run_concurrent_bench(
|
||||
concurrent_readers: usize,
|
||||
buf_size_lines: usize,
|
||||
runtime: &tokio::runtime::Runtime,
|
||||
c: &mut Criterion,
|
||||
) {
|
||||
const COLUMN_COUNT: usize = 10;
|
||||
|
||||
let lp = generate_table_data(buf_size_lines, COLUMN_COUNT);
|
||||
let (ctx, namespace_id, table_id) = runtime.block_on(init(lp));
|
||||
|
||||
let query = Arc::new(IngesterQueryRequest {
|
||||
namespace_id: namespace_id.get(),
|
||||
table_id: table_id.get(),
|
||||
columns: vec![],
|
||||
predicate: None,
|
||||
});
|
||||
|
||||
let ctx = Arc::new(ctx);
|
||||
|
||||
let mut group = c.benchmark_group("concurrent_query");
|
||||
group.throughput(Throughput::Elements(CONCURRENT_QUERY_BATCH_SIZE as _)); // Queries per second
|
||||
group.bench_function(
|
||||
format!("readers_{concurrent_readers}/buffered_{buf_size_lines}x{COLUMN_COUNT}"),
|
||||
|b| {
|
||||
b.to_async(runtime).iter_custom(|iters| {
|
||||
let query = Arc::clone(&query);
|
||||
let ctx = Arc::clone(&ctx);
|
||||
async move {
|
||||
// Sync point to ensure all readers start at approximately the same
|
||||
// time.
|
||||
let barrier = Arc::new(Barrier::new(concurrent_readers));
|
||||
|
||||
// Spawn N-1 readers that generate the concurrent workload but are
|
||||
// not measured.
|
||||
for _ in 0..(concurrent_readers - 1) {
|
||||
let barrier = Arc::clone(&barrier);
|
||||
let query = Arc::clone(&query);
|
||||
let ctx = Arc::clone(&ctx);
|
||||
tokio::spawn(async move {
|
||||
barrier.wait().await;
|
||||
for _ in 0..iters {
|
||||
do_queries(&ctx, &query).await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// And measure the last reader.
|
||||
barrier.wait().await;
|
||||
let start = Instant::now();
|
||||
for _ in 0..iters {
|
||||
do_queries(&ctx, &query).await;
|
||||
}
|
||||
start.elapsed()
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_query, bench_query_concurrent);
|
||||
criterion_main!(benches);
|
|
@ -20,7 +20,10 @@ use crate::{
|
|||
deferred_load::DeferredLoad,
|
||||
dml_payload::IngestOp,
|
||||
dml_sink::DmlSink,
|
||||
query::{response::QueryResponse, tracing::QueryExecTracing, QueryError, QueryExec},
|
||||
query::{
|
||||
projection::OwnedProjection, response::QueryResponse, tracing::QueryExecTracing,
|
||||
QueryError, QueryExec,
|
||||
},
|
||||
};
|
||||
|
||||
/// The string name / identifier of a Namespace.
|
||||
|
@ -189,7 +192,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -207,7 +210,7 @@ where
|
|||
// a tracing delegate to emit a child span.
|
||||
Ok(QueryResponse::new(
|
||||
QueryExecTracing::new(inner, "table")
|
||||
.query_exec(namespace_id, table_id, columns, span, predicate)
|
||||
.query_exec(namespace_id, table_id, projection, span, predicate)
|
||||
.await?,
|
||||
))
|
||||
}
|
||||
|
|
|
@ -15,7 +15,9 @@ use self::{
|
|||
persisting::{BatchIdent, PersistingData},
|
||||
};
|
||||
use super::{namespace::NamespaceName, table::TableMetadata};
|
||||
use crate::{deferred_load::DeferredLoad, query_adaptor::QueryAdaptor};
|
||||
use crate::{
|
||||
deferred_load::DeferredLoad, query::projection::OwnedProjection, query_adaptor::QueryAdaptor,
|
||||
};
|
||||
|
||||
mod buffer;
|
||||
pub(crate) mod persisting;
|
||||
|
@ -156,9 +158,9 @@ impl PartitionData {
|
|||
|
||||
/// Return all data for this partition, ordered by the calls to
|
||||
/// [`PartitionData::buffer_write()`].
|
||||
pub(crate) fn get_query_data(&mut self) -> Option<QueryAdaptor> {
|
||||
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Option<QueryAdaptor> {
|
||||
// Extract the buffered data, if any.
|
||||
let buffered_data = self.buffer.get_query_data();
|
||||
let buffered_data = self.buffer.get_query_data(projection);
|
||||
|
||||
// Prepend any currently persisting batches.
|
||||
//
|
||||
|
@ -168,7 +170,7 @@ impl PartitionData {
|
|||
let data = self
|
||||
.persisting
|
||||
.iter()
|
||||
.flat_map(|(_, b)| b.get_query_data())
|
||||
.flat_map(|(_, b)| b.get_query_data(projection))
|
||||
.chain(buffered_data)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
|
@ -230,7 +232,10 @@ impl PartitionData {
|
|||
|
||||
// Wrap the persisting data in the type wrapper
|
||||
let data = PersistingData::new(
|
||||
QueryAdaptor::new(self.partition_id, fsm.get_query_data()),
|
||||
QueryAdaptor::new(
|
||||
self.partition_id,
|
||||
fsm.get_query_data(&OwnedProjection::default()),
|
||||
),
|
||||
batch_ident,
|
||||
);
|
||||
|
||||
|
@ -349,7 +354,7 @@ impl PartitionData {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{ops::Deref, time::Duration};
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow::compute::SortOptions;
|
||||
use arrow_util::assert_batches_eq;
|
||||
|
@ -378,7 +383,7 @@ mod tests {
|
|||
let mut p = PartitionDataBuilder::new().build();
|
||||
|
||||
// And no data should be returned when queried.
|
||||
assert!(p.get_query_data().is_none());
|
||||
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
|
||||
|
||||
// Perform a single write.
|
||||
let mb = lp_to_mutable_batch(r#"bananas,city=London people=2,pigeons="millions" 10"#).1;
|
||||
|
@ -387,7 +392,9 @@ mod tests {
|
|||
|
||||
// The data should be readable.
|
||||
{
|
||||
let data = p.get_query_data().expect("should return data");
|
||||
let data = p
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.expect("should return data");
|
||||
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
|
||||
|
||||
let expected = [
|
||||
|
@ -397,15 +404,7 @@ mod tests {
|
|||
"| London | 2.0 | millions | 1970-01-01T00:00:00.000000010Z |",
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(
|
||||
expected,
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_batches_eq!(expected, data.record_batches());
|
||||
}
|
||||
|
||||
// Perform another write, adding data to the existing queryable data
|
||||
|
@ -416,7 +415,9 @@ mod tests {
|
|||
|
||||
// And finally both writes should be readable.
|
||||
{
|
||||
let data = p.get_query_data().expect("should contain data");
|
||||
let data = p
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.expect("should contain data");
|
||||
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
|
||||
|
||||
let expected = [
|
||||
|
@ -427,15 +428,7 @@ mod tests {
|
|||
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(
|
||||
expected,
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_batches_eq!(expected, data.record_batches());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -445,7 +438,7 @@ mod tests {
|
|||
async fn test_persist() {
|
||||
let mut p = PartitionDataBuilder::new().build();
|
||||
|
||||
assert!(p.get_query_data().is_none());
|
||||
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
|
||||
|
||||
// Perform a single write.
|
||||
let mb = lp_to_mutable_batch(r#"bananas,city=London people=2,pigeons="millions" 10"#).1;
|
||||
|
@ -468,15 +461,7 @@ mod tests {
|
|||
"| London | 2.0 | millions | 1970-01-01T00:00:00.000000010Z |",
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(
|
||||
expected,
|
||||
&*persisting_data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_batches_eq!(expected, persisting_data.record_batches());
|
||||
|
||||
// Ensure the started batch ident is increased after a persist call, but not the completed
|
||||
// batch ident.
|
||||
|
@ -492,7 +477,9 @@ mod tests {
|
|||
|
||||
// Which must be readable, alongside the ongoing persist data.
|
||||
{
|
||||
let data = p.get_query_data().expect("must have data");
|
||||
let data = p
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.expect("must have data");
|
||||
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
|
||||
assert_eq!(data.record_batches().len(), 2);
|
||||
let expected = [
|
||||
|
@ -503,15 +490,7 @@ mod tests {
|
|||
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(
|
||||
expected,
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_batches_eq!(expected, data.record_batches());
|
||||
}
|
||||
|
||||
// The persist now "completes".
|
||||
|
@ -526,7 +505,9 @@ mod tests {
|
|||
|
||||
// Querying the buffer should now return only the second write.
|
||||
{
|
||||
let data = p.get_query_data().expect("must have data");
|
||||
let data = p
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.expect("must have data");
|
||||
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
|
||||
assert_eq!(data.record_batches().len(), 1);
|
||||
let expected = [
|
||||
|
@ -536,15 +517,7 @@ mod tests {
|
|||
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
|
||||
"+--------+--------+---------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(
|
||||
expected,
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_batches_eq!(expected, data.record_batches());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -557,12 +530,7 @@ mod tests {
|
|||
// A helper function to dedupe the record batches in [`QueryAdaptor`]
|
||||
// and assert the resulting batch contents.
|
||||
async fn assert_deduped(expect: &[&str], batch: QueryAdaptor) {
|
||||
let batch = batch
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
let batch = batch.record_batches().to_vec();
|
||||
|
||||
let sort_keys = vec![PhysicalSortExpr {
|
||||
expr: col("time", &batch[0].schema()).unwrap(),
|
||||
|
@ -596,7 +564,13 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(1))
|
||||
.expect("write should succeed");
|
||||
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
1
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -605,7 +579,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 1.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -614,7 +588,13 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(2))
|
||||
.expect("write should succeed");
|
||||
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
1
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -623,7 +603,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 2.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -656,7 +636,13 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(3))
|
||||
.expect("write should succeed");
|
||||
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 2);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
2
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -665,7 +651,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 3.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -697,7 +683,13 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(3))
|
||||
.expect("write should succeed");
|
||||
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 3);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
3
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -706,7 +698,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -717,7 +709,13 @@ mod tests {
|
|||
assert!(set.contains(SequenceNumber::new(2)));
|
||||
|
||||
// And assert the correct value remains.
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 2);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
2
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -726,7 +724,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -736,7 +734,13 @@ mod tests {
|
|||
assert!(set.contains(SequenceNumber::new(3)));
|
||||
|
||||
// And assert the correct value remains.
|
||||
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
|
||||
assert_eq!(
|
||||
p.get_query_data(&OwnedProjection::default())
|
||||
.unwrap()
|
||||
.record_batches()
|
||||
.len(),
|
||||
1
|
||||
);
|
||||
assert_deduped(
|
||||
&[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -745,7 +749,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
p.get_query_data().unwrap(),
|
||||
p.get_query_data(&OwnedProjection::default()).unwrap(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -777,7 +781,7 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(3))
|
||||
.expect("write should succeed");
|
||||
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -787,12 +791,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 2.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
|
||||
// Persist again, moving the last write to the persisting state and
|
||||
|
@ -805,7 +804,7 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(4))
|
||||
.expect("write should succeed");
|
||||
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -816,12 +815,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 3.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
|
||||
// Persist again, moving the last write to the persisting state and
|
||||
|
@ -834,7 +828,7 @@ mod tests {
|
|||
p.buffer_write(mb, SequenceNumber::new(5))
|
||||
.expect("write should succeed");
|
||||
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -846,12 +840,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
|
||||
// Finish persisting the second batch out-of-order! The middle entry,
|
||||
|
@ -860,7 +849,7 @@ mod tests {
|
|||
assert_eq!(set.len(), 1);
|
||||
assert!(set.contains(SequenceNumber::new(3)));
|
||||
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -871,12 +860,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
|
||||
// Finish persisting the last batch.
|
||||
|
@ -884,7 +868,7 @@ mod tests {
|
|||
assert_eq!(set.len(), 1);
|
||||
assert!(set.contains(SequenceNumber::new(4)));
|
||||
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -894,12 +878,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
|
||||
// Finish persisting the first batch.
|
||||
|
@ -908,7 +887,7 @@ mod tests {
|
|||
assert!(set.contains(SequenceNumber::new(1)));
|
||||
|
||||
// Assert only the buffered data remains
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------------------------------+-----+",
|
||||
|
@ -917,12 +896,7 @@ mod tests {
|
|||
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
|
||||
"+--------------------------------+-----+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -954,27 +928,26 @@ mod tests {
|
|||
// Populate the catalog with the namespace / table
|
||||
let (_ns_id, table_id) = populate_catalog(&*catalog, "bananas", "platanos").await;
|
||||
|
||||
let partition_id = catalog
|
||||
let partition = catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.create_or_get("test".into(), table_id)
|
||||
.await
|
||||
.expect("should create")
|
||||
.id;
|
||||
.expect("should create");
|
||||
|
||||
catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.cas_sort_key(partition_id, None, &["terrific"])
|
||||
.cas_sort_key(&partition.transition_partition_id(), None, &["terrific"])
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Read the just-created sort key (None)
|
||||
let fetcher = Arc::new(DeferredLoad::new(
|
||||
Duration::from_nanos(1),
|
||||
SortKeyResolver::new(partition_id, Arc::clone(&catalog), backoff_config.clone())
|
||||
SortKeyResolver::new(partition.id, Arc::clone(&catalog), backoff_config.clone())
|
||||
.fetch(),
|
||||
&metrics,
|
||||
));
|
||||
|
@ -1009,7 +982,7 @@ mod tests {
|
|||
);
|
||||
|
||||
// Nothing should explode, data should be readable.
|
||||
let data = p.get_query_data().unwrap();
|
||||
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
|
||||
assert_batches_eq!(
|
||||
[
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
|
@ -1019,12 +992,7 @@ mod tests {
|
|||
"| Madrid | 2.0 | none | 1970-01-01T00:00:00.000000011Z |",
|
||||
"+--------+--------+----------+--------------------------------+",
|
||||
],
|
||||
&*data
|
||||
.record_batches()
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
&*data.record_batches().to_vec()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1053,6 +1021,6 @@ mod tests {
|
|||
async fn test_empty_partition_no_queryadaptor_panic() {
|
||||
let mut p = PartitionDataBuilder::new().build();
|
||||
|
||||
assert!(p.get_query_data().is_none());
|
||||
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use data_types::SequenceNumber;
|
||||
use mutable_batch::MutableBatch;
|
||||
|
@ -11,6 +9,8 @@ pub(crate) mod traits;
|
|||
|
||||
pub(crate) use state_machine::*;
|
||||
|
||||
use crate::query::projection::OwnedProjection;
|
||||
|
||||
use self::{always_some::AlwaysSome, traits::Queryable};
|
||||
|
||||
/// The current state of the [`BufferState`] state machine.
|
||||
|
@ -63,12 +63,12 @@ impl DataBuffer {
|
|||
|
||||
/// Return all data for this buffer, ordered by the [`SequenceNumber`] from
|
||||
/// which it was buffered.
|
||||
pub(crate) fn get_query_data(&mut self) -> Vec<Arc<RecordBatch>> {
|
||||
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Vec<RecordBatch> {
|
||||
// Take ownership of the FSM and return the data within it.
|
||||
self.0.mutate(|fsm| match fsm {
|
||||
// The buffering state can return data.
|
||||
FsmState::Buffering(b) => {
|
||||
let ret = b.get_query_data();
|
||||
let ret = b.get_query_data(projection);
|
||||
(FsmState::Buffering(b), ret)
|
||||
}
|
||||
})
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use mutable_batch::MutableBatch;
|
||||
use schema::Projection;
|
||||
|
@ -39,12 +37,12 @@ impl Buffer {
|
|||
/// # Panics
|
||||
///
|
||||
/// If generating the snapshot fails, this method panics.
|
||||
pub(super) fn snapshot(self) -> Option<Arc<RecordBatch>> {
|
||||
Some(Arc::new(
|
||||
pub(super) fn snapshot(self) -> Option<RecordBatch> {
|
||||
Some(
|
||||
self.buffer?
|
||||
.to_arrow(Projection::All)
|
||||
.expect("failed to snapshot buffer data"),
|
||||
))
|
||||
)
|
||||
}
|
||||
|
||||
pub(super) fn is_empty(&self) -> bool {
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
#![allow(dead_code)]
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use data_types::{sequence_number_set::SequenceNumberSet, SequenceNumber};
|
||||
use mutable_batch::MutableBatch;
|
||||
|
@ -12,6 +10,8 @@ mod snapshot;
|
|||
pub(in crate::buffer_tree::partition::buffer) use buffering::*;
|
||||
pub(crate) use persisting::*;
|
||||
|
||||
use crate::query::projection::OwnedProjection;
|
||||
|
||||
use super::traits::{Queryable, Writeable};
|
||||
|
||||
/// A result type for fallible transitions.
|
||||
|
@ -122,14 +122,14 @@ where
|
|||
/// Returns the current buffer data.
|
||||
///
|
||||
/// This is always a cheap method call.
|
||||
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
|
||||
self.state.get_query_data()
|
||||
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
|
||||
self.state.get_query_data(projection)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_util::assert_batches_eq;
|
||||
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
|
||||
|
@ -139,6 +139,8 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
#[test]
|
||||
// comparing dyn Array always uses the same vtable, so it is accurate to use Arc::ptr_eq
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
fn test_buffer_lifecycle() {
|
||||
// Initialise a buffer in the base state.
|
||||
let mut buffer: BufferState<Buffering> = BufferState::new();
|
||||
|
@ -166,7 +168,7 @@ mod tests {
|
|||
// Keep the data to validate they are ref-counted copies after further
|
||||
// writes below. Note this construct allows the caller to decide when/if
|
||||
// to allocate.
|
||||
let w1_data = buffer.get_query_data();
|
||||
let w1_data = buffer.get_query_data(&OwnedProjection::default());
|
||||
|
||||
let expected = vec![
|
||||
"+-------+----------+----------+--------------------------------+",
|
||||
|
@ -175,7 +177,7 @@ mod tests {
|
|||
"| true | 42.0 | platanos | 1991-03-10T00:00:42.000000042Z |",
|
||||
"+-------+----------+----------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[w1_data[0].deref().clone()]);
|
||||
assert_batches_eq!(&expected, &[w1_data[0].clone()]);
|
||||
|
||||
// Apply another write.
|
||||
buffer
|
||||
|
@ -195,7 +197,7 @@ mod tests {
|
|||
};
|
||||
|
||||
// Verify the writes are still queryable.
|
||||
let w2_data = buffer.get_query_data();
|
||||
let w2_data = buffer.get_query_data(&OwnedProjection::default());
|
||||
let expected = vec![
|
||||
"+-------+----------+----------+--------------------------------+",
|
||||
"| great | how_much | tag | time |",
|
||||
|
@ -205,18 +207,18 @@ mod tests {
|
|||
"+-------+----------+----------+--------------------------------+",
|
||||
];
|
||||
assert_eq!(w2_data.len(), 1);
|
||||
assert_batches_eq!(&expected, &[w2_data[0].deref().clone()]);
|
||||
assert_batches_eq!(&expected, &[w2_data[0].clone()]);
|
||||
|
||||
// Ensure the same data is returned for a second read.
|
||||
{
|
||||
let second_read = buffer.get_query_data();
|
||||
let second_read = buffer.get_query_data(&OwnedProjection::default());
|
||||
assert_eq!(w2_data, second_read);
|
||||
|
||||
// And that no data was actually copied.
|
||||
let same_arcs = w2_data
|
||||
.iter()
|
||||
.zip(second_read.iter())
|
||||
.all(|(a, b)| Arc::ptr_eq(a, b));
|
||||
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
|
||||
assert!(same_arcs);
|
||||
}
|
||||
|
||||
|
@ -224,14 +226,120 @@ mod tests {
|
|||
let buffer: BufferState<Persisting> = buffer.into_persisting();
|
||||
|
||||
// Extract the final buffered result
|
||||
let final_data = buffer.get_query_data();
|
||||
let final_data = buffer.get_query_data(&OwnedProjection::default());
|
||||
|
||||
// And once again verify no data was changed, copied or re-ordered.
|
||||
assert_eq!(w2_data, final_data);
|
||||
let same_arcs = w2_data
|
||||
.into_iter()
|
||||
.zip(final_data.into_iter())
|
||||
.all(|(a, b)| Arc::ptr_eq(&a, &b));
|
||||
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
|
||||
assert!(same_arcs);
|
||||
|
||||
// Assert the sequence numbers were recorded.
|
||||
let set = buffer.into_sequence_number_set();
|
||||
assert!(set.contains(SequenceNumber::new(0)));
|
||||
assert!(set.contains(SequenceNumber::new(1)));
|
||||
assert_eq!(set.len(), 2);
|
||||
}
|
||||
|
||||
/// Assert projection is correct across all the queryable FSM states.
|
||||
#[test]
|
||||
// Comparing dyn Array values here always involves the same vtable, so Arc::ptr_eq is accurate
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
fn test_buffer_projection() {
|
||||
let projection = OwnedProjection::from(vec![
|
||||
"tag".to_string(),
|
||||
"great".to_string(),
|
||||
"missing".to_string(),
|
||||
"time".to_string(),
|
||||
]);
|
||||
|
||||
// Initialise a buffer in the base state.
|
||||
let mut buffer: BufferState<Buffering> = BufferState::new();
|
||||
|
||||
// Write some data to a buffer.
|
||||
buffer
|
||||
.write(
|
||||
lp_to_mutable_batch(
|
||||
r#"bananas,tag=platanos great=true,how_much=42 668563242000000042"#,
|
||||
)
|
||||
.1,
|
||||
SequenceNumber::new(0),
|
||||
)
|
||||
.expect("write to empty buffer should succeed");
|
||||
|
||||
// Extract the queryable data from the buffer and validate it.
|
||||
//
|
||||
// Keep the data to validate they are ref-counted copies after further
|
||||
// writes below. Note this construct allows the caller to decide when/if
|
||||
// to allocate.
|
||||
let w1_data = buffer.get_query_data(&projection);
|
||||
|
||||
let expected = vec![
|
||||
"+----------+-------+--------------------------------+",
|
||||
"| tag | great | time |",
|
||||
"+----------+-------+--------------------------------+",
|
||||
"| platanos | true | 1991-03-10T00:00:42.000000042Z |",
|
||||
"+----------+-------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[w1_data[0].clone()]);
|
||||
|
||||
// Apply another write.
|
||||
buffer
|
||||
.write(
|
||||
lp_to_mutable_batch(
|
||||
r#"bananas,tag=platanos great=true,how_much=1000 668563242000000043"#,
|
||||
)
|
||||
.1,
|
||||
SequenceNumber::new(1),
|
||||
)
|
||||
.expect("write to empty buffer should succeed");
|
||||
|
||||
// Snapshot the buffer into an immutable, queryable data format.
|
||||
let buffer: BufferState<Snapshot> = match buffer.snapshot() {
|
||||
Transition::Ok(v) => v,
|
||||
Transition::Unchanged(_) => panic!("did not transition to snapshot state"),
|
||||
};
|
||||
|
||||
// Verify the writes are still queryable.
|
||||
let w2_data = buffer.get_query_data(&projection);
|
||||
let expected = vec![
|
||||
"+----------+-------+--------------------------------+",
|
||||
"| tag | great | time |",
|
||||
"+----------+-------+--------------------------------+",
|
||||
"| platanos | true | 1991-03-10T00:00:42.000000042Z |",
|
||||
"| platanos | true | 1991-03-10T00:00:42.000000043Z |",
|
||||
"+----------+-------+--------------------------------+",
|
||||
];
|
||||
assert_eq!(w2_data.len(), 1);
|
||||
assert_batches_eq!(&expected, &[w2_data[0].clone()]);
|
||||
|
||||
// Ensure the same data is returned for a second read.
|
||||
{
|
||||
let second_read = buffer.get_query_data(&projection);
|
||||
assert_eq!(w2_data, second_read);
|
||||
|
||||
// And that no data was actually copied.
|
||||
let same_arcs = w2_data
|
||||
.iter()
|
||||
.zip(second_read.iter())
|
||||
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
|
||||
assert!(same_arcs);
|
||||
}
|
||||
|
||||
// Finally transition into the terminal persisting state.
|
||||
let buffer: BufferState<Persisting> = buffer.into_persisting();
|
||||
|
||||
// Extract the final buffered result
|
||||
let final_data = buffer.get_query_data(&projection);
|
||||
|
||||
// And once again verify no data was changed, copied or re-ordered.
|
||||
assert_eq!(w2_data, final_data);
|
||||
let same_arcs = w2_data
|
||||
.into_iter()
|
||||
.zip(final_data.into_iter())
|
||||
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
|
||||
assert!(same_arcs);
|
||||
|
||||
// Assert the sequence numbers were recorded.
|
||||
|
@ -258,16 +366,16 @@ mod tests {
|
|||
Transition::Unchanged(_) => panic!("failed to transition"),
|
||||
};
|
||||
|
||||
assert_eq!(buffer.get_query_data().len(), 1);
|
||||
assert_eq!(buffer.get_query_data(&OwnedProjection::default()).len(), 1);
|
||||
|
||||
let snapshot = &buffer.get_query_data()[0];
|
||||
let snapshot = buffer.get_query_data(&OwnedProjection::default())[0].clone();
|
||||
|
||||
// Generate the combined buffer from the original inputs to compare
|
||||
// against.
|
||||
mb1.extend_from(&mb2).unwrap();
|
||||
let want = mb1.to_arrow(Projection::All).unwrap();
|
||||
|
||||
assert_eq!(&**snapshot, &want);
|
||||
assert_eq!(snapshot, want);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
//! A write buffer.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use mutable_batch::MutableBatch;
|
||||
use schema::Projection;
|
||||
|
||||
use super::{snapshot::Snapshot, BufferState, Transition};
|
||||
use crate::buffer_tree::partition::buffer::{
|
||||
mutable_buffer::Buffer,
|
||||
traits::{Queryable, Writeable},
|
||||
use crate::{
|
||||
buffer_tree::partition::buffer::{
|
||||
mutable_buffer::Buffer,
|
||||
traits::{Queryable, Writeable},
|
||||
},
|
||||
query::projection::OwnedProjection,
|
||||
};
|
||||
|
||||
/// The FSM starting ingest state - a mutable buffer collecting writes.
|
||||
|
@ -35,18 +35,11 @@ pub(crate) struct Buffering {
|
|||
/// This method panics if converting the buffered data (if any) into an Arrow
|
||||
/// [`RecordBatch`] fails (a non-transient error).
|
||||
impl Queryable for Buffering {
|
||||
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
|
||||
let data = self.buffer.buffer().map(|v| {
|
||||
Arc::new(
|
||||
v.to_arrow(Projection::All)
|
||||
.expect("failed to snapshot buffer data"),
|
||||
)
|
||||
});
|
||||
|
||||
match data {
|
||||
Some(v) => vec![v],
|
||||
None => vec![],
|
||||
}
|
||||
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
|
||||
self.buffer
|
||||
.buffer()
|
||||
.map(|v| vec![projection.project_mutable_batches(v)])
|
||||
.unwrap_or_default()
|
||||
}
|
||||
}
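
A minimal sketch of what the projected `Buffering` read does end to end, reusing the `lp_to_mutable_batch` test helper seen in the tests above (the line protocol literal is illustrative only).

#[test]
fn sketch_projected_buffering_read() {
    use mutable_batch_lp::test_helpers::lp_to_mutable_batch;

    use crate::query::projection::OwnedProjection;

    // Build a single-row MutableBatch and project it down to two columns;
    // the non-existent "missing" column is silently skipped.
    let (_table, mb) = lp_to_mutable_batch(r#"bananas,tag=platanos great=true 42"#);
    let projection = OwnedProjection::from(vec![
        "tag".to_string(),
        "missing".to_string(),
        "time".to_string(),
    ]);

    let batch = projection.project_mutable_batches(&mb);
    assert_eq!(batch.num_rows(), 1);
    assert_eq!(batch.num_columns(), 2);
}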
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
//! A write buffer, with one or more snapshots.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use data_types::sequence_number_set::SequenceNumberSet;
|
||||
|
||||
use super::BufferState;
|
||||
use crate::buffer_tree::partition::buffer::traits::Queryable;
|
||||
use crate::{
|
||||
buffer_tree::partition::buffer::traits::Queryable, query::projection::OwnedProjection,
|
||||
};
|
||||
|
||||
/// An immutable set of [`RecordBatch`] in the process of being persisted.
|
||||
#[derive(Debug)]
|
||||
|
@ -14,18 +14,18 @@ pub(crate) struct Persisting {
|
|||
/// Snapshots generated from previous buffer contents to be persisted.
|
||||
///
|
||||
/// INVARIANT: this array is always non-empty.
|
||||
snapshots: Vec<Arc<RecordBatch>>,
|
||||
snapshots: Vec<RecordBatch>,
|
||||
}
|
||||
|
||||
impl Persisting {
|
||||
pub(super) fn new(snapshots: Vec<Arc<RecordBatch>>) -> Self {
|
||||
pub(super) fn new(snapshots: Vec<RecordBatch>) -> Self {
|
||||
Self { snapshots }
|
||||
}
|
||||
}
|
||||
|
||||
impl Queryable for Persisting {
|
||||
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
|
||||
self.snapshots.clone()
|
||||
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
|
||||
projection.project_record_batch(&self.snapshots)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
//! A write buffer, with one or more snapshots.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
|
||||
use super::BufferState;
|
||||
use crate::buffer_tree::partition::buffer::{
|
||||
state_machine::persisting::Persisting, traits::Queryable,
|
||||
use crate::{
|
||||
buffer_tree::partition::buffer::{state_machine::persisting::Persisting, traits::Queryable},
|
||||
query::projection::OwnedProjection,
|
||||
};
|
||||
|
||||
/// An immutable, queryable FSM state containing at least one buffer snapshot.
|
||||
|
@ -15,19 +14,19 @@ pub(crate) struct Snapshot {
|
|||
/// Snapshots generated from previous buffer contents.
|
||||
///
|
||||
/// INVARIANT: this array is always non-empty.
|
||||
snapshots: Vec<Arc<RecordBatch>>,
|
||||
snapshots: Vec<RecordBatch>,
|
||||
}
|
||||
|
||||
impl Snapshot {
|
||||
pub(super) fn new(snapshots: Vec<Arc<RecordBatch>>) -> Self {
|
||||
pub(super) fn new(snapshots: Vec<RecordBatch>) -> Self {
|
||||
assert!(!snapshots.is_empty());
|
||||
Self { snapshots }
|
||||
}
|
||||
}
|
||||
|
||||
impl Queryable for Snapshot {
|
||||
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
|
||||
self.snapshots.clone()
|
||||
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
|
||||
projection.project_record_batch(&self.snapshots)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
//! Private traits for state machine states.
|
||||
|
||||
use std::{fmt::Debug, sync::Arc};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use mutable_batch::MutableBatch;
|
||||
|
||||
use crate::query::projection::OwnedProjection;
|
||||
|
||||
/// A state that can accept writes.
|
||||
pub(crate) trait Writeable: Debug {
|
||||
fn write(&mut self, batch: MutableBatch) -> Result<(), mutable_batch::Error>;
|
||||
|
@ -13,5 +15,5 @@ pub(crate) trait Writeable: Debug {
|
|||
/// A state that can return the contents of the buffer as one or more
|
||||
/// [`RecordBatch`] instances.
|
||||
pub(crate) trait Queryable: Debug {
|
||||
fn get_query_data(&self) -> Vec<Arc<RecordBatch>>;
|
||||
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch>;
|
||||
}
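
As a rough sketch of how a caller is expected to drive the updated trait (the helper name below is illustrative; an empty column list is treated as "all columns"):

use arrow::record_batch::RecordBatch;

use crate::query::projection::OwnedProjection;

/// Project the buffered data of any queryable FSM state down to `columns`,
/// falling back to all columns when the request names none.
fn project_columns(state: &dyn Queryable, columns: Vec<String>) -> Vec<RecordBatch> {
    let projection = OwnedProjection::from(columns);
    state.get_query_data(&projection)
}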
|
||||
|
|
|
@ -100,7 +100,11 @@ mod tests {
|
|||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use assert_matches::assert_matches;
|
||||
use iox_catalog::test_helpers::{arbitrary_namespace, arbitrary_table};
|
||||
use data_types::TransitionPartitionId;
|
||||
use iox_catalog::{
|
||||
partition_lookup,
|
||||
test_helpers::{arbitrary_namespace, arbitrary_table},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::buffer_tree::table::TableName;
|
||||
|
@ -161,11 +165,9 @@ mod tests {
|
|||
assert_matches!(got.lock().sort_key(), SortKeyState::Provided(None));
|
||||
assert!(got.lock().partition_key.ptr_eq(&callers_partition_key));
|
||||
|
||||
let got = catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.get_by_id(got.lock().partition_id)
|
||||
let mut repos = catalog.repositories().await;
|
||||
let id = TransitionPartitionId::Deprecated(got.lock().partition_id);
|
||||
let got = partition_lookup(repos.as_mut(), &id)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("partition not created");
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use backoff::{Backoff, BackoffConfig};
|
||||
use data_types::PartitionId;
|
||||
use iox_catalog::interface::Catalog;
|
||||
use data_types::{PartitionId, TransitionPartitionId};
|
||||
use iox_catalog::{interface::Catalog, partition_lookup};
|
||||
use schema::sort::SortKey;
|
||||
|
||||
/// A resolver of [`SortKey`] from the catalog for a given [`PartitionId`].
|
||||
|
@ -33,12 +33,9 @@ impl SortKeyResolver {
|
|||
pub(crate) async fn fetch(self) -> Option<SortKey> {
|
||||
Backoff::new(&self.backoff_config)
|
||||
.retry_all_errors("fetch partition sort key", || async {
|
||||
let s = self
|
||||
.catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.get_by_id(self.partition_id)
|
||||
let mut repos = self.catalog.repositories().await;
|
||||
let id = TransitionPartitionId::Deprecated(self.partition_id);
|
||||
let s = partition_lookup(repos.as_mut(), &id)
|
||||
.await?
|
||||
.unwrap_or_else(|| {
|
||||
panic!(
|
||||
|
@ -76,24 +73,27 @@ mod tests {
|
|||
// Populate the catalog with the namespace / table
|
||||
let (_ns_id, table_id) = populate_catalog(&*catalog, NAMESPACE_NAME, TABLE_NAME).await;
|
||||
|
||||
let partition_id = catalog
|
||||
let partition = catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.create_or_get(PARTITION_KEY.into(), table_id)
|
||||
.await
|
||||
.expect("should create")
|
||||
.id;
|
||||
.expect("should create");
|
||||
|
||||
let fetcher =
|
||||
SortKeyResolver::new(partition_id, Arc::clone(&catalog), backoff_config.clone());
|
||||
SortKeyResolver::new(partition.id, Arc::clone(&catalog), backoff_config.clone());
|
||||
|
||||
// Set the sort key
|
||||
let catalog_state = catalog
|
||||
.repositories()
|
||||
.await
|
||||
.partitions()
|
||||
.cas_sort_key(partition_id, None, &["uno", "dos", "bananas"])
|
||||
.cas_sort_key(
|
||||
&partition.transition_partition_id(),
|
||||
None,
|
||||
&["uno", "dos", "bananas"],
|
||||
)
|
||||
.await
|
||||
.expect("should update existing partition key");
|
||||
|
||||
|
|
|
@ -18,7 +18,10 @@ use crate::{
|
|||
dml_payload::IngestOp,
|
||||
dml_sink::DmlSink,
|
||||
partition_iter::PartitionIter,
|
||||
query::{response::QueryResponse, tracing::QueryExecTracing, QueryError, QueryExec},
|
||||
query::{
|
||||
projection::OwnedProjection, response::QueryResponse, tracing::QueryExecTracing,
|
||||
QueryError, QueryExec,
|
||||
},
|
||||
};
|
||||
|
||||
/// A [`BufferTree`] is the root of an in-memory tree of many [`NamespaceData`]
|
||||
|
@ -201,7 +204,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -213,7 +216,7 @@ where
|
|||
// Delegate query execution to the namespace, wrapping the execution in
|
||||
// a tracing delegate to emit a child span.
|
||||
QueryExecTracing::new(inner, "namespace")
|
||||
.query_exec(namespace_id, table_id, columns, span, predicate)
|
||||
.query_exec(namespace_id, table_id, projection, span, predicate)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
@ -399,7 +402,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
$predicate
|
||||
)
|
||||
|
@ -742,6 +745,108 @@ mod tests {
|
|||
]
|
||||
);
|
||||
|
||||
/// Ensure partition pruning during query execution also prunes metadata
|
||||
/// frames.
|
||||
///
|
||||
/// Individual frames are fast to serialise, but large numbers of frames can
|
||||
/// add significant query overhead, particularly for queries returning small
|
||||
/// numbers of rows where the metadata becomes a significant portion of the
|
||||
/// response.
|
||||
#[tokio::test]
|
||||
async fn test_partition_metadata_pruning() {
|
||||
let partition_provider = Arc::new(
|
||||
MockPartitionProvider::default()
|
||||
.with_partition(
|
||||
PartitionDataBuilder::new()
|
||||
.with_partition_id(ARBITRARY_PARTITION_ID)
|
||||
.with_partition_key("madrid".into())
|
||||
.build(),
|
||||
)
|
||||
.with_partition(
|
||||
PartitionDataBuilder::new()
|
||||
.with_partition_id(PARTITION2_ID)
|
||||
.with_partition_key("asturias".into())
|
||||
.build(),
|
||||
),
|
||||
);
|
||||
|
||||
// Construct a partition template suitable for pruning on the "region"
|
||||
// tag.
|
||||
let table_provider = Arc::new(MockTableProvider::new(TableMetadata::new_for_testing(
|
||||
ARBITRARY_TABLE_NAME.clone(),
|
||||
test_table_partition_override(vec![TemplatePart::TagValue("region")]),
|
||||
)));
|
||||
|
||||
// Init the buffer tree
|
||||
let buf = BufferTree::new(
|
||||
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
|
||||
table_provider,
|
||||
partition_provider,
|
||||
Arc::new(MockPostWriteObserver::default()),
|
||||
Arc::new(metric::Registry::default()),
|
||||
);
|
||||
|
||||
// Write to two regions
|
||||
buf.apply(IngestOp::Write(make_write_op(
|
||||
&PartitionKey::from("madrid"),
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
&ARBITRARY_TABLE_NAME,
|
||||
ARBITRARY_TABLE_ID,
|
||||
0,
|
||||
&format!(
|
||||
r#"{},region=madrid temp=35 4242424242"#,
|
||||
&*ARBITRARY_TABLE_NAME
|
||||
),
|
||||
None,
|
||||
)))
|
||||
.await
|
||||
.expect("failed to perform write");
|
||||
|
||||
buf.apply(IngestOp::Write(make_write_op(
|
||||
&PartitionKey::from("asturias"),
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
&ARBITRARY_TABLE_NAME,
|
||||
ARBITRARY_TABLE_ID,
|
||||
0,
|
||||
&format!(
|
||||
r#"{},region=asturias temp=35 4242424242"#,
|
||||
&*ARBITRARY_TABLE_NAME
|
||||
),
|
||||
None,
|
||||
)))
|
||||
.await
|
||||
.expect("failed to perform write");
|
||||
|
||||
// Construct a predicate suitable for pruning partitions based on the
|
||||
// region / partition template.
|
||||
let predicate = Some(Predicate::new().with_expr(col("region").eq(lit(
|
||||
ScalarValue::Dictionary(
|
||||
Box::new(DataType::Int32),
|
||||
Box::new(ScalarValue::from("asturias")),
|
||||
),
|
||||
))));
|
||||
|
||||
// Execute the query and count the number of partitions that are
|
||||
// returned (either data, or metadata).
|
||||
let partition_count = buf
|
||||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
predicate,
|
||||
)
|
||||
.await
|
||||
.expect("query should succeed")
|
||||
.into_partition_stream()
|
||||
.count()
|
||||
.await;
|
||||
|
||||
// Because the data in the "madrid" partition was pruned out, the
|
||||
// metadata should not be sent either.
|
||||
assert_eq!(partition_count, 1);
|
||||
}
|
||||
|
||||
/// Assert that multiple writes to a single namespace/table results in a
|
||||
/// single namespace being created, and matching metrics.
|
||||
#[tokio::test]
|
||||
|
@ -966,7 +1071,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
@ -994,7 +1099,13 @@ mod tests {
|
|||
|
||||
// Ensure an unknown table errors
|
||||
let err = buf
|
||||
.query_exec(ARBITRARY_NAMESPACE_ID, TABLE2_ID, vec![], None, None)
|
||||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
TABLE2_ID,
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.expect_err("query should fail");
|
||||
assert_matches!(err, QueryError::TableNotFound(ns, t) => {
|
||||
|
@ -1006,7 +1117,7 @@ mod tests {
|
|||
buf.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
@ -1080,7 +1191,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
|
|
@ -18,7 +18,6 @@ use iox_query::{
|
|||
use mutable_batch::MutableBatch;
|
||||
use parking_lot::Mutex;
|
||||
use predicate::Predicate;
|
||||
use schema::Projection;
|
||||
use trace::span::{Span, SpanRecorder};
|
||||
|
||||
use super::{
|
||||
|
@ -30,7 +29,8 @@ use crate::{
|
|||
arcmap::ArcMap,
|
||||
deferred_load::DeferredLoad,
|
||||
query::{
|
||||
partition_response::PartitionResponse, response::PartitionStream, QueryError, QueryExec,
|
||||
partition_response::PartitionResponse, projection::OwnedProjection,
|
||||
response::PartitionStream, QueryError, QueryExec,
|
||||
},
|
||||
query_adaptor::QueryAdaptor,
|
||||
};
|
||||
|
@ -256,7 +256,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -270,7 +270,7 @@ where
|
|||
|
||||
// Gather the partition data from all of the partitions in this table.
|
||||
let span = SpanRecorder::new(span);
|
||||
let partitions = self.partitions().into_iter().map(move |p| {
|
||||
let partitions = self.partitions().into_iter().filter_map(move |p| {
|
||||
let mut span = span.child("partition read");
|
||||
|
||||
let (id, hash_id, completed_persistence_count, data, partition_key) = {
|
||||
|
@ -279,7 +279,7 @@ where
|
|||
p.partition_id(),
|
||||
p.partition_hash_id().cloned(),
|
||||
p.completed_persistence_count(),
|
||||
p.get_query_data(),
|
||||
p.get_query_data(&projection),
|
||||
p.partition_key().clone(),
|
||||
)
|
||||
};
|
||||
|
@ -288,8 +288,6 @@ where
|
|||
Some(data) => {
|
||||
assert_eq!(id, data.partition_id());
|
||||
|
||||
let data = Arc::new(data);
|
||||
|
||||
// Potentially prune out this partition if the partition
|
||||
// template & derived partition key can be used to match
|
||||
// against the optional predicate.
|
||||
|
@ -305,30 +303,36 @@ where
|
|||
})
|
||||
.unwrap_or_default()
|
||||
{
|
||||
return PartitionResponse::new(
|
||||
vec![],
|
||||
id,
|
||||
hash_id,
|
||||
completed_persistence_count,
|
||||
);
|
||||
// This partition will never contain any data that would
|
||||
// form part of the query response.
|
||||
//
|
||||
// Because this is true of buffered data, it is also
|
||||
// true of the persisted data, and therefore sending the
|
||||
// persisted file count metadata is useless because the
|
||||
// querier would never utilise the persisted files as
|
||||
// part of this query.
|
||||
//
|
||||
// This avoids sending O(n) metadata frames for queries
|
||||
// that may only touch one or two actual frames. The N
|
||||
// partition count grows over the lifetime of the
|
||||
// ingester as more partitions are created, and while
|
||||
// fast to serialise individually, the sequentially-sent
|
||||
// N metadata frames add up.
|
||||
return None;
|
||||
}
|
||||
|
||||
// Project the data if necessary
|
||||
let columns = columns.iter().map(String::as_str).collect::<Vec<_>>();
|
||||
let selection = if columns.is_empty() {
|
||||
Projection::All
|
||||
} else {
|
||||
Projection::Some(columns.as_ref())
|
||||
};
|
||||
|
||||
let data = data.project_selection(selection).into_iter().collect();
|
||||
PartitionResponse::new(data, id, hash_id, completed_persistence_count)
|
||||
PartitionResponse::new(
|
||||
data.into_record_batches(),
|
||||
id,
|
||||
hash_id,
|
||||
completed_persistence_count,
|
||||
)
|
||||
}
|
||||
None => PartitionResponse::new(vec![], id, hash_id, completed_persistence_count),
|
||||
};
|
||||
|
||||
span.ok("read partition data");
|
||||
ret
|
||||
Some(ret)
|
||||
});
|
||||
|
||||
Ok(PartitionStream::new(futures::stream::iter(partitions)))
|
||||
|
|
|
@ -21,6 +21,7 @@ use parquet_file::storage::ParquetStorage;
|
|||
use thiserror::Error;
|
||||
use tokio::sync::oneshot;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracker::DiskSpaceMetrics;
|
||||
use wal::Wal;
|
||||
|
||||
use crate::{
|
||||
|
@ -100,6 +101,11 @@ pub struct IngesterGuard<T> {
|
|||
/// Aborted on drop.
|
||||
rotation_task: tokio::task::JoinHandle<()>,
|
||||
|
||||
/// The handle of the periodic disk metric task.
|
||||
///
|
||||
/// Aborted on drop.
|
||||
disk_metric_task: tokio::task::JoinHandle<()>,
|
||||
|
||||
/// The task handle executing the graceful shutdown once triggered.
|
||||
graceful_shutdown_handler: tokio::task::JoinHandle<()>,
|
||||
shutdown_complete: Shared<oneshot::Receiver<()>>,
|
||||
|
@ -126,6 +132,7 @@ where
|
|||
impl<T> Drop for IngesterGuard<T> {
|
||||
fn drop(&mut self) {
|
||||
self.rotation_task.abort();
|
||||
self.disk_metric_task.abort();
|
||||
self.graceful_shutdown_handler.abort();
|
||||
}
|
||||
}
|
||||
|
@ -289,7 +296,9 @@ where
|
|||
let ingest_state = Arc::new(IngestState::default());
|
||||
|
||||
// Initialise the WAL
|
||||
let wal = Wal::new(wal_directory).await.map_err(InitError::WalInit)?;
|
||||
let wal = Wal::new(wal_directory.clone())
|
||||
.await
|
||||
.map_err(InitError::WalInit)?;
|
||||
|
||||
// Prepare the WAL segment reference tracker
|
||||
let (wal_reference_handle, wal_reference_actor) =
|
||||
|
@ -339,6 +348,16 @@ where
|
|||
// The tokio handle does not need to be retained here as the actor handle is
|
||||
// responsible for aborting the actor's run loop when dropped.
|
||||
tokio::spawn(wal_reference_actor.run());
|
||||
|
||||
// Initialize disk metrics to emit disk capacity / free statistics for the
|
||||
// WAL directory.
|
||||
let disk_metric_task = tokio::task::spawn(
|
||||
DiskSpaceMetrics::new(wal_directory, &metrics)
|
||||
.expect("failed to resolve WAL directory to disk")
|
||||
.run(),
|
||||
);
|
||||
|
||||
// Replay the WAL log files, if any.
|
||||
let max_sequence_number =
|
||||
wal_replay::replay(&wal, &buffer, Arc::clone(&persist_handle), &metrics)
|
||||
.await
|
||||
|
@ -416,6 +435,7 @@ where
|
|||
persist_handle,
|
||||
),
|
||||
rotation_task,
|
||||
disk_metric_task,
|
||||
graceful_shutdown_handler: shutdown_task,
|
||||
shutdown_complete: shutdown_rx.shared(),
|
||||
})
|
||||
|
|
|
@ -9,6 +9,7 @@ use crate::{
|
|||
ingest_state::{IngestState, IngestStateError},
|
||||
partition_iter::PartitionIter,
|
||||
persist::{drain_buffer::persist_partitions, queue::PersistQueue},
|
||||
query::projection::OwnedProjection,
|
||||
wal::reference_tracker::WalReferenceHandle,
|
||||
};
|
||||
|
||||
|
@ -79,10 +80,11 @@ pub(super) async fn graceful_shutdown_handler<F, T, P>(
|
|||
// springs to life and buffers in the buffer tree after this check has
|
||||
// completed - I think this is extreme enough to accept as a theoretical
|
||||
// possibility that doesn't need covering off in practice.
|
||||
while buffer
|
||||
.partition_iter()
|
||||
.any(|p| p.lock().get_query_data().is_some())
|
||||
{
|
||||
while buffer.partition_iter().any(|p| {
|
||||
p.lock()
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.is_some()
|
||||
}) {
|
||||
if persist_partitions(buffer.partition_iter(), &persist).await != 0 {
|
||||
// Late arriving writes needed persisting.
|
||||
debug!("re-persisting late arriving data");
|
||||
|
|
|
@ -67,10 +67,7 @@ pub(super) async fn compact_persisting_batch(
|
|||
adjust_sort_key_columns(&sk, &batch.schema().primary_key())
|
||||
}
|
||||
None => {
|
||||
let sort_key = compute_sort_key(
|
||||
batch.schema(),
|
||||
batch.record_batches().iter().map(|sb| sb.as_ref()),
|
||||
);
|
||||
let sort_key = compute_sort_key(batch.schema(), batch.record_batches().iter());
|
||||
// Use the sort key computed from the cardinality as the sort key for this parquet
|
||||
// file's metadata, also return the sort key to be stored in the catalog
|
||||
(sort_key.clone(), Some(sort_key))
|
||||
|
@ -127,7 +124,7 @@ mod tests {
|
|||
.to_arrow(Projection::All)
|
||||
.unwrap();
|
||||
|
||||
let batch = QueryAdaptor::new(ARBITRARY_PARTITION_ID, vec![Arc::new(batch)]);
|
||||
let batch = QueryAdaptor::new(ARBITRARY_PARTITION_ID, vec![batch]);
|
||||
|
||||
// verify PK
|
||||
let schema = batch.schema();
|
||||
|
@ -459,8 +456,7 @@ mod tests {
|
|||
let expected_pk = vec!["tag1", "time"];
|
||||
assert_eq!(expected_pk, pk);
|
||||
|
||||
let sort_key =
|
||||
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
|
||||
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
|
||||
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
|
||||
|
||||
// compact
|
||||
|
@ -500,8 +496,7 @@ mod tests {
|
|||
let expected_pk = vec!["tag1", "time"];
|
||||
assert_eq!(expected_pk, pk);
|
||||
|
||||
let sort_key =
|
||||
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
|
||||
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
|
||||
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
|
||||
|
||||
// compact
|
||||
|
@ -549,8 +544,7 @@ mod tests {
|
|||
let expected_pk = vec!["tag1", "time"];
|
||||
assert_eq!(expected_pk, pk);
|
||||
|
||||
let sort_key =
|
||||
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
|
||||
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
|
||||
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
|
||||
|
||||
// compact
|
||||
|
@ -596,8 +590,7 @@ mod tests {
|
|||
let expected_pk = vec!["tag1", "tag2", "time"];
|
||||
assert_eq!(expected_pk, pk);
|
||||
|
||||
let sort_key =
|
||||
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
|
||||
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
|
||||
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
|
||||
|
||||
// compact
|
||||
|
@ -647,8 +640,7 @@ mod tests {
|
|||
let expected_pk = vec!["tag1", "tag2", "time"];
|
||||
assert_eq!(expected_pk, pk);
|
||||
|
||||
let sort_key =
|
||||
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
|
||||
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
|
||||
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
|
||||
|
||||
// compact
|
||||
|
@ -699,7 +691,7 @@ mod tests {
|
|||
batch.schema();
|
||||
}
|
||||
|
||||
async fn create_one_row_record_batch_with_influxtype() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_one_row_record_batch_with_influxtype() -> Vec<RecordBatch> {
|
||||
let chunk1 = Arc::new(
|
||||
TestChunk::new("t")
|
||||
.with_id(1)
|
||||
|
@ -723,11 +715,10 @@ mod tests {
|
|||
];
|
||||
assert_batches_eq!(&expected, &batches);
|
||||
|
||||
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
|
||||
batches
|
||||
}
|
||||
|
||||
async fn create_one_record_batch_with_influxtype_no_duplicates() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_one_record_batch_with_influxtype_no_duplicates() -> Vec<RecordBatch> {
|
||||
let chunk1 = Arc::new(
|
||||
TestChunk::new("t")
|
||||
.with_id(1)
|
||||
|
@ -753,11 +744,10 @@ mod tests {
|
|||
];
|
||||
assert_batches_eq!(&expected, &batches);
|
||||
|
||||
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
|
||||
batches
|
||||
}
|
||||
|
||||
async fn create_one_record_batch_with_influxtype_duplicates() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_one_record_batch_with_influxtype_duplicates() -> Vec<RecordBatch> {
|
||||
let chunk1 = Arc::new(
|
||||
TestChunk::new("t")
|
||||
.with_id(1)
|
||||
|
@ -790,12 +780,11 @@ mod tests {
|
|||
];
|
||||
assert_batches_eq!(&expected, &batches);
|
||||
|
||||
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
|
||||
batches
|
||||
}
|
||||
|
||||
/// RecordBatches with knowledge of influx metadata
|
||||
async fn create_batches_with_influxtype() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_batches_with_influxtype() -> Vec<RecordBatch> {
|
||||
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
|
||||
let mut batches = vec![];
|
||||
|
||||
|
@ -826,7 +815,7 @@ mod tests {
|
|||
"+-----------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch1.clone()]);
|
||||
batches.push(Arc::new(batch1));
|
||||
batches.push(batch1);
|
||||
|
||||
// chunk2 having duplicate data with chunk 1
|
||||
let chunk2 = Arc::new(
|
||||
|
@ -850,7 +839,7 @@ mod tests {
|
|||
"+-----------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch2.clone()]);
|
||||
batches.push(Arc::new(batch2));
|
||||
batches.push(batch2);
|
||||
|
||||
// verify data from both batches
|
||||
let expected = vec![
|
||||
|
@ -874,14 +863,13 @@ mod tests {
|
|||
"| 5 | MT | 1970-01-01T00:00:00.000005Z |",
|
||||
"+-----------+------+--------------------------------+",
|
||||
];
|
||||
let b: Vec<_> = batches.iter().map(|b| (**b).clone()).collect();
|
||||
assert_batches_eq!(&expected, &b);
|
||||
assert_batches_eq!(&expected, &batches);
|
||||
|
||||
batches
|
||||
}
|
||||
|
||||
/// RecordBatches with knowledge of influx metadata
|
||||
async fn create_batches_with_influxtype_different_columns() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_batches_with_influxtype_different_columns() -> Vec<RecordBatch> {
|
||||
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
|
||||
let mut batches = vec![];
|
||||
|
||||
|
@ -912,7 +900,7 @@ mod tests {
|
|||
"+-----------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch1.clone()]);
|
||||
batches.push(Arc::new(batch1));
|
||||
batches.push(batch1);
|
||||
|
||||
// chunk2 having duplicate data with chunk 1
|
||||
// more columns
|
||||
|
@ -939,14 +927,14 @@ mod tests {
|
|||
"+-----------+------------+------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch2.clone()]);
|
||||
batches.push(Arc::new(batch2));
|
||||
batches.push(batch2);
|
||||
|
||||
batches
|
||||
}
|
||||
|
||||
/// RecordBatches with knowledge of influx metadata
|
||||
async fn create_batches_with_influxtype_different_columns_different_order(
|
||||
) -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_batches_with_influxtype_different_columns_different_order() -> Vec<RecordBatch>
|
||||
{
|
||||
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
|
||||
let mut batches = vec![];
|
||||
|
||||
|
@ -978,7 +966,7 @@ mod tests {
|
|||
"+-----------+------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch1.clone()]);
|
||||
batches.push(Arc::new(batch1.clone()));
|
||||
batches.push(batch1.clone());
|
||||
|
||||
// chunk2 having duplicate data with chunk 1
|
||||
// more columns
|
||||
|
@ -1003,13 +991,13 @@ mod tests {
|
|||
"+-----------+------+--------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch2.clone()]);
|
||||
batches.push(Arc::new(batch2));
|
||||
batches.push(batch2);
|
||||
|
||||
batches
|
||||
}
|
||||
|
||||
/// Has 2 tag columns; tag1 has a lower cardinality (3) than tag3 (4)
|
||||
async fn create_batches_with_influxtype_different_cardinality() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_batches_with_influxtype_different_cardinality() -> Vec<RecordBatch> {
|
||||
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
|
||||
let mut batches = vec![];
|
||||
|
||||
|
@ -1034,7 +1022,7 @@ mod tests {
|
|||
"+-----------+------+------+-----------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch1.clone()]);
|
||||
batches.push(Arc::new(batch1.clone()));
|
||||
batches.push(batch1.clone());
|
||||
|
||||
let chunk2 = Arc::new(
|
||||
TestChunk::new("t")
|
||||
|
@ -1057,13 +1045,13 @@ mod tests {
|
|||
"+-----------+------+------+-----------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch2.clone()]);
|
||||
batches.push(Arc::new(batch2));
|
||||
batches.push(batch2);
|
||||
|
||||
batches
|
||||
}
|
||||
|
||||
/// RecordBatches with knowledge of influx metadata
|
||||
async fn create_batches_with_influxtype_same_columns_different_type() -> Vec<Arc<RecordBatch>> {
|
||||
async fn create_batches_with_influxtype_same_columns_different_type() -> Vec<RecordBatch> {
|
||||
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
|
||||
let mut batches = vec![];
|
||||
|
||||
|
@ -1087,7 +1075,7 @@ mod tests {
|
|||
"+-----------+------+-----------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch1.clone()]);
|
||||
batches.push(Arc::new(batch1));
|
||||
batches.push(batch1);
|
||||
|
||||
// chunk2 having duplicate data with chunk 1
|
||||
// more columns
|
||||
|
@ -1110,7 +1098,7 @@ mod tests {
|
|||
"+-----------+------+-----------------------------+",
|
||||
];
|
||||
assert_batches_eq!(&expected, &[batch2.clone()]);
|
||||
batches.push(Arc::new(batch2));
|
||||
batches.push(batch2);
|
||||
|
||||
batches
|
||||
}
|
||||
|
|
|
@ -110,6 +110,7 @@ mod tests {
|
|||
|
||||
use crate::{
|
||||
persist::queue::mock::MockPersistQueue,
|
||||
query::projection::OwnedProjection,
|
||||
test_util::{PartitionDataBuilder, ARBITRARY_TABLE_NAME},
|
||||
};
|
||||
|
||||
|
@ -162,7 +163,9 @@ mod tests {
|
|||
guard
|
||||
.buffer_write(mb, SequenceNumber::new(2))
|
||||
.expect("write should succeed");
|
||||
guard.get_query_data().expect("should have query adaptor")
|
||||
guard
|
||||
.get_query_data(&OwnedProjection::default())
|
||||
.expect("should have query adaptor")
|
||||
};
|
||||
|
||||
hot_partition_persister.observe(Arc::clone(&p), p.lock());
|
||||
|
@ -170,7 +173,7 @@ mod tests {
|
|||
tokio::task::yield_now().await;
|
||||
// Assert the partition was queued for persistence with the correct data.
|
||||
assert_matches!(persist_handle.calls().as_slice(), [got] => {
|
||||
let got_query_data = got.lock().get_query_data().expect("should have query adaptor");
|
||||
let got_query_data = got.lock().get_query_data(&OwnedProjection::default()).expect("should have query adaptor");
|
||||
assert_eq!(got_query_data.record_batches(), want_query_data.record_batches());
|
||||
});
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@ mod tests {
|
|||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use assert_matches::assert_matches;
|
||||
use data_types::{CompactionLevel, ParquetFile};
|
||||
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
|
||||
use futures::TryStreamExt;
|
||||
use iox_catalog::{
|
||||
interface::{get_schema_by_id, Catalog, SoftDeletedRows},
|
||||
|
@ -243,7 +243,7 @@ mod tests {
|
|||
.repositories()
|
||||
.await
|
||||
.parquet_files()
|
||||
.list_by_partition_not_to_delete(partition_id)
|
||||
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
|
||||
.await
|
||||
.expect("query for parquet files failed");
|
||||
|
||||
|
@ -344,7 +344,7 @@ mod tests {
|
|||
.await
|
||||
.partitions()
|
||||
.cas_sort_key(
|
||||
partition_id,
|
||||
&transition_partition_id,
|
||||
None,
|
||||
&["bananas", "are", "good", "for", "you"],
|
||||
)
|
||||
|
@ -392,7 +392,7 @@ mod tests {
|
|||
.repositories()
|
||||
.await
|
||||
.parquet_files()
|
||||
.list_by_partition_not_to_delete(partition_id)
|
||||
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
|
||||
.await
|
||||
.expect("query for parquet files failed");
|
||||
|
||||
|
|
|
@ -376,7 +376,11 @@ where
|
|||
let mut repos = catalog.repositories().await;
|
||||
match repos
|
||||
.partitions()
|
||||
.cas_sort_key(ctx.partition_id(), old_sort_key.clone(), &new_sort_key_str)
|
||||
.cas_sort_key(
|
||||
&ctx.transition_partition_id(),
|
||||
old_sort_key.clone(),
|
||||
&new_sort_key_str,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => ControlFlow::Break(Ok(())),
|
||||
|
|
|
@ -7,7 +7,7 @@ use metric::{DurationHistogram, Metric};
|
|||
use predicate::Predicate;
|
||||
use trace::span::Span;
|
||||
|
||||
use super::QueryExec;
|
||||
use super::{projection::OwnedProjection, QueryExec};
|
||||
use crate::query::QueryError;
|
||||
|
||||
/// An instrumentation decorator over a [`QueryExec`] implementation.
|
||||
|
@ -63,7 +63,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -71,7 +71,7 @@ where
|
|||
|
||||
let res = self
|
||||
.inner
|
||||
.query_exec(namespace_id, table_id, columns, span, predicate)
|
||||
.query_exec(namespace_id, table_id, projection, span, predicate)
|
||||
.await;
|
||||
|
||||
if let Some(delta) = self.time_provider.now().checked_duration_since(t) {
|
||||
|
@ -115,7 +115,7 @@ mod tests {
|
|||
|
||||
// Call the decorator and assert the return value
|
||||
let got = decorator
|
||||
.query_exec(NamespaceId::new(42), TableId::new(24), vec![], None, None)
|
||||
.query_exec(NamespaceId::new(42), TableId::new(24), OwnedProjection::default(), None, None)
|
||||
.await;
|
||||
assert_matches!(got, $($want_ret)+);
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ use parking_lot::Mutex;
|
|||
use predicate::Predicate;
|
||||
use trace::span::Span;
|
||||
|
||||
use super::{response::QueryResponse, QueryError, QueryExec};
|
||||
use super::{projection::OwnedProjection, response::QueryResponse, QueryError, QueryExec};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct MockQueryExec {
|
||||
|
@ -26,7 +26,7 @@ impl QueryExec for MockQueryExec {
|
|||
&self,
|
||||
_namespace_id: NamespaceId,
|
||||
_table_id: TableId,
|
||||
_columns: Vec<String>,
|
||||
_projection: OwnedProjection,
|
||||
_span: Option<Span>,
|
||||
_predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
mod r#trait;
|
||||
pub(crate) use r#trait::*;
|
||||
|
||||
pub(crate) mod projection;
|
||||
|
||||
// Response types
|
||||
pub(crate) mod partition_response;
|
||||
pub(crate) mod response;
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
use arrow::record_batch::RecordBatch;
|
||||
use mutable_batch::MutableBatch;
|
||||
use schema::SchemaBuilder;
|
||||
|
||||
/// The private inner type to prevent callers from constructing an empty Subset.
|
||||
#[derive(Debug, Default)]
|
||||
enum Projection {
|
||||
/// Return all columns.
|
||||
#[default]
|
||||
All,
|
||||
|
||||
/// Return the specified subset of columns.
|
||||
///
|
||||
/// The returned columns MAY NOT match the specified column order.
|
||||
//
|
||||
// Invariant: subset is never empty - this variant is only constructed when
|
||||
// there is at least one column to project.
|
||||
Project(Vec<String>),
|
||||
}
|
||||
|
||||
/// Specify the set of columns to project during a query.
|
||||
///
|
||||
/// Defaults to "all columns".
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct OwnedProjection(Projection);
|
||||
|
||||
impl From<Vec<String>> for OwnedProjection {
|
||||
fn from(value: Vec<String>) -> Self {
|
||||
if value.is_empty() {
|
||||
return Self(Projection::All);
|
||||
}
|
||||
|
||||
Self(Projection::Project(value))
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedProjection {
|
||||
/// Copy the data within a [`MutableBatch`] into a [`RecordBatch`], applying
|
||||
/// the specified projection.
|
||||
///
|
||||
/// This avoids copying column data for columns that are not part of the
|
||||
/// projection.
|
||||
///
|
||||
/// NOTE: this copies the underlying column data
|
||||
pub(crate) fn project_mutable_batches(&self, batch: &MutableBatch) -> RecordBatch {
|
||||
// Pre-allocate the outputs to their maximal possible size to avoid
|
||||
// reallocations.
|
||||
let max_capacity = match &self.0 {
|
||||
Projection::All => batch.columns().len(),
|
||||
Projection::Project(s) => s.len(),
|
||||
};
|
||||
|
||||
let mut schema_builder = SchemaBuilder::with_capacity(max_capacity);
|
||||
let mut column_data = Vec::with_capacity(max_capacity);
|
||||
|
||||
// Compute the schema overlap between the requested projection, and the
|
||||
// buffered data.
|
||||
//
|
||||
// Generate the RecordBatch contents in a single pass.
|
||||
match &self.0 {
|
||||
Projection::All => {
|
||||
// If there's no projection, the columns must be emitted ordered
|
||||
// by their name.
|
||||
let mut columns = batch.columns().collect::<Vec<_>>();
|
||||
columns.sort_unstable_by_key(|v| v.0);
|
||||
|
||||
for (name, column) in columns.into_iter() {
|
||||
schema_builder.influx_column(name, column.influx_type());
|
||||
column_data.push(column.to_arrow().expect("failed to snapshot buffer data"));
|
||||
}
|
||||
}
|
||||
|
||||
Projection::Project(cols) => {
|
||||
// Invariant: subset is never empty
|
||||
assert!(!cols.is_empty());
|
||||
|
||||
// Construct the schema & data arrays in a single pass, ordered
|
||||
// by the projection and ignoring any missing columns.
|
||||
for name in cols {
|
||||
if let Ok(column) = batch.column(name) {
|
||||
schema_builder.influx_column(name, column.influx_type());
|
||||
column_data
|
||||
.push(column.to_arrow().expect("failed to snapshot buffer data"));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let schema = schema_builder
|
||||
.build()
|
||||
.expect("failed to create batch schema");
|
||||
|
||||
RecordBatch::try_new(schema.into(), column_data)
|
||||
.expect("failed to generate snapshot record batch")
|
||||
}
|
||||
|
||||
/// Apply the specified projection to `batches`.
|
||||
///
|
||||
/// This projection requires relatively cheap ref-counting clones and does
|
||||
/// not copy the underlying data.
|
||||
pub(crate) fn project_record_batch(&self, batches: &[RecordBatch]) -> Vec<RecordBatch> {
|
||||
match &self.0 {
|
||||
Projection::All => batches.to_vec(),
|
||||
Projection::Project(columns) => {
|
||||
// Invariant: subset is never empty
|
||||
assert!(!columns.is_empty());
|
||||
|
||||
batches
|
||||
.iter()
|
||||
.map(|batch| {
|
||||
let schema = batch.schema();
|
||||
|
||||
// Map the column names to column indexes, ignoring
|
||||
// columns specified in the columns that do not exist
|
||||
// in this batch.
|
||||
let projection = columns
|
||||
.iter()
|
||||
.flat_map(|column_name| schema.index_of(column_name).ok())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
batch
|
||||
.project(&projection)
|
||||
.expect("batch projection failure")
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
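
A minimal sketch of the zero-copy `RecordBatch` projection path (the arrow schema and values are made up for illustration; projecting a column that does not exist in a batch is a no-op):

#[cfg(test)]
mod projection_sketch {
    use std::sync::Arc;

    use arrow::{
        array::{ArrayRef, Int64Array},
        datatypes::{DataType, Field, Schema},
        record_batch::RecordBatch,
    };

    use super::OwnedProjection;

    #[test]
    fn project_existing_and_missing_columns() {
        // Two columns of data; only "time" survives the projection below, and
        // the non-existent "missing" column is silently ignored.
        let batch = RecordBatch::try_new(
            Arc::new(Schema::new(vec![
                Field::new("time", DataType::Int64, false),
                Field::new("temp", DataType::Int64, false),
            ])),
            vec![
                Arc::new(Int64Array::from(vec![1_i64, 2])) as ArrayRef,
                Arc::new(Int64Array::from(vec![35_i64, 36])),
            ],
        )
        .expect("valid batch");

        let projection =
            OwnedProjection::from(vec!["time".to_string(), "missing".to_string()]);

        // Only Arc refcounts change here; no column data is copied.
        let projected = projection.project_record_batch(&[batch]);
        assert_eq!(projected.len(), 1);
        assert_eq!(projected[0].num_columns(), 1);
    }
}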
|
|
@ -67,6 +67,8 @@ use crate::query::{
|
|||
QueryError, QueryExec,
|
||||
};
|
||||
|
||||
use super::projection::OwnedProjection;
|
||||
|
||||
/// A [`QueryExec`] decorator adding instrumentation to the [`QueryResponse`]
|
||||
/// returned by the inner implementation.
|
||||
///
|
||||
|
@ -203,7 +205,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -213,7 +215,7 @@ where
|
|||
// metrics to be added?
|
||||
let stream = self
|
||||
.inner
|
||||
.query_exec(namespace_id, table_id, columns, span, predicate)
|
||||
.query_exec(namespace_id, table_id, projection, span, predicate)
|
||||
.await?;
|
||||
|
||||
let stream = QueryMetricContext::new(
|
||||
|
@ -474,7 +476,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
@ -561,7 +563,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
@ -647,7 +649,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
@ -733,7 +735,7 @@ mod tests {
|
|||
.query_exec(
|
||||
ARBITRARY_NAMESPACE_ID,
|
||||
ARBITRARY_TABLE_ID,
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
|
|
@ -5,7 +5,7 @@ use data_types::{NamespaceId, TableId};
|
|||
use predicate::Predicate;
|
||||
use trace::span::{Span, SpanRecorder};
|
||||
|
||||
use super::QueryExec;
|
||||
use super::{projection::OwnedProjection, QueryExec};
|
||||
use crate::query::QueryError;
|
||||
|
||||
/// A tracing decorator over a [`QueryExec`] implementation.
|
||||
|
@ -41,7 +41,7 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
|
@ -52,7 +52,7 @@ where
|
|||
.query_exec(
|
||||
namespace_id,
|
||||
table_id,
|
||||
columns,
|
||||
projection,
|
||||
recorder.span().cloned(),
|
||||
predicate,
|
||||
)
|
||||
|
@ -117,7 +117,7 @@ mod tests {
|
|||
.query_exec(
|
||||
NamespaceId::new(42),
|
||||
TableId::new(24),
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
Some(span.child("root span")),
|
||||
None,
|
||||
)
|
||||
|
@ -141,7 +141,7 @@ mod tests {
|
|||
.query_exec(
|
||||
NamespaceId::new(42),
|
||||
TableId::new(24),
|
||||
vec![],
|
||||
OwnedProjection::default(),
|
||||
Some(span.child("root span")),
|
||||
None,
|
||||
)
|
||||
|
|
|
@ -6,6 +6,8 @@ use predicate::Predicate;
|
|||
use thiserror::Error;
|
||||
use trace::span::Span;
|
||||
|
||||
use super::projection::OwnedProjection;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[allow(missing_copy_implementations)]
|
||||
pub(crate) enum QueryError {
|
||||
|
@ -24,7 +26,7 @@ pub(crate) trait QueryExec: Send + Sync + Debug {
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError>;
|
||||
|
@ -41,12 +43,12 @@ where
|
|||
&self,
|
||||
namespace_id: NamespaceId,
|
||||
table_id: TableId,
|
||||
columns: Vec<String>,
|
||||
projection: OwnedProjection,
|
||||
span: Option<Span>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Result<Self::Response, QueryError> {
|
||||
self.deref()
|
||||
.query_exec(namespace_id, table_id, columns, span, predicate)
|
||||
.query_exec(namespace_id, table_id, projection, span, predicate)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ pub struct QueryAdaptor {
|
|||
///
|
||||
/// This MUST be non-pub(crate) / closed for modification / immutable to support
|
||||
/// interning the merged schema in [`Self::schema()`].
|
||||
data: Vec<Arc<RecordBatch>>,
|
||||
data: Vec<RecordBatch>,
|
||||
|
||||
/// The catalog ID of the partition this data is part of.
|
||||
partition_id: PartitionId,
|
||||
|
@ -50,12 +50,12 @@ impl QueryAdaptor {
|
|||
///
|
||||
/// This constructor panics if `data` contains no [`RecordBatch`], or all
|
||||
/// [`RecordBatch`] are empty.
|
||||
pub(crate) fn new(partition_id: PartitionId, data: Vec<Arc<RecordBatch>>) -> Self {
|
||||
pub(crate) fn new(partition_id: PartitionId, data: Vec<RecordBatch>) -> Self {
|
||||
// There must always be at least one record batch and one row.
|
||||
//
|
||||
// This upholds an invariant that simplifies dealing with empty
|
||||
// partitions - if there is a QueryAdaptor, it contains data.
|
||||
assert!(data.iter().map(|b| b.num_rows()).sum::<usize>() > 0);
|
||||
assert!(data.iter().any(|b| b.num_rows() > 0));
|
||||
|
||||
let schema = merge_record_batch_schemas(&data);
|
||||
Self {
|
||||
|
@ -73,8 +73,7 @@ impl QueryAdaptor {
|
|||
// Project the column selection across all RecordBatch
|
||||
self.data
|
||||
.iter()
|
||||
.map(|data| {
|
||||
let batch = data.as_ref();
|
||||
.map(|batch| {
|
||||
let schema = batch.schema();
|
||||
|
||||
// Apply selection to in-memory batch
|
||||
|
@ -96,10 +95,16 @@ impl QueryAdaptor {
|
|||
}
|
||||
|
||||
/// Returns the [`RecordBatch`] instances in this [`QueryAdaptor`].
|
||||
pub(crate) fn record_batches(&self) -> &[Arc<RecordBatch>] {
|
||||
pub(crate) fn record_batches(&self) -> &[RecordBatch] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
|
||||
/// Unwrap this [`QueryAdaptor`], yielding the inner [`RecordBatch`]
|
||||
/// instances.
|
||||
pub(crate) fn into_record_batches(self) -> Vec<RecordBatch> {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the ID of the partition from which the data in this
|
||||
/// [`QueryAdaptor`] was sourced.
|
||||
pub(crate) fn partition_id(&self) -> PartitionId {
|
||||
|
@ -113,8 +118,7 @@ impl QueryAdaptor {
|
|||
|
||||
/// Time range, useful for building stats
|
||||
pub(crate) fn ts_min_max(&self) -> TimestampMinMax {
|
||||
compute_timenanosecond_min_max(self.data.iter().map(|b| b.as_ref()))
|
||||
.expect("Should have time range")
|
||||
compute_timenanosecond_min_max(self.data.iter()).expect("Should have time range")
|
||||
}
|
||||
}
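
A construction sketch tying the pieces together (assuming `PartitionId::new` and the same `lp_to_mutable_batch` helper used by the tests elsewhere in this diff):

#[test]
fn sketch_query_adaptor_round_trip() {
    use data_types::PartitionId;
    use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
    use schema::Projection;

    // Build one non-empty RecordBatch the same way the compaction tests do.
    let batch = lp_to_mutable_batch(r#"bananas,tag=platanos great=true 42"#)
        .1
        .to_arrow(Projection::All)
        .expect("failed to convert to arrow");

    // `new()` panics if every batch is empty; this single-row batch is fine.
    let adaptor = QueryAdaptor::new(PartitionId::new(1), vec![batch]);
    assert_eq!(adaptor.record_batches().len(), 1);

    // The new consuming accessor yields the owned batches without an Arc.
    let batches = adaptor.into_record_batches();
    assert_eq!(batches.len(), 1);
}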
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ use tokio::sync::{Semaphore, TryAcquireError};
|
|||
use tonic::{Request, Response, Streaming};
|
||||
use trace::{
|
||||
ctx::SpanContext,
|
||||
span::{Span, SpanExt},
|
||||
span::{Span, SpanExt, SpanRecorder},
|
||||
};
|
||||
|
||||
mod instrumentation;
|
||||
|
@ -27,7 +27,7 @@ use instrumentation::FlightFrameEncodeInstrumentation;
|
|||
|
||||
use crate::{
|
||||
ingester_id::IngesterId,
|
||||
query::{response::QueryResponse, QueryError, QueryExec},
|
||||
query::{projection::OwnedProjection, response::QueryResponse, QueryError, QueryExec},
|
||||
};
|
||||
|
||||
/// Error states for the query RPC handler.
|
||||
|
@ -175,7 +175,7 @@ where
|
|||
request: Request<Ticket>,
|
||||
) -> Result<Response<Self::DoGetStream>, tonic::Status> {
|
||||
let span_ctx: Option<SpanContext> = request.extensions().get().cloned();
|
||||
let span = span_ctx.child_span("ingester query");
|
||||
let mut query_recorder = SpanRecorder::new(span_ctx.child_span("ingester query"));
|
||||
|
||||
// Acquire and hold a permit for the duration of this request, or return
|
||||
// an error if the existing requests have already exhausted the
|
||||
|
@ -207,13 +207,15 @@ where
|
|||
None
|
||||
};
|
||||
|
||||
let projection = OwnedProjection::from(request.columns);
|
||||
|
||||
let response = match self
|
||||
.query_handler
|
||||
.query_exec(
|
||||
namespace_id,
|
||||
table_id,
|
||||
request.columns,
|
||||
span.clone(),
|
||||
projection,
|
||||
query_recorder.child_span("query exec"),
|
||||
predicate,
|
||||
)
|
||||
.await
|
||||
|
@ -221,10 +223,11 @@ where
|
|||
Ok(v) => v,
|
||||
Err(e @ (QueryError::TableNotFound(_, _) | QueryError::NamespaceNotFound(_))) => {
|
||||
debug!(
|
||||
error=%e,
|
||||
%namespace_id,
|
||||
%table_id,
|
||||
"query error, no buffered data found");
|
||||
error=%e,
|
||||
%namespace_id,
|
||||
%table_id,
|
||||
"no buffered data found for query"
|
||||
);
|
||||
|
||||
return Err(e)?;
|
||||
}
|
||||
|
@ -233,11 +236,12 @@ where
|
|||
let output = encode_response(
|
||||
response,
|
||||
self.ingester_id,
|
||||
span,
|
||||
query_recorder.child_span("serialise response"),
|
||||
Arc::clone(&self.query_request_frame_encoding_duration),
|
||||
)
|
||||
.map_err(tonic::Status::from);
|
||||
|
||||
query_recorder.ok("query exec complete - streaming results");
|
||||
Ok(Response::new(Box::pin(output) as Self::DoGetStream))
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,162 @@
use arrow_util::assert_batches_sorted_eq;
use data_types::PartitionKey;
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
use ingester_test_ctx::TestContextBuilder;
use metric::{DurationHistogram, U64Histogram};

// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;

// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;

// A subsequent write with a non-contiguous sequence number to a different table.
ctx.write_lp(
namespace_name,
"cpu bar=2 20\ncpu bar=3 30",
partition_key.clone(),
7,
)
.await;

// And a third write that appends more data to the table in the initial
// write.
ctx.write_lp(
namespace_name,
"bananas count=42 200",
partition_key.clone(),
42,
)
.await;

// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");

let expected = vec![
"+-------+-----------+--------------------------------+",
"| count | greatness | time |",
"+-------+-----------+--------------------------------+",
"| | unbounded | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);

// Assert various ingest metrics.
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_dml_sink_apply_duration",
&[("handler", "write_apply"), ("result", "success")],
)
.fetch();
assert_eq!(hist.sample_count(), 3);

// Read metrics
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_query_stream_duration",
&[("request", "complete")],
)
.fetch();
assert_eq!(hist.sample_count(), 1);

let hist = ctx
.get_metric::<U64Histogram, _>("ingester_query_result_row", &[])
.fetch();
assert_eq!(hist.sample_count(), 1);
assert_eq!(hist.total, 2);
}

// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query_projection() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;

// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\",level=42 10",
partition_key.clone(),
0,
)
.await;

// Another write that appends more data to the table in the initial write.
ctx.write_lp(
namespace_name,
"bananas count=42,level=4242 200",
partition_key.clone(),
42,
)
.await;

// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");

let expected = vec![
"+-------+-----------+--------+--------------------------------+",
"| count | greatness | level | time |",
"+-------+-----------+--------+--------------------------------+",
"| | unbounded | 42.0 | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 4242.0 | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);

// And perform a query with projection, selecting a column that is entirely
// non-NULL, a column containing NULLs (in a different order to the above)
// and a column that does not exist.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![
"level".to_string(),
"greatness".to_string(),
"platanos".to_string(),
],
predicate: None,
})
.await
.expect("query request failed");

let expected = vec![
"+--------+-----------+",
"| level | greatness |",
"+--------+-----------+",
"| 42.0 | unbounded |",
"| 4242.0 | |",
"+--------+-----------+",
];
assert_batches_sorted_eq!(&expected, &data);
}
@ -12,88 +12,6 @@ use parquet_file::ParquetFilePath;
use std::{ffi::OsString, fs::read_dir, path::Path, sync::Arc, time::Duration};
use test_helpers::timeout::FutureTimeout;

// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;

// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;

// A subsequent write with a non-contiguous sequence number to a different table.
ctx.write_lp(
namespace_name,
"cpu bar=2 20\ncpu bar=3 30",
partition_key.clone(),
7,
)
.await;

// And a third write that appends more data to the table in the initial
// write.
ctx.write_lp(
namespace_name,
"bananas count=42 200",
partition_key.clone(),
42,
)
.await;

// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");

let expected = vec![
"+-------+-----------+--------------------------------+",
"| count | greatness | time |",
"+-------+-----------+--------------------------------+",
"| | unbounded | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);

// Assert various ingest metrics.
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_dml_sink_apply_duration",
&[("handler", "write_apply"), ("result", "success")],
)
.fetch();
assert_eq!(hist.sample_count(), 3);

// Read metrics
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_query_stream_duration",
&[("request", "complete")],
)
.fetch();
assert_eq!(hist.sample_count(), 1);

let hist = ctx
.get_metric::<U64Histogram, _>("ingester_query_result_row", &[])
.fetch();
assert_eq!(hist.sample_count(), 1);
assert_eq!(hist.total, 2);
}

// Write data to an ingester through the RPC interface and persist the data.
#[tokio::test]
async fn write_persist() {
@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true

[dependencies] # In alphabetical order
async-trait = "0.1.70"
async-trait = "0.1.71"
data_types = { path = "../data_types" }
futures = "0.3"
iox_time = { version = "0.1.0", path = "../iox_time" }

@ -20,7 +20,7 @@ siphasher = "0.3"
snafu = "0.7"
sqlx = { version = "0.6", features = [ "runtime-tokio-rustls" , "postgres", "uuid", "sqlite" ] }
sqlx-hotswap-pool = { path = "../sqlx-hotswap-pool" }
thiserror = "1.0.41"
thiserror = "1.0.43"
tokio = { version = "1.29", features = ["io-util", "macros", "parking_lot", "rt-multi-thread", "time"] }
uuid = { version = "1", features = ["v4"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

@ -31,7 +31,7 @@ dotenvy = "0.15.7"
generated_types = { path = "../generated_types" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.13"
pretty_assertions = "1.3.0"
pretty_assertions = "1.4.0"
rand = "0.8"
tempfile = "3"
test_helpers = { path = "../test_helpers" }
@ -0,0 +1,13 @@
-- Add to help the compactor when it searches for partitions with files created recently.

-- By default we often only have 5min to finish our statements. The `CREATE INDEX CONCURRENTLY` however takes longer.
-- In our prod test this took about 15min, but better be safe than sorry.
-- IOX_NO_TRANSACTION
SET statement_timeout TO '60min';

-- IOX_STEP_BOUNDARY

-- While `CONCURRENTLY` means it runs parallel to other writes, this command will only finish after the index was
-- successfully built.
-- IOX_NO_TRANSACTION
CREATE INDEX CONCURRENTLY IF NOT EXISTS partition_new_file_at_idx ON partition (new_file_at);
@ -0,0 +1,11 @@
-- By default, we often only have 5min to finish our statements. The `CREATE INDEX CONCURRENTLY`,
-- however, can take longer.
-- IOX_NO_TRANSACTION
SET statement_timeout TO '60min';

-- IOX_STEP_BOUNDARY

-- IOX_NO_TRANSACTION
CREATE INDEX CONCURRENTLY IF NOT EXISTS parquet_file_partition_hash_id_idx
ON parquet_file (partition_hash_id)
WHERE partition_hash_id IS NOT NULL;
@ -0,0 +1,3 @@
CREATE INDEX IF NOT EXISTS parquet_file_partition_hash_id_idx
ON parquet_file (partition_hash_id)
WHERE partition_hash_id IS NOT NULL;
@ -6,7 +6,7 @@ use data_types::{
Column, ColumnType, ColumnsByName, CompactionLevel, Namespace, NamespaceId, NamespaceName,
NamespaceSchema, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId,
ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction,
Table, TableId, TableSchema, Timestamp,
Table, TableId, TableSchema, Timestamp, TransitionPartitionId,
};
use iox_time::TimeProvider;
use snafu::{OptionExt, Snafu};

@ -80,7 +80,7 @@ pub enum Error {
TableNotFound { id: TableId },

#[snafu(display("partition {} not found", id))]
PartitionNotFound { id: PartitionId },
PartitionNotFound { id: TransitionPartitionId },

#[snafu(display(
"couldn't create column {} in table {}; limit reached on namespace",

@ -397,7 +397,7 @@ pub trait PartitionRepo: Send + Sync {
/// concurrent writers.
async fn cas_sort_key(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
old_sort_key: Option<Vec<String>>,
new_sort_key: &[&str],
) -> Result<Partition, CasFailure<Vec<String>>>;

@ -483,7 +483,7 @@ pub trait ParquetFileRepo: Send + Sync {
/// [`to_delete`](ParquetFile::to_delete).
async fn list_by_partition_not_to_delete(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
) -> Result<Vec<ParquetFile>>;

/// Return the parquet file with the given object store id

@ -1549,7 +1549,11 @@ pub(crate) mod test_helpers {
// test update_sort_key from None to Some
repos
.partitions()
.cas_sort_key(other_partition.id, None, &["tag2", "tag1", "time"])
.cas_sort_key(
&other_partition.transition_partition_id(),
None,
&["tag2", "tag1", "time"],
)
.await
.unwrap();

@ -1557,7 +1561,7 @@ pub(crate) mod test_helpers {
let err = repos
.partitions()
.cas_sort_key(
other_partition.id,
&other_partition.transition_partition_id(),
Some(["bananas".to_string()].to_vec()),
&["tag2", "tag1", "tag3 , with comma", "time"],
)

@ -1593,7 +1597,7 @@ pub(crate) mod test_helpers {
let err = repos
.partitions()
.cas_sort_key(
other_partition.id,
&other_partition.transition_partition_id(),
None,
&["tag2", "tag1", "tag3 , with comma", "time"],
)

@ -1607,7 +1611,7 @@ pub(crate) mod test_helpers {
let err = repos
.partitions()
.cas_sort_key(
other_partition.id,
&other_partition.transition_partition_id(),
Some(["bananas".to_string()].to_vec()),
&["tag2", "tag1", "tag3 , with comma", "time"],
)

@ -1621,7 +1625,7 @@ pub(crate) mod test_helpers {
repos
.partitions()
.cas_sort_key(
other_partition.id,
&other_partition.transition_partition_id(),
Some(
["tag2", "tag1", "time"]
.into_iter()

@ -2676,6 +2680,7 @@ pub(crate) mod test_helpers {

let other_partition_params = ParquetFileParams {
partition_id: partition2.id,
partition_hash_id: partition2.hash_id().cloned(),
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};

@ -2687,14 +2692,16 @@ pub(crate) mod test_helpers {

let files = repos
.parquet_files()
.list_by_partition_not_to_delete(partition.id)
.list_by_partition_not_to_delete(&partition.transition_partition_id())
.await
.unwrap();
// not asserting against a vector literal to guard against flakiness due to uncertain
// ordering of SQL query in postgres impl
assert_eq!(files.len(), 2);
assert_matches!(files.iter().find(|f| f.id == parquet_file.id), Some(_));
assert_matches!(files.iter().find(|f| f.id == level1_file.id), Some(_));

let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
file_ids.sort();
let mut expected_ids = vec![parquet_file.id, level1_file.id];
expected_ids.sort();
assert_eq!(file_ids, expected_ids);

// remove namespace to avoid it from affecting later tests
repos
@ -22,7 +22,7 @@ use workspace_hack as _;
use crate::interface::{ColumnTypeMismatchSnafu, Error, RepoCollection, Result};
use data_types::{
partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride},
ColumnType, NamespaceId, NamespaceSchema, TableSchema,
ColumnType, NamespaceId, NamespaceSchema, Partition, TableSchema, TransitionPartitionId,
};
use mutable_batch::MutableBatch;
use std::{borrow::Cow, collections::HashMap};

@ -67,6 +67,27 @@ impl TableScopedError {
}
}

/// Look up a partition in the catalog by either database-assigned ID or deterministic hash ID.
///
/// The existence of this function should be temporary; it can be removed once all partition lookup
/// is happening with only the deterministic hash ID.
pub async fn partition_lookup<R>(
repos: &mut R,
id: &TransitionPartitionId,
) -> Result<Option<Partition>, Error>
where
R: RepoCollection + ?Sized,
{
match id {
TransitionPartitionId::Deprecated(partition_id) => {
repos.partitions().get_by_id(*partition_id).await
}
TransitionPartitionId::Deterministic(partition_hash_id) => {
repos.partitions().get_by_hash_id(partition_hash_id).await
}
}
}

/// Given an iterator of `(table_name, batch)` to validate, this function
/// ensures all the columns within `batch` match the existing schema for
/// `table_name` in `schema`. If the column does not already exist in `schema`,
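The new `partition_lookup` helper above dispatches on the two `TransitionPartitionId` variants: the database-assigned ID for old partitions and the deterministic hash ID for new ones. Below is a self-contained sketch of the same dispatch using simplified stand-in types (plain `i64`/`String` in place of `PartitionId`/`PartitionHashId`, and a slice in place of the async repository); it is intended only to illustrate the lookup logic, not the catalog API.

```rust
// Sketch only: simplified stand-ins for the catalog types referenced above.
#[derive(Debug, Clone, PartialEq, Eq)]
enum TransitionPartitionId {
    Deprecated(i64),       // database-assigned ID
    Deterministic(String), // deterministic hash ID
}

#[derive(Debug, Clone)]
struct Partition {
    id: i64,
    hash_id: Option<String>,
}

// A toy "repository": a slice of partitions searched linearly, mirroring the
// dispatch in `partition_lookup`.
fn partition_lookup<'a>(
    partitions: &'a [Partition],
    id: &TransitionPartitionId,
) -> Option<&'a Partition> {
    partitions.iter().find(|p| match id {
        TransitionPartitionId::Deprecated(want) => p.id == *want,
        TransitionPartitionId::Deterministic(want) => {
            p.hash_id.as_deref() == Some(want.as_str())
        }
    })
}

fn main() {
    let partitions = vec![
        Partition { id: 1, hash_id: None },
        Partition { id: 2, hash_id: Some("abc123".into()) },
    ];
    assert!(partition_lookup(&partitions, &TransitionPartitionId::Deprecated(1)).is_some());
    assert!(
        partition_lookup(&partitions, &TransitionPartitionId::Deterministic("abc123".into()))
            .is_some()
    );
}
```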
@ -19,7 +19,7 @@ use data_types::{
Column, ColumnId, ColumnType, CompactionLevel, Namespace, NamespaceId, NamespaceName,
NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, ParquetFileParams,
Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, Table, TableId,
Timestamp,
Timestamp, TransitionPartitionId,
};
use iox_time::{SystemProvider, TimeProvider};
use snafu::ensure;

@ -625,20 +625,26 @@ impl PartitionRepo for MemTxn {

async fn cas_sort_key(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
old_sort_key: Option<Vec<String>>,
new_sort_key: &[&str],
) -> Result<Partition, CasFailure<Vec<String>>> {
let stage = self.stage();
let old_sort_key = old_sort_key.unwrap_or_default();
match stage.partitions.iter_mut().find(|p| p.id == partition_id) {

match stage.partitions.iter_mut().find(|p| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => {
p.hash_id().map_or(false, |h| h == hash_id)
}
TransitionPartitionId::Deprecated(id) => p.id == *id,
}) {
Some(p) if p.sort_key == old_sort_key => {
p.sort_key = new_sort_key.iter().map(|s| s.to_string()).collect();
Ok(p.clone())
}
Some(p) => return Err(CasFailure::ValueMismatch(p.sort_key.clone())),
None => Err(CasFailure::QueryError(Error::PartitionNotFound {
id: partition_id,
id: partition_id.clone(),
})),
}
}

@ -844,14 +850,20 @@ impl ParquetFileRepo for MemTxn {

async fn list_by_partition_not_to_delete(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
) -> Result<Vec<ParquetFile>> {
let stage = self.stage();

Ok(stage
.parquet_files
.iter()
.filter(|f| f.partition_id == partition_id && f.to_delete.is_none())
.filter(|f| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => {
f.partition_hash_id.as_ref().map_or(false, |h| h == hash_id)
}
TransitionPartitionId::Deprecated(id) => f.partition_id == *id,
})
.filter(|f| f.to_delete.is_none())
.cloned()
.collect())
}

@ -962,7 +974,9 @@ async fn create_parquet_file(
.partitions
.iter_mut()
.find(|p| p.id == partition_id)
.ok_or(Error::PartitionNotFound { id: partition_id })?;
.ok_or(Error::PartitionNotFound {
id: TransitionPartitionId::Deprecated(partition_id),
})?;
partition.new_file_at = Some(created_at);
}
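In the in-memory catalog above, `list_by_partition_not_to_delete` now filters parquet files by whichever identifier the caller supplied. A small sketch of that dual-key filter, using simplified stand-in types rather than the real `ParquetFile` and `TransitionPartitionId`, is shown below.

```rust
// Sketch only: simplified stand-ins for the catalog types used above.
#[derive(Debug, Clone, PartialEq)]
enum TransitionPartitionId {
    Deprecated(i64),
    Deterministic(String),
}

#[derive(Debug, Clone)]
struct ParquetFile {
    partition_id: i64,
    partition_hash_id: Option<String>,
    to_delete: Option<i64>, // tombstone timestamp; None = live file
}

// Mirrors the mem.rs filter: keep live files belonging to the requested partition,
// matching on the hash ID when given one and on the legacy ID otherwise.
fn list_by_partition_not_to_delete(
    files: &[ParquetFile],
    partition_id: &TransitionPartitionId,
) -> Vec<ParquetFile> {
    files
        .iter()
        .filter(|f| match partition_id {
            TransitionPartitionId::Deterministic(hash_id) => {
                f.partition_hash_id.as_ref().map_or(false, |h| h == hash_id)
            }
            TransitionPartitionId::Deprecated(id) => f.partition_id == *id,
        })
        .filter(|f| f.to_delete.is_none())
        .cloned()
        .collect()
}

fn main() {
    let files = vec![
        ParquetFile { partition_id: 1, partition_hash_id: None, to_delete: None },
        ParquetFile { partition_id: 1, partition_hash_id: None, to_delete: Some(99) },
        ParquetFile { partition_id: 2, partition_hash_id: Some("abc".into()), to_delete: None },
    ];
    let by_id = list_by_partition_not_to_delete(&files, &TransitionPartitionId::Deprecated(1));
    assert_eq!(by_id.len(), 1);
    let by_hash =
        list_by_partition_not_to_delete(&files, &TransitionPartitionId::Deterministic("abc".into()));
    assert_eq!(by_hash.len(), 1);
}
```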
@ -10,7 +10,7 @@ use data_types::{
Column, ColumnType, CompactionLevel, Namespace, NamespaceId, NamespaceName,
NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, ParquetFileParams,
Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, Table, TableId,
Timestamp,
Timestamp, TransitionPartitionId,
};
use iox_time::{SystemProvider, TimeProvider};
use metric::{DurationHistogram, Metric};

@ -174,7 +174,7 @@ decorate!(
"partition_get_by_hash_id" = get_by_hash_id(&mut self, partition_hash_id: &PartitionHashId) -> Result<Option<Partition>>;
"partition_list_by_table_id" = list_by_table_id(&mut self, table_id: TableId) -> Result<Vec<Partition>>;
"partition_list_ids" = list_ids(&mut self) -> Result<Vec<PartitionId>>;
"partition_update_sort_key" = cas_sort_key(&mut self, partition_id: PartitionId, old_sort_key: Option<Vec<String>>, new_sort_key: &[&str]) -> Result<Partition, CasFailure<Vec<String>>>;
"partition_update_sort_key" = cas_sort_key(&mut self, partition_id: &TransitionPartitionId, old_sort_key: Option<Vec<String>>, new_sort_key: &[&str]) -> Result<Partition, CasFailure<Vec<String>>>;
"partition_record_skipped_compaction" = record_skipped_compaction(&mut self, partition_id: PartitionId, reason: &str, num_files: usize, limit_num_files: usize, limit_num_files_first_in_partition: usize, estimated_bytes: u64, limit_bytes: u64) -> Result<()>;
"partition_list_skipped_compactions" = list_skipped_compactions(&mut self) -> Result<Vec<SkippedCompaction>>;
"partition_delete_skipped_compactions" = delete_skipped_compactions(&mut self, partition_id: PartitionId) -> Result<Option<SkippedCompaction>>;

@ -193,7 +193,7 @@ decorate!(
"parquet_list_by_namespace_not_to_delete" = list_by_namespace_not_to_delete(&mut self, namespace_id: NamespaceId) -> Result<Vec<ParquetFile>>;
"parquet_list_by_table_not_to_delete" = list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result<Vec<ParquetFile>>;
"parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result<Vec<ParquetFileId>>;
"parquet_list_by_partition_not_to_delete" = list_by_partition_not_to_delete(&mut self, partition_id: PartitionId) -> Result<Vec<ParquetFile>>;
"parquet_list_by_partition_not_to_delete" = list_by_partition_not_to_delete(&mut self, partition_id: &TransitionPartitionId) -> Result<Vec<ParquetFile>>;
"parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: Uuid) -> Result<Option<ParquetFile>>;
"parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec<Uuid>) -> Result<Vec<Uuid>>;
"parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, delete: &[ParquetFileId], upgrade: &[ParquetFileId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result<Vec<ParquetFileId>>;
@ -23,7 +23,7 @@ use data_types::{
Column, ColumnType, CompactionLevel, Namespace, NamespaceId, NamespaceName,
NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, ParquetFileParams,
Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, Table, TableId,
Timestamp,
Timestamp, TransitionPartitionId,
};
use iox_time::{SystemProvider, TimeProvider};
use observability_deps::tracing::{debug, info, warn};

@ -1153,24 +1153,38 @@ WHERE table_id = $1;
/// round trips to service a transaction in the happy path).
async fn cas_sort_key(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
old_sort_key: Option<Vec<String>>,
new_sort_key: &[&str],
) -> Result<Partition, CasFailure<Vec<String>>> {
let old_sort_key = old_sort_key.unwrap_or_default();
let res = sqlx::query_as::<_, Partition>(
r#"
// This `match` will go away when all partitions have hash IDs in the database.
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, Partition>(
r#"
UPDATE partition
SET sort_key = $1
WHERE hash_id = $2 AND sort_key = $3
RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
"#,
)
.bind(new_sort_key) // $1
.bind(hash_id) // $2
.bind(&old_sort_key), // $3
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, Partition>(
r#"
UPDATE partition
SET sort_key = $1
WHERE id = $2 AND sort_key = $3
RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
"#,
)
.bind(new_sort_key) // $1
.bind(partition_id) // $2
.bind(&old_sort_key) // $3
.fetch_one(&mut self.inner)
.await;
)
.bind(new_sort_key) // $1
.bind(id) // $2
.bind(&old_sort_key), // $3
};

let res = query.fetch_one(&mut self.inner).await;

let partition = match res {
Ok(v) => v,

@ -1187,11 +1201,11 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
// NOTE: this is racy, but documented - this might return "Sort
// key differs! Old key: <old sort key you provided>"
return Err(CasFailure::ValueMismatch(
PartitionRepo::get_by_id(self, partition_id)
crate::partition_lookup(self, partition_id)
.await
.map_err(CasFailure::QueryError)?
.ok_or(CasFailure::QueryError(Error::PartitionNotFound {
id: partition_id,
id: partition_id.clone(),
}))?
.sort_key,
));

@ -1458,10 +1472,23 @@ RETURNING id;

async fn list_by_partition_not_to_delete(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
) -> Result<Vec<ParquetFile>> {
sqlx::query_as::<_, ParquetFile>(
r#"
// This `match` will go away when all partitions have hash IDs in the database.
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at

@ -1469,11 +1496,14 @@ FROM parquet_file
WHERE parquet_file.partition_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(partition_id) // $1
.fetch_all(&mut self.inner)
.await
.map_err(|e| Error::SqlxError { source: e })
)
.bind(id), // $1
};

query
.fetch_all(&mut self.inner)
.await
.map_err(|e| Error::SqlxError { source: e })
}

async fn get_by_object_store_id(
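In both the Postgres implementation above and the SQLite implementation below, the change selects one of two prepared statements up front, filtering on `hash_id` or on the legacy `id`, and then runs whichever was chosen through a single fetch path. The sketch below shows that shape with plain strings instead of `sqlx`, so it stays self-contained; the table and column names mirror the diff, while `PreparedQuery` and `build_list_files_query` are illustrative assumptions.

```rust
// Illustrative only: pick the WHERE clause based on the partition ID variant,
// then execute the chosen statement through one shared code path.
enum TransitionPartitionId {
    Deprecated(i64),
    Deterministic(String),
}

// Stand-in for "a prepared query with its bound parameter".
struct PreparedQuery {
    sql: &'static str,
    bound: String,
}

fn build_list_files_query(partition_id: &TransitionPartitionId) -> PreparedQuery {
    match partition_id {
        TransitionPartitionId::Deterministic(hash_id) => PreparedQuery {
            sql: "SELECT ... FROM parquet_file \
                  WHERE parquet_file.partition_hash_id = $1 AND parquet_file.to_delete IS NULL;",
            bound: hash_id.clone(),
        },
        TransitionPartitionId::Deprecated(id) => PreparedQuery {
            sql: "SELECT ... FROM parquet_file \
                  WHERE parquet_file.partition_id = $1 AND parquet_file.to_delete IS NULL;",
            bound: id.to_string(),
        },
    }
}

fn main() {
    // Whichever variant arrives, the caller "fetches" through the same path afterwards.
    let q = build_list_files_query(&TransitionPartitionId::Deprecated(42));
    println!("running: {} with $1 = {}", q.sql, q.bound);
}
```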
@ -21,7 +21,7 @@ use data_types::{
Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, Namespace, NamespaceId,
NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId,
ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction,
Table, TableId, Timestamp,
Table, TableId, Timestamp, TransitionPartitionId,
};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;

@ -952,24 +952,39 @@ WHERE table_id = $1;
/// round trips to service a transaction in the happy path).
async fn cas_sort_key(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
old_sort_key: Option<Vec<String>>,
new_sort_key: &[&str],
) -> Result<Partition, CasFailure<Vec<String>>> {
let old_sort_key = old_sort_key.unwrap_or_default();
let res = sqlx::query_as::<_, PartitionPod>(
r#"

// This `match` will go away when all partitions have hash IDs in the database.
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, PartitionPod>(
r#"
UPDATE partition
SET sort_key = $1
WHERE hash_id = $2 AND sort_key = $3
RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
"#,
)
.bind(Json(new_sort_key)) // $1
.bind(hash_id) // $2
.bind(Json(&old_sort_key)), // $3
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, PartitionPod>(
r#"
UPDATE partition
SET sort_key = $1
WHERE id = $2 AND sort_key = $3
RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
"#,
)
.bind(Json(new_sort_key)) // $1
.bind(partition_id) // $2
.bind(Json(&old_sort_key)) // $3
.fetch_one(self.inner.get_mut())
.await;
)
.bind(Json(new_sort_key)) // $1
.bind(id) // $2
.bind(Json(&old_sort_key)), // $3
};

let res = query.fetch_one(self.inner.get_mut()).await;

let partition = match res {
Ok(v) => v,

@ -986,11 +1001,11 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
// NOTE: this is racy, but documented - this might return "Sort
// key differs! Old key: <old sort key you provided>"
return Err(CasFailure::ValueMismatch(
PartitionRepo::get_by_id(self, partition_id)
crate::partition_lookup(self, partition_id)
.await
.map_err(CasFailure::QueryError)?
.ok_or(CasFailure::QueryError(Error::PartitionNotFound {
id: partition_id,
id: partition_id.clone(),
}))?
.sort_key,
));

@ -1323,10 +1338,23 @@ RETURNING id;

async fn list_by_partition_not_to_delete(
&mut self,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
) -> Result<Vec<ParquetFile>> {
Ok(sqlx::query_as::<_, ParquetFilePod>(
r#"
// This `match` will go away when all partitions have hash IDs in the database.
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at

@ -1334,14 +1362,17 @@ FROM parquet_file
WHERE parquet_file.partition_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(partition_id) // $1
.fetch_all(self.inner.get_mut())
.await
.map_err(|e| Error::SqlxError { source: e })?
.into_iter()
.map(Into::into)
.collect())
)
.bind(id), // $1
};

Ok(query
.fetch_all(self.inner.get_mut())
.await
.map_err(|e| Error::SqlxError { source: e })?
.into_iter()
.map(Into::into)
.collect())
}

async fn get_by_object_store_id(