Merge branch 'main' into dependabot/cargo/clap-4.0.2

pull/24376/head
kodiakhq[bot] 2022-10-12 14:01:28 +00:00 committed by GitHub
commit 266b8f2a58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
129 changed files with 4605 additions and 5043 deletions

84
Cargo.lock generated
View File

@ -1050,7 +1050,7 @@ dependencies = [
"influxdb_line_protocol",
"iox_time",
"observability_deps",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"percent-encoding",
"schema",
"serde",
@ -1094,7 +1094,7 @@ dependencies = [
"log",
"num_cpus",
"object_store",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"parking_lot 0.12.1",
"parquet",
"paste",
@ -1116,7 +1116,7 @@ source = "git+https://github.com/apache/arrow-datafusion.git?rev=c7f3a70a79ee840
dependencies = [
"arrow",
"object_store",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"parquet",
"sqlparser 0.23.0",
]
@ -1163,7 +1163,7 @@ dependencies = [
"hashbrown",
"lazy_static",
"md-5",
"ordered-float 3.1.0",
"ordered-float 3.2.0",
"paste",
"rand",
"regex",
@ -1741,9 +1741,9 @@ dependencies = [
[[package]]
name = "handlebars"
version = "4.3.4"
version = "4.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56b224eaa4987c03c30b251de7ef0c15a6a59f34222905850dbc3026dfb24d5f"
checksum = "433e4ab33f1213cdc25b5fa45c76881240cfe79284cf2b395e8b9e312a30a2fd"
dependencies = [
"log",
"pest",
@ -2061,7 +2061,9 @@ dependencies = [
"data_types",
"datafusion 0.1.0",
"dotenvy",
"flate2",
"futures",
"futures-util",
"generated_types",
"hashbrown",
"http",
@ -2126,12 +2128,13 @@ dependencies = [
"client_util",
"futures-util",
"generated_types",
"mockito",
"influxdb_line_protocol",
"prost 0.11.0",
"rand",
"reqwest",
"thiserror",
"tokio",
"tokio-stream",
"tonic",
]
@ -2182,7 +2185,7 @@ version = "0.1.0"
dependencies = [
"generated_types",
"snafu",
"sqlparser 0.24.0",
"sqlparser 0.25.0",
"workspace-hack",
]
@ -2222,6 +2225,7 @@ dependencies = [
"pin-project",
"predicate",
"prost 0.11.0",
"rand",
"schema",
"snafu",
"test_helpers",
@ -2681,9 +2685,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.134"
version = "0.2.135"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c"
[[package]]
name = "libloading"
@ -3130,9 +3134,9 @@ dependencies = [
[[package]]
name = "object_store"
version = "0.5.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2168fee79ee3e7695905bc3a48777d807f82d956f821186fa7a2601c1295a73e"
checksum = "56ce10a205d9f610ae3532943039c34c145930065ce0c4284134c897fe6073b1"
dependencies = [
"async-trait",
"base64",
@ -3142,7 +3146,7 @@ dependencies = [
"itertools",
"parking_lot 0.12.1",
"percent-encoding",
"quick-xml 0.24.1",
"quick-xml 0.25.0",
"rand",
"reqwest",
"ring",
@ -3207,9 +3211,9 @@ dependencies = [
[[package]]
name = "ordered-float"
version = "3.1.0"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a"
checksum = "129d36517b53c461acc6e1580aeb919c8ae6708a4b1eae61c4463a615d4f0411"
dependencies = [
"num-traits",
]
@ -3581,7 +3585,7 @@ dependencies = [
"schema",
"serde_json",
"snafu",
"sqlparser 0.24.0",
"sqlparser 0.25.0",
"test_helpers",
"workspace-hack",
]
@ -3670,9 +3674,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro2"
version = "1.0.43"
version = "1.0.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b"
dependencies = [
"unicode-ident",
]
@ -3942,9 +3946,9 @@ dependencies = [
[[package]]
name = "quick-xml"
version = "0.24.1"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37dddbbe9df96afafcb8027fcf263971b726530e12f0787f620a7ba5b4846081"
checksum = "58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9"
dependencies = [
"memchr",
"serde",
@ -4412,9 +4416,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.85"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074"
dependencies = [
"itoa 1.0.3",
"ryu",
@ -4669,15 +4673,15 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.9.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
[[package]]
name = "snafu"
version = "0.7.1"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2"
checksum = "dd726aec4ebad65756394ff89a9b9598793d4e30121cd71690244c1e497b3aee"
dependencies = [
"doc-comment",
"snafu-derive",
@ -4685,9 +4689,9 @@ dependencies = [
[[package]]
name = "snafu-derive"
version = "0.7.1"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5"
checksum = "712529e9b0b014eabaa345b38e06032767e3dc393e8b017e853b1d7247094e74"
dependencies = [
"heck",
"proc-macro2",
@ -4748,9 +4752,9 @@ dependencies = [
[[package]]
name = "sqlparser"
version = "0.24.0"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dac9c312566fdfc45a38ecf1924013c82af2a7d5315e46f67b1cc987f12be260"
checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd"
dependencies = [
"log",
]
@ -4953,9 +4957,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.101"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1"
dependencies = [
"proc-macro2",
"quote",
@ -5228,9 +5232,9 @@ dependencies = [
[[package]]
name = "tokio-stream"
version = "0.1.10"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af"
checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce"
dependencies = [
"futures-core",
"pin-project-lite",
@ -5434,9 +5438,9 @@ dependencies = [
[[package]]
name = "tracing"
version = "0.1.36"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"log",
@ -5447,9 +5451,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.22"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2"
checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a"
dependencies = [
"proc-macro2",
"quote",
@ -5458,9 +5462,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.29"
version = "0.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7"
checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a"
dependencies = [
"once_cell",
"valuable",

View File

@ -11,10 +11,10 @@ humantime = "2.1.0"
iox_catalog = { path = "../iox_catalog" }
iox_time = { path = "../iox_time" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
tempfile = "3.1.0"
trace = { path = "../trace" }

View File

@ -14,7 +14,7 @@ datafusion = { path = "../datafusion" }
futures = "0.3"
iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parquet_file = { path = "../parquet_file" }
predicate = { path = "../predicate" }

View File

@ -45,7 +45,7 @@ pub async fn compact(compactor: Arc<Compactor>, do_full_compact: bool) -> usize
compaction_type,
CompactionLevel::Initial,
compact_in_parallel,
false, // no split
true, // split
candidates.clone().into(),
)
.await;
@ -57,7 +57,7 @@ pub async fn compact(compactor: Arc<Compactor>, do_full_compact: bool) -> usize
compaction_type,
CompactionLevel::FileNonOverlapped,
compact_in_parallel,
false, // don't split
true, // split
candidates.into(),
)
.await;
@ -812,24 +812,42 @@ mod tests {
compact(compactor, true).await;
// Should have 1 non-soft-deleted file:
// Should have 2 non-soft-deleted file:
//
// - the level 2 file created after combining all 3 level 1 files created by the first step
// - the 2 level-2 files created after combining all 3 level 1 files created by the first step
// of compaction to compact remaining level 0 files
let mut files = catalog.list_by_table_not_to_delete(table.table.id).await;
assert_eq!(files.len(), 1, "{files:?}");
assert_eq!(files.len(), 2, "{files:?}");
let files_and_levels: Vec<_> = files
.iter()
.map(|f| (f.id.get(), f.compaction_level))
.collect();
// The initial files are: L0 1-4, L1 5-6. The first step of cold compaction took files 1-5
// and compacted them into a l-1 file 7. The second step of cold compaction
// took 6 and 7 and combined them all into file 8.
assert_eq!(files_and_levels, vec![(8, CompactionLevel::Final)]);
// and compacted them into two l-1 files 7, 8. The second step of cold compaction
// took 6, 7, and 8 and combined them all into two files 9 and 10.
assert_eq!(
files_and_levels,
vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)]
);
// ------------------------------------------------
// Verify the parquet file content
// first file:
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+-----------------------------+",
"| field_int | tag1 | tag2 | tag3 | time |",
"+-----------+------+------+------+-----------------------------+",
"| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |",
"| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |",
"+-----------+------+------+------+-----------------------------+",
],
&batches
);
// second file
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
@ -847,9 +865,7 @@ mod tests {
"| 20 | | VT | 20 | 1970-01-01T00:00:00.000026Z |",
"| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |",
"| 70 | UT | | | 1970-01-01T00:00:00.000020Z |",
"| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |",
"+-----------+------+------+------+--------------------------------+",
],
&batches
@ -1027,14 +1043,14 @@ mod tests {
compact(compactor, true).await;
// Should have 3 non-soft-deleted files:
// Should have 4 non-soft-deleted files:
//
// - pf4, the level 1 file untouched because it didn't fit in the memory budget
// - pf6, the level 2 file untouched because it doesn't overlap anything
// - the level 2 file created after combining all 3 level 1 files created by the first step
// - two level-2 files created after combining all 3 level 1 files created by the first step
// of compaction to compact remaining level 0 files
let mut files = catalog.list_by_table_not_to_delete(table.table.id).await;
assert_eq!(files.len(), 3, "{files:?}");
assert_eq!(files.len(), 4, "{files:?}");
let files_and_levels: Vec<_> = files
.iter()
.map(|f| (f.id.get(), f.compaction_level))
@ -1042,20 +1058,35 @@ mod tests {
// File 4 was L1 but didn't fit in the memory budget, so was untouched.
// File 6 was already L2 and did not overlap with anything, so was untouched.
// Cold compaction took files 1, 2, 3, 5 and compacted them into file 7.
// Cold compaction took files 1, 2, 3, 5 and compacted them into 2 files 7 and 8.
assert_eq!(
files_and_levels,
vec![
(4, CompactionLevel::FileNonOverlapped),
(6, CompactionLevel::Final),
(7, CompactionLevel::Final),
(8, CompactionLevel::Final),
]
);
// ------------------------------------------------
// Verify the parquet file content
let file1 = files.pop().unwrap();
let batches = table.read_parquet_file(file1).await;
// newly created L-2 with largest timestamp
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+-----------------------------+",
"| field_int | tag1 | tag2 | tag3 | time |",
"+-----------+------+------+------+-----------------------------+",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"+-----------+------+------+------+-----------------------------+",
],
&batches
);
// newly created L-2 with smallest timestamp
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+------+--------------------------------+",
@ -1068,15 +1099,14 @@ mod tests {
"| 1500 | WA | | | 1970-01-01T00:00:00.000008Z |",
"| 1601 | | PA | 15 | 1970-01-01T00:00:00.000000009Z |",
"| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |",
"| 270 | UT | | | 1970-01-01T00:00:00.000025Z |",
"| 70 | UT | | | 1970-01-01T00:00:00.000020Z |",
"+-----------+------+------+------+--------------------------------+",
],
&batches
);
let file0 = files.pop().unwrap();
let batches = table.read_parquet_file(file0).await;
// available L2 that does not overlap
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+-----------------------------+",
@ -1088,6 +1118,20 @@ mod tests {
],
&batches
);
// available L1 that did not fit in the memory budget
let file = files.pop().unwrap();
let batches = table.read_parquet_file(file).await;
assert_batches_sorted_eq!(
&[
"+-----------+------+------+-----------------------------+",
"| field_int | tag2 | tag3 | time |",
"+-----------+------+------+-----------------------------+",
"| 1600 | WA | 10 | 1970-01-01T00:00:00.000028Z |",
"| 20 | VT | 20 | 1970-01-01T00:00:00.000026Z |",
"+-----------+------+------+-----------------------------+",
],
&batches
);
}
struct TestDb {

View File

@ -4,10 +4,10 @@ use data_types::{
ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber,
TableSummary, Timestamp, TimestampMinMax, Tombstone,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use parquet_file::chunk::ParquetChunk;
@ -194,7 +194,7 @@ impl QueryChunk for QueryableParquetChunk {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -208,7 +208,7 @@ impl QueryChunk for QueryableParquetChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -230,7 +230,7 @@ impl QueryChunk for QueryableParquetChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "compactor");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
@ -238,7 +238,7 @@ impl QueryChunk for QueryableParquetChunk {
self.data
.read_filter(predicate, selection)
.context(ReadParquetSnafu)
.map_err(|e| Box::new(e) as _)
.map_err(|e| DataFusionError::External(Box::new(e)))
}
/// Returns chunk type

View File

@ -15,7 +15,7 @@ use datafusion::execution::context::TaskContext;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::physical_plan::common::SizedRecordBatchStream;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics};
use datafusion::physical_plan::{collect, ExecutionPlan};
use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan};
use datafusion::prelude::SessionContext;
use datafusion::{
arrow::{
@ -236,12 +236,19 @@ where
}
/// Create a SendableRecordBatchStream a RecordBatch
pub fn stream_from_batch(batch: RecordBatch) -> SendableRecordBatchStream {
stream_from_batches(vec![Arc::new(batch)])
pub fn stream_from_batch(schema: Arc<Schema>, batch: RecordBatch) -> SendableRecordBatchStream {
stream_from_batches(schema, vec![Arc::new(batch)])
}
/// Create a SendableRecordBatchStream from Vec of RecordBatches with the same schema
pub fn stream_from_batches(batches: Vec<Arc<RecordBatch>>) -> SendableRecordBatchStream {
pub fn stream_from_batches(
schema: Arc<Schema>,
batches: Vec<Arc<RecordBatch>>,
) -> SendableRecordBatchStream {
if batches.is_empty() {
return Box::pin(EmptyRecordBatchStream::new(schema));
}
let dummy_metrics = ExecutionPlanMetricsSet::new();
let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0);
let stream = SizedRecordBatchStream::new(batches[0].schema(), batches, mem_metrics);

View File

@ -15,17 +15,25 @@ developers.
Build IOx for release with pprof:
```shell
cd influxdb_iox
cargo build --release --features=pprof
```
## Step 2: Start redpanda and postgres
You can also install the `influxdb_iox` command locally via
Now, start up redpanda and postgres locally in docker containers:
```shell
cd influxdb_iox
cargo install --path influxdb_iox
```
## Step 2: Start kafka and postgres
Now, start up kafka and postgres locally in docker containers:
```shell
# get rskafka from https://github.com/influxdata/rskafka
cd rskafka
# Run redpanda on localhost:9010
docker-compose -f docker-compose-redpanda.yml up &
# Run kafka on localhost:9010
docker-compose -f docker-compose-kafka.yml up &
# now run postgres
docker run -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust postgres &
```
@ -136,8 +144,8 @@ INFLUXDB_IOX_GRPC_BIND_ADDR=localhost:8084 \
INFLUXDB_IOX_WRITE_BUFFER_TYPE=kafka \
INFLUXDB_IOX_WRITE_BUFFER_ADDR=localhost:9010 \
xINFLUXDB_IOX_WRITE_BUFFER_AUTO_CREATE_TOPICS=10 \
INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_START=0 \
INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END=0 \
INFLUXDB_IOX_SHARD_INDEX_RANGE_START=0 \
INFLUXDB_IOX_SHARD_INDEX_RANGE_END=0 \
INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES=5000000000 \
INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES=4000000000 \
INFLUXDB_IOX_CATALOG_DSN=postgres://postgres@localhost:5432/postgres \
@ -151,6 +159,11 @@ LOG_FILTER=info \
# Step 5: Ingest data
You can load data using the influxdb_iox client:
```shell
influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp
```
Now you can post data to `http://localhost:8080` with your favorite load generating tool
My favorite is https://github.com/alamb/low_card
@ -171,3 +184,17 @@ posting fairly large requests (necessitating the
# Step 6: Profile
See [`profiling.md`](./profiling.md).
# Step 7: Clean up local state
If you find yourself needing to clean up postgres / kafka state use these commands:
```shell
docker ps -a -q | xargs docker stop
docker rm rskafka_proxy_1
docker rm rskafka_kafka-0_1
docker rm rskafka_kafka-1_1
docker rm rskafka_kafka-2_1
docker rm rskafka_zookeeper_1
docker volume rm rskafka_kafka_0_data rskafka_kafka_1_data rskafka_kafka_2_data rskafka_zookeeper_data
```

View File

@ -11,7 +11,7 @@ data_types = { path = "../data_types" }
futures = "0.3"
humantime = "2.1.0"
iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.0" }
object_store = { version = "0.5.1" }
observability_deps = { path = "../observability_deps" }
snafu = "0.7"
tokio = { version = "1", features = ["macros", "rt", "sync"] }

View File

@ -82,8 +82,9 @@ message PartitionStatus {
// Max sequence number persisted
optional int64 parquet_max_sequence_number = 1;
// Max sequence number for a tombstone associated
optional int64 tombstone_max_sequence_number = 2;
// Deprecated tombstone support in ingester (#5825).
reserved "tombstone_max_sequence_number";
reserved 2;
}
// Serialization of `predicate::predicate::Predicate` that contains DataFusion `Expr`s

View File

@ -13,11 +13,11 @@ futures = "0.3"
generated_types = { path = "../generated_types" }
influxdb_iox_client = { path = "../influxdb_iox_client" }
iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.0", features = ["aws"] }
object_store = { version = "0.5.1", features = ["aws"] }
observability_deps = { path = "../observability_deps" }
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.82"
serde_json = "1.0.86"
thiserror = "1.0.37"
tokio = { version = "1.21" }
tonic = { version = "0.8" }

View File

@ -9,7 +9,7 @@ bytes = "1.2"
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
url = "2.3.1"
uuid = { version = "1", features = ["v4"] }

View File

@ -2,6 +2,7 @@ use crate::expression::conditional::{conditional_expression, ConditionalExpressi
use crate::identifier::{identifier, Identifier};
use crate::internal::{expect, ParseResult};
use crate::literal::unsigned_integer;
use crate::string::{regex, Regex};
use core::fmt;
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
@ -11,73 +12,82 @@ use nom::multi::separated_list1;
use nom::sequence::{pair, preceded, terminated};
use std::fmt::{Display, Formatter};
/// Represents a fully-qualified measurement name.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct MeasurementNameExpression {
/// Represents a measurement name as either an identifier or a regular expression.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MeasurementName {
/// A measurement name expressed as an [`Identifier`].
Name(Identifier),
/// A measurement name expressed as a [`Regex`].
Regex(Regex),
}
impl Parser for MeasurementName {
/// Parse a measurement name, which may be an identifier or a regular expression.
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(identifier, MeasurementName::Name),
map(regex, MeasurementName::Regex),
))(i)
}
}
impl Display for MeasurementName {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Name(ident) => fmt::Display::fmt(ident, f),
Self::Regex(regex) => fmt::Display::fmt(regex, f),
}
}
}
/// Represents a fully-qualified, 3-part measurement name.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct QualifiedMeasurementName {
pub database: Option<Identifier>,
pub retention_policy: Option<Identifier>,
pub name: Identifier,
pub name: MeasurementName,
}
impl MeasurementNameExpression {
/// Constructs a new `MeasurementNameExpression` with the specified `name`.
pub fn new(name: Identifier) -> Self {
Self {
database: None,
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`.
pub fn new_db(name: Identifier, database: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`.
pub fn new_db_rp(name: Identifier, database: Identifier, retention_policy: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: Some(retention_policy),
name,
}
}
}
impl fmt::Display for MeasurementNameExpression {
impl Display for QualifiedMeasurementName {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self {
database: None,
retention_policy: None,
name,
} => write!(f, "{}", name)?,
} => write!(f, "{}", name),
Self {
database: Some(db),
retention_policy: None,
name,
} => write!(f, "{}..{}", db, name)?,
} => write!(f, "{}..{}", db, name),
Self {
database: None,
retention_policy: Some(rp),
name,
} => write!(f, "{}.{}", rp, name)?,
} => write!(f, "{}.{}", rp, name),
Self {
database: Some(db),
retention_policy: Some(rp),
name,
} => write!(f, "{}.{}.{}", db, rp, name)?,
};
Ok(())
} => write!(f, "{}.{}.{}", db, rp, name),
}
}
}
/// Match a 3-part measurement name expression.
pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementNameExpression> {
/// Match a fully-qualified, 3-part measurement name.
///
/// ```text
/// qualified_measurement_name ::= measurement_name |
/// ( policy_name "." measurement_name ) |
/// ( db_name "." policy_name? "." measurement_name )
///
/// db_name ::= identifier
/// policy_name ::= identifier
/// measurement_name ::= identifier | regex_lit
/// ```
pub fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> {
let (remaining_input, (opt_db_rp, name)) = pair(
opt(alt((
// database "." retention_policy "."
@ -93,7 +103,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName
// retention_policy "."
map(terminated(identifier, tag(".")), |rp| (None, Some(rp))),
))),
identifier,
MeasurementName::parse,
)(i)?;
// Extract possible `database` and / or `retention_policy`
@ -104,7 +114,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName
Ok((
remaining_input,
MeasurementNameExpression {
QualifiedMeasurementName {
database,
retention_policy,
name,
@ -290,35 +300,107 @@ mod tests {
use crate::assert_expect_error;
use nom::character::complete::alphanumeric1;
#[test]
fn test_measurement_name_expression() {
let (_, got) = measurement_name_expression("diskio").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
impl From<&str> for MeasurementName {
/// Convert a `str` to [`MeasurementName::Name`].
fn from(s: &str) -> Self {
Self::Name(Identifier(s.into()))
}
}
impl QualifiedMeasurementName {
/// Constructs a new `MeasurementNameExpression` with the specified `name`.
pub fn new(name: MeasurementName) -> Self {
Self {
database: None,
retention_policy: None,
name: "diskio".into(),
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`.
pub fn new_db(name: MeasurementName, database: Identifier) -> Self {
Self {
database: Some(database),
retention_policy: None,
name,
}
}
/// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`.
pub fn new_db_rp(
name: MeasurementName,
database: Identifier,
retention_policy: Identifier,
) -> Self {
Self {
database: Some(database),
retention_policy: Some(retention_policy),
name,
}
}
}
#[test]
fn test_qualified_measurement_name() {
use MeasurementName::*;
let (_, got) = qualified_measurement_name("diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: Name("diskio".into()),
}
);
let (_, got) = measurement_name_expression("telegraf.autogen.diskio").unwrap();
let (_, got) = qualified_measurement_name("/diskio/").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: Regex("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: Some("autogen".into()),
name: "diskio".into(),
name: Name("diskio".into()),
}
);
let (_, got) = measurement_name_expression("telegraf..diskio").unwrap();
let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap();
assert_eq!(
got,
MeasurementNameExpression {
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: Some("autogen".into()),
name: Regex("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: None,
name: "diskio".into(),
name: Name("diskio".into()),
}
);
let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap();
assert_eq!(
got,
QualifiedMeasurementName {
database: Some("telegraf".into()),
retention_policy: None,
name: Regex("diskio".into()),
}
);
}

View File

@ -73,9 +73,14 @@ mod test {
// Validate via the Display trait, as we don't need to validate the contents of the
// FROM and / or WHERE clauses, given they are tested in their on modules.
// Measurement name expressed as an identifier
let (_, got) = delete_statement("DELETE FROM foo").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM foo");
// Measurement name expressed as a regular expression
let (_, got) = delete_statement("DELETE FROM /foo/").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM /foo/");
let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap();
assert_eq!(format!("{}", got), "DELETE FROM foo WHERE time > 10");

View File

@ -0,0 +1,140 @@
#![allow(dead_code)] // Temporary
use crate::internal::{expect, ParseResult};
use crate::select::{select_statement, SelectStatement};
use nom::branch::alt;
use nom::bytes::complete::tag_no_case;
use nom::character::complete::multispace1;
use nom::combinator::{map, opt, value};
use nom::sequence::{preceded, tuple};
use std::fmt::{Display, Formatter};
/// Represents various options for an `EXPLAIN` statement.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExplainOption {
/// `EXPLAIN VERBOSE statement`
Verbose,
/// `EXPLAIN ANALYZE statement`
Analyze,
/// `EXPLAIN ANALYZE VERBOSE statement`
AnalyzeVerbose,
}
impl Display for ExplainOption {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Verbose => f.write_str("VERBOSE"),
Self::Analyze => f.write_str("ANALYZE"),
Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"),
}
}
}
/// Represents an `EXPLAIN` statement.
///
/// ```text
/// explain ::= "EXPLAIN" explain_options? select_statement
/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? )
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct ExplainStatement {
options: Option<ExplainOption>,
select: Box<SelectStatement>,
}
impl Display for ExplainStatement {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.write_str("EXPLAIN ")?;
if let Some(options) = &self.options {
write!(f, "{} ", options)?;
}
Display::fmt(&self.select, f)
}
}
/// Parse an `EXPLAIN` statement.
pub fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> {
map(
tuple((
tag_no_case("EXPLAIN"),
opt(preceded(
multispace1,
alt((
map(
preceded(
tag_no_case("ANALYZE"),
opt(preceded(multispace1, tag_no_case("VERBOSE"))),
),
|v| match v {
// If the optional combinator is Some, then it matched VERBOSE
Some(_) => ExplainOption::AnalyzeVerbose,
_ => ExplainOption::Analyze,
},
),
value(ExplainOption::Verbose, tag_no_case("VERBOSE")),
)),
)),
multispace1,
expect(
"invalid EXPLAIN statement, expected SELECT statement",
select_statement,
),
)),
|(_, options, _, select)| ExplainStatement {
options,
select: Box::new(select),
},
)(i)
}
#[cfg(test)]
mod test {
    use crate::assert_expect_error;
    use crate::explain::{explain_statement, ExplainOption};
    use assert_matches::assert_matches;

    #[test]
    fn test_explain_statement() {
        // Bare EXPLAIN records no options.
        let (rem, stmt) = explain_statement("EXPLAIN SELECT val from temp").unwrap();
        assert_eq!(rem, ""); // assert that all input was consumed
        assert_matches!(stmt.options, None);
        assert_eq!(stmt.to_string(), "EXPLAIN SELECT val FROM temp");

        // EXPLAIN VERBOSE
        let (rem, stmt) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::Verbose);
        assert_eq!(stmt.to_string(), "EXPLAIN VERBOSE SELECT val FROM temp");

        // EXPLAIN ANALYZE
        let (rem, stmt) = explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::Analyze);
        assert_eq!(stmt.to_string(), "EXPLAIN ANALYZE SELECT val FROM temp");

        // EXPLAIN ANALYZE VERBOSE
        let (rem, stmt) =
            explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap();
        assert_eq!(rem, "");
        assert_matches!(&stmt.options, Some(o) if *o == ExplainOption::AnalyzeVerbose);
        assert_eq!(
            stmt.to_string(),
            "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp"
        );

        // Fallible cases
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE SHOW DATABASES"),
            "invalid EXPLAIN statement, expected SELECT statement"
        );
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp"),
            "invalid EXPLAIN statement, expected SELECT statement"
        );
        // surfaces statement-specific errors
        assert_expect_error!(
            explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"),
            "invalid FROM clause, expected identifier, regular expression or subquery"
        );
    }
}

View File

@ -22,12 +22,10 @@ impl<I: Display> Display for Error<I> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::Syntax { input: _, message } => {
write!(f, "Syntax error: {}", message)?;
write!(f, "Syntax error: {}", message)
}
Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind)?,
Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind),
}
Ok(())
}
}

View File

@ -29,6 +29,7 @@ mod test_util;
mod common;
mod delete;
mod drop;
mod explain;
mod expression;
mod identifier;
mod internal;

View File

@ -1,6 +1,6 @@
use crate::common::{
limit_clause, measurement_name_expression, offset_clause, order_by_clause, where_clause,
MeasurementNameExpression, OneOrMore, OrderByClause, Parser,
limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause,
OneOrMore, OrderByClause, Parser, QualifiedMeasurementName,
};
use crate::expression::arithmetic::Expr::Wildcard;
use crate::expression::arithmetic::{
@ -164,8 +164,7 @@ pub fn select_statement(i: &str) -> ParseResult<&str, SelectStatement> {
/// Represents a single measurement selection found in a `FROM` clause.
#[derive(Clone, Debug, PartialEq)]
pub enum MeasurementSelection {
Name(MeasurementNameExpression),
Regex(Regex),
Name(QualifiedMeasurementName),
Subquery(Box<SelectStatement>),
}
@ -173,7 +172,6 @@ impl Display for MeasurementSelection {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Name(ref name) => fmt::Display::fmt(name, f),
Self::Regex(ref re) => fmt::Display::fmt(re, f),
Self::Subquery(ref subquery) => write!(f, "({})", subquery),
}
}
@ -182,8 +180,7 @@ impl Display for MeasurementSelection {
impl Parser for MeasurementSelection {
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(measurement_name_expression, MeasurementSelection::Name),
map(regex, MeasurementSelection::Regex),
map(qualified_measurement_name, MeasurementSelection::Name),
map(
delimited(
preceded(multispace0, char('(')),
@ -812,7 +809,7 @@ mod test {
assert_matches!(got, MeasurementSelection::Name(_));
let (_, got) = MeasurementSelection::parse("/regex/").unwrap();
assert_matches!(got, MeasurementSelection::Regex(_));
assert_matches!(got, MeasurementSelection::Name(_));
let (_, got) = MeasurementSelection::parse("(SELECT foo FROM bar)").unwrap();
assert_matches!(got, MeasurementSelection::Subquery(_));

View File

@ -2,24 +2,21 @@
//!
//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-schema/#show-measurements
use crate::common::{
limit_clause, offset_clause, qualified_measurement_name, where_clause, QualifiedMeasurementName,
};
use crate::expression::conditional::ConditionalExpression;
use crate::identifier::{identifier, Identifier};
use crate::internal::{expect, ParseResult};
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case};
use nom::character::complete::{char, multispace0, multispace1};
use nom::character::complete::{multispace0, multispace1};
use nom::combinator::{map, opt, value};
use nom::sequence::tuple;
use nom::sequence::{pair, preceded, terminated};
use std::fmt;
use std::fmt::Formatter;
use crate::common::{
limit_clause, measurement_name_expression, offset_clause, where_clause,
MeasurementNameExpression,
};
use crate::expression::conditional::ConditionalExpression;
use crate::identifier::{identifier, Identifier};
use crate::string::{regex, Regex};
/// OnExpression represents an InfluxQL database or retention policy name
/// or a wildcard.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
@ -110,18 +107,16 @@ impl fmt::Display for ShowMeasurementsStatement {
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MeasurementExpression {
Equals(MeasurementNameExpression),
Regex(Regex),
Equals(QualifiedMeasurementName),
Regex(QualifiedMeasurementName),
}
impl fmt::Display for MeasurementExpression {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Equals(ref name) => write!(f, "= {}", name)?,
Self::Regex(ref re) => write!(f, "=~ {}", re)?,
};
Ok(())
Self::Equals(ref name) => write!(f, "= {}", name),
Self::Regex(ref re) => write!(f, "=~ {}", re),
}
}
}
@ -140,23 +135,15 @@ fn with_measurement_clause(i: &str) -> ParseResult<&str, MeasurementExpression>
"expected = or =~",
alt((
map(
tuple((
tag("=~"),
multispace0,
expect("expected regular expression literal", regex),
)),
|(_, _, regex)| MeasurementExpression::Regex(regex),
preceded(pair(tag("=~"), multispace0), qualified_measurement_name),
MeasurementExpression::Regex,
),
map(
tuple((
char('='),
multispace0,
expect(
"expected measurement name or wildcard",
measurement_name_expression,
),
)),
|(_, _, name)| MeasurementExpression::Equals(name),
preceded(
pair(tag("="), multispace0),
expect("expected measurement name", qualified_measurement_name),
),
MeasurementExpression::Equals,
),
)),
),
@ -200,6 +187,7 @@ pub fn show_measurements(i: &str) -> ParseResult<&str, ShowMeasurementsStatement
mod test {
use super::*;
use crate::assert_expect_error;
use crate::common::MeasurementName;
use crate::expression::arithmetic::Expr;
use assert_matches::assert_matches;
@ -232,7 +220,7 @@ mod test {
ShowMeasurementsStatement {
on_expression: Some(OnExpression::Database("foo".into())),
measurement_expression: Some(MeasurementExpression::Equals(
MeasurementNameExpression {
QualifiedMeasurementName {
database: None,
retention_policy: None,
name: "bar".into(),
@ -255,7 +243,9 @@ mod test {
got,
ShowMeasurementsStatement {
on_expression: Some(OnExpression::Database("foo".into())),
measurement_expression: Some(MeasurementExpression::Regex(Regex("bar".into()))),
measurement_expression: Some(MeasurementExpression::Regex(
QualifiedMeasurementName::new(MeasurementName::Regex("bar".into()))
)),
condition: Some(Expr::Literal(true.into()).into()),
limit: None,
offset: None
@ -343,33 +333,50 @@ mod test {
#[test]
fn test_with_measurement_clause() {
use crate::common::MeasurementName::*;
let (_, got) = with_measurement_clause("WITH measurement = foo").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(MeasurementNameExpression {
database: None,
retention_policy: None,
name: "foo".into()
})
MeasurementExpression::Equals(QualifiedMeasurementName::new(Name("foo".into())))
);
let (_, got) = with_measurement_clause("WITH measurement =~ /foo/").unwrap();
assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into())));
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Expressions are still valid when whitespace is omitted
let (_, got) = with_measurement_clause("WITH measurement=foo..bar").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(MeasurementNameExpression {
database: Some("foo".into()),
retention_policy: None,
name: "bar".into()
})
MeasurementExpression::Equals(QualifiedMeasurementName::new_db(
Name("bar".into()),
"foo".into()
))
);
let (_, got) = with_measurement_clause("WITH measurement=~/foo/").unwrap();
assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into())));
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Quirks of InfluxQL per https://github.com/influxdata/influxdb_iox/issues/5662
let (_, got) = with_measurement_clause("WITH measurement =~ foo").unwrap();
assert_eq!(
got,
MeasurementExpression::Regex(QualifiedMeasurementName::new(Name("foo".into())))
);
let (_, got) = with_measurement_clause("WITH measurement = /foo/").unwrap();
assert_eq!(
got,
MeasurementExpression::Equals(QualifiedMeasurementName::new(Regex("foo".into())))
);
// Fallible cases
@ -379,28 +386,16 @@ mod test {
"invalid WITH clause, expected MEASUREMENT"
);
// Must have a regex for equal regex operator
assert_expect_error!(
with_measurement_clause("WITH measurement =~ foo"),
"expected regular expression literal"
);
// Unsupported regex not equal operator
assert_expect_error!(
with_measurement_clause("WITH measurement !~ foo"),
"expected = or =~"
);
// Must have an identifier for equal operator
assert_expect_error!(
with_measurement_clause("WITH measurement = /foo/"),
"expected measurement name or wildcard"
);
// Must have an identifier
assert_expect_error!(
with_measurement_clause("WITH measurement = 1"),
"expected measurement name or wildcard"
"expected measurement name"
);
}
}

View File

@ -1,41 +1,12 @@
use crate::common::{measurement_name_expression, MeasurementNameExpression, OneOrMore, Parser};
use crate::common::{
qualified_measurement_name, MeasurementName, OneOrMore, Parser, QualifiedMeasurementName,
};
use crate::identifier::{identifier, Identifier};
use crate::internal::ParseResult;
use crate::string::{regex, Regex};
use nom::branch::alt;
use nom::bytes::complete::tag_no_case;
use nom::character::complete::multispace1;
use nom::combinator::map;
use nom::sequence::{pair, preceded};
use std::fmt;
use std::fmt::Formatter;
/// Represents a single measurement selection found in a `FROM` measurement clause.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MeasurementSelection<T: Parser> {
Name(T),
Regex(Regex),
}
impl<T: Parser> Parser for MeasurementSelection<T> {
fn parse(i: &str) -> ParseResult<&str, Self> {
alt((
map(T::parse, MeasurementSelection::Name),
map(regex, MeasurementSelection::Regex),
))(i)
}
}
impl<T: fmt::Display + Parser> fmt::Display for MeasurementSelection<T> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::Name(ref name) => fmt::Display::fmt(name, f)?,
Self::Regex(ref re) => fmt::Display::fmt(re, f)?,
};
Ok(())
}
}
/// Represents a `FROM` clause of a `DELETE` or `SHOW` statement.
///
@ -43,7 +14,7 @@ impl<T: fmt::Display + Parser> fmt::Display for MeasurementSelection<T> {
/// for measurements names.
///
/// A `FROM` clause for a number of `SHOW` statements can accept a 3-part measurement name or
pub type FromMeasurementClause<U> = OneOrMore<MeasurementSelection<U>>;
pub type FromMeasurementClause<U> = OneOrMore<U>;
fn from_clause<T: Parser + fmt::Display>(i: &str) -> ParseResult<&str, FromMeasurementClause<T>> {
preceded(
@ -54,9 +25,9 @@ fn from_clause<T: Parser + fmt::Display>(i: &str) -> ParseResult<&str, FromMeasu
)(i)
}
impl Parser for MeasurementNameExpression {
impl Parser for QualifiedMeasurementName {
fn parse(i: &str) -> ParseResult<&str, Self> {
measurement_name_expression(i)
qualified_measurement_name(i)
}
}
@ -68,10 +39,9 @@ impl Parser for MeasurementNameExpression {
/// It is defined by the following EBNF notation:
///
/// ```text
/// from_clause ::= "FROM" measurement_selection ("," measurement_selection)*
/// measurement_selection ::= measurement
/// from_clause ::= "FROM" qualified_measurement_name ("," qualified_measurement_name)*
///
/// measurement ::= measurement_name |
/// qualified_measurement_name ::= measurement_name |
/// ( policy_name "." measurement_name ) |
/// ( db_name "." policy_name? "." measurement_name )
///
@ -92,7 +62,7 @@ impl Parser for MeasurementNameExpression {
/// ```text
/// FROM foo, /bar/, some_database..foo, some_retention_policy.foobar
/// ```
pub type ShowFromClause = FromMeasurementClause<MeasurementNameExpression>;
pub type ShowFromClause = FromMeasurementClause<QualifiedMeasurementName>;
/// Parse a `FROM` clause for various `SHOW` statements.
pub fn show_from_clause(i: &str) -> ParseResult<&str, ShowFromClause> {
@ -106,7 +76,7 @@ impl Parser for Identifier {
}
/// Represents a `FROM` clause for a `DELETE` statement.
pub type DeleteFromClause = FromMeasurementClause<Identifier>;
pub type DeleteFromClause = FromMeasurementClause<MeasurementName>;
/// Parse a `FROM` clause for a `DELETE` statement.
pub fn delete_from_clause(i: &str) -> ParseResult<&str, DeleteFromClause> {
@ -119,49 +89,52 @@ mod test {
#[test]
fn test_show_from_clause() {
use crate::simple_from_clause::MeasurementSelection::*;
use crate::common::MeasurementName::*;
let (_, from) = show_from_clause("FROM c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new("c".into()))])
ShowFromClause::new(vec![QualifiedMeasurementName::new(Name("c".into()))])
);
let (_, from) = show_from_clause("FROM a..c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db(
"c".into(),
ShowFromClause::new(vec![QualifiedMeasurementName::new_db(
Name("c".into()),
"a".into()
))])
)])
);
let (_, from) = show_from_clause("FROM a.b.c").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db_rp(
"c".into(),
ShowFromClause::new(vec![QualifiedMeasurementName::new_db_rp(
Name("c".into()),
"a".into(),
"b".into()
))])
)])
);
let (_, from) = show_from_clause("FROM /reg/").unwrap();
assert_eq!(from, ShowFromClause::new(vec![Regex("reg".into())]));
assert_eq!(
from,
ShowFromClause::new(vec![QualifiedMeasurementName::new(Regex("reg".into()))])
);
let (_, from) = show_from_clause("FROM c, /reg/").unwrap();
assert_eq!(
from,
ShowFromClause::new(vec![
Name(MeasurementNameExpression::new("c".into())),
Regex("reg".into())
QualifiedMeasurementName::new(Name("c".into())),
QualifiedMeasurementName::new(Regex("reg".into()))
])
);
}
#[test]
fn test_delete_from_clause() {
use crate::simple_from_clause::MeasurementSelection::*;
use crate::common::MeasurementName::*;
let (_, from) = delete_from_clause("FROM c").unwrap();
assert_eq!(from, DeleteFromClause::new(vec![Name("c".into())]));

View File

@ -1,5 +1,6 @@
use crate::delete::{delete_statement, DeleteStatement};
use crate::drop::{drop_statement, DropMeasurementStatement};
use crate::explain::{explain_statement, ExplainStatement};
use crate::internal::ParseResult;
use crate::select::{select_statement, SelectStatement};
use crate::show::{show_statement, ShowDatabasesStatement};
@ -19,6 +20,8 @@ pub enum Statement {
Delete(Box<DeleteStatement>),
/// Represents a `DROP MEASUREMENT` statement.
DropMeasurement(Box<DropMeasurementStatement>),
/// Represents an `EXPLAIN` statement.
Explain(Box<ExplainStatement>),
/// Represents a `SELECT` statement.
Select(Box<SelectStatement>),
/// Represents a `SHOW DATABASES` statement.
@ -40,6 +43,7 @@ impl Display for Statement {
match self {
Self::Delete(s) => Display::fmt(s, f),
Self::DropMeasurement(s) => Display::fmt(s, f),
Self::Explain(s) => Display::fmt(s, f),
Self::Select(s) => Display::fmt(s, f),
Self::ShowDatabases(s) => Display::fmt(s, f),
Self::ShowMeasurements(s) => Display::fmt(s, f),
@ -56,6 +60,7 @@ pub fn statement(i: &str) -> ParseResult<&str, Statement> {
alt((
map(delete_statement, |s| Statement::Delete(Box::new(s))),
map(drop_statement, |s| Statement::DropMeasurement(Box::new(s))),
map(explain_statement, |s| Statement::Explain(Box::new(s))),
map(select_statement, |s| Statement::Select(Box::new(s))),
show_statement,
))(i)
@ -77,6 +82,10 @@ mod test {
let (got, _) = statement("DROP MEASUREMENT foo").unwrap();
assert_eq!(got, "");
// explain_statement combinator
let (got, _) = statement("EXPLAIN SELECT * FROM cpu").unwrap();
assert_eq!(got, "");
let (got, _) = statement("SELECT * FROM foo WHERE time > now() - 5m AND host = 'bar' GROUP BY TIME(5m) FILL(previous) ORDER BY time DESC").unwrap();
assert_eq!(got, "");

View File

@ -25,7 +25,7 @@ ioxd_querier = { path = "../ioxd_querier"}
ioxd_router = { path = "../ioxd_router"}
ioxd_test = { path = "../ioxd_test"}
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
object_store_metrics = { path = "../object_store_metrics" }
observability_deps = { path = "../observability_deps" }
panic_logging = { path = "../panic_logging" }
@ -47,6 +47,8 @@ clap = { version = "4", features = ["derive", "env"] }
console-subscriber = { version = "0.1.8", optional = true, features = ["parking_lot"] }
dotenvy = "0.15.5"
futures = "0.3"
futures-util = { version = "0.3" }
flate2 = "1.0"
hashbrown = "0.12"
http = "0.2.8"
humantime = "2.1.0"
@ -55,7 +57,7 @@ libc = { version = "0.2" }
num_cpus = "1.13.0"
once_cell = { version = "1.15.0", features = ["parking_lot"] }
rustyline = { version = "10.0", default-features = false }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
thiserror = "1.0.37"
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }

View File

@ -53,7 +53,7 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
enum QueryEngine {
/// Run queries against the named database on the remote server
/// Run queries against the namespace on the remote server
Remote(String),
/// Run queries against a local `Observer` instance
@ -177,7 +177,7 @@ pub struct Repl {
/// Client for running sql
flight_client: influxdb_iox_client::flight::Client,
/// database name against which SQL commands are run
/// namespace name against which SQL commands are run
query_engine: Option<QueryEngine>,
/// Formatter to use to format query results
@ -239,8 +239,8 @@ impl Repl {
.map_err(|e| println!("{}", e))
.ok();
}
ReplCommand::UseDatabase { db_name } => {
self.use_database(db_name);
ReplCommand::UseNamespace { db_name } => {
self.use_namespace(db_name);
}
ReplCommand::SqlCommand { sql } => {
self.run_sql(sql).await.map_err(|e| println!("{}", e)).ok();
@ -302,18 +302,18 @@ impl Repl {
self.print_results(&[record_batch])
}
// Run a command against the currently selected remote database
// Run a command against the currently selected remote namespace
async fn run_sql(&mut self, sql: String) -> Result<()> {
let start = Instant::now();
let batches = match &mut self.query_engine {
None => {
println!("Error: no database selected.");
println!("Hint: Run USE DATABASE <dbname> to select database");
println!("Error: no namespace selected.");
println!("Hint: Run USE NAMESPACE <dbname> to select namespace");
return Ok(());
}
Some(QueryEngine::Remote(db_name)) => {
info!(%db_name, %sql, "Running sql on remote database");
info!(%db_name, %sql, "Running sql on remote namespace");
scrape_query(&mut self.flight_client, db_name, &sql).await?
}
@ -349,9 +349,9 @@ impl Repl {
}
}
fn use_database(&mut self, db_name: String) {
info!(%db_name, "setting current database");
println!("You are now in remote mode, querying database {}", db_name);
fn use_namespace(&mut self, db_name: String) {
info!(%db_name, "setting current namespace");
println!("You are now in remote mode, querying namespace {}", db_name);
self.set_query_engine(QueryEngine::Remote(db_name));
}

View File

@ -7,7 +7,7 @@ pub enum ReplCommand {
ShowNamespaces,
Observer,
SetFormat { format: String },
UseDatabase { db_name: String },
UseNamespace { db_name: String },
SqlCommand { sql: String },
Exit,
}
@ -64,18 +64,18 @@ impl TryFrom<&str> for ReplCommand {
["observer"] => Ok(Self::Observer),
["exit"] => Ok(Self::Exit),
["quit"] => Ok(Self::Exit),
["use", "database"] => {
Err("name not specified. Usage: USE DATABASE <name>".to_string())
} // USE DATABASE
["use", "database", _name] => {
// USE DATABASE <name>
Ok(Self::UseDatabase {
["use", "namespace"] => {
Err("name not specified. Usage: USE NAMESPACE <name>".to_string())
} // USE NAMESPACE
["use", "namespace", _name] => {
// USE namespace <name>
Ok(Self::UseNamespace {
db_name: raw_commands[2].to_string(),
})
}
["use", _command] => {
// USE <name>
Ok(Self::UseDatabase {
Ok(Self::UseNamespace {
db_name: raw_commands[1].to_string(),
})
}
@ -98,9 +98,9 @@ impl ReplCommand {
Available commands (not case sensitive):
HELP (this one)
SHOW NAMESPACES: List databases available on the server
SHOW NAMESPACES: List namespaces available on the server
USE [DATABASE|NAMESPACE] <name>: Set the current remote database to name
USE NAMESPACE <name>: Set the current remote namespace to name
SET FORMAT <format>: Set the output format to Pretty, csv or json
@ -108,9 +108,9 @@ OBSERVER: Locally query unified queryable views of remote system tables
[EXIT | QUIT]: Quit this session and exit the program
# Examples: use remote database foo
SHOW DATABASES;
USE DATABASE foo;
# Examples: use remote namespace foo
SHOW NAMESPACES;
USE foo;
# Basic IOx SQL Primer
@ -199,35 +199,35 @@ mod tests {
}
#[test]
fn use_database() {
let expected = Ok(ReplCommand::UseDatabase {
fn use_namespace() {
let expected = Ok(ReplCommand::UseNamespace {
db_name: "Foo".to_string(),
});
assert_eq!("use Foo".try_into(), expected);
assert_eq!("use Database Foo;".try_into(), expected);
assert_eq!("use Database Foo ;".try_into(), expected);
assert_eq!(" use Database Foo; ".try_into(), expected);
assert_eq!(" use Database Foo; ".try_into(), expected);
assert_eq!("use Namespace Foo;".try_into(), expected);
assert_eq!("use Namespace Foo ;".try_into(), expected);
assert_eq!(" use Namespace Foo; ".try_into(), expected);
assert_eq!(" use Namespace Foo; ".try_into(), expected);
// ensure that database name is case sensitive
let expected = Ok(ReplCommand::UseDatabase {
// ensure that namespace name is case sensitive
let expected = Ok(ReplCommand::UseNamespace {
db_name: "FOO".to_string(),
});
assert_eq!("use FOO".try_into(), expected);
assert_eq!("use DATABASE FOO;".try_into(), expected);
assert_eq!("USE DATABASE FOO;".try_into(), expected);
assert_eq!("use NAMESPACE FOO;".try_into(), expected);
assert_eq!("USE NAMESPACE FOO;".try_into(), expected);
let expected: Result<ReplCommand, String> =
Err("name not specified. Usage: USE DATABASE <name>".to_string());
assert_eq!("use Database;".try_into(), expected);
assert_eq!("use DATABASE".try_into(), expected);
assert_eq!("use database".try_into(), expected);
Err("name not specified. Usage: USE NAMESPACE <name>".to_string());
assert_eq!("use Namespace;".try_into(), expected);
assert_eq!("use NAMESPACE".try_into(), expected);
assert_eq!("use namespace".try_into(), expected);
let expected = sql_cmd("use database foo bar");
assert_eq!("use database foo bar".try_into(), expected);
let expected = sql_cmd("use namespace foo bar");
assert_eq!("use namespace foo bar".try_into(), expected);
let expected = sql_cmd("use database foo BAR");
assert_eq!("use database foo BAR".try_into(), expected);
let expected = sql_cmd("use namespace foo BAR");
assert_eq!("use namespace foo BAR".try_into(), expected);
}
#[test]

View File

@ -1,6 +1,14 @@
use futures::StreamExt;
use influxdb_iox_client::{connection::Connection, write};
use snafu::{ResultExt, Snafu};
use std::{fs::File, io::Read, path::PathBuf};
use observability_deps::tracing::info;
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{
fs::File,
io::{BufReader, Read},
num::NonZeroUsize,
path::PathBuf,
time::Instant,
};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
@ -11,10 +19,30 @@ pub enum Error {
source: std::io::Error,
},
#[snafu(display("Error reading files: {:#?}", sources))]
ReadingFiles { sources: Vec<Error> },
#[snafu(display("Client error: {source}"))]
ClientError {
source: influxdb_iox_client::error::Error,
},
#[snafu(display("Error converting parquet: {}", source))]
Conversion {
source: parquet_to_line_protocol::Error,
},
#[snafu(display("Line protocol was not valid utf8: {}", source))]
InvalidUtf8 { source: std::string::FromUtf8Error },
#[snafu(display("Error decoding gzip {:?}: {}", file_name, source))]
Gz {
file_name: PathBuf,
source: std::io::Error,
},
#[snafu(display("Max concurrent uploads must be greater than zero"))]
MaxConcurrentUploadsVerfication,
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -22,36 +50,176 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Write data into the specified database
#[derive(Debug, clap::Parser)]
pub struct Config {
/// If specified, restricts the maximum amount of line protocol
/// sent per request to this many bytes. Defaults to 1MB
#[clap(action, long, short = 'b', default_value = "1048576")]
max_request_payload_size_bytes: usize,
/// Uploads up to this many http requests at a time. Defaults to 10
#[clap(action, long, short = 'c', default_value = "10")]
max_concurrent_uploads: usize,
/// The namespace into which to write
#[clap(action)]
namespace: String,
/// File with data to load. Currently supported formats are .lp
/// File(s) with data to load. Currently supported formats are .lp (line protocol),
/// .parquet (IOx created parquet files), and .gz (gzipped line protocol)
#[clap(action)]
file_name: PathBuf,
file_names: Vec<PathBuf>,
}
pub async fn command(connection: Connection, config: Config) -> Result<()> {
let start = Instant::now();
let Config {
namespace,
file_name,
file_names,
max_request_payload_size_bytes,
max_concurrent_uploads,
} = config;
let file_name = &file_name;
let mut file = File::open(file_name).context(ReadingFileSnafu { file_name })?;
let max_concurrent_uploads =
NonZeroUsize::new(max_concurrent_uploads).context(MaxConcurrentUploadsVerficationSnafu)?;
let mut lp_data = String::new();
file.read_to_string(&mut lp_data)
.context(ReadingFileSnafu { file_name })?;
info!(
num_files = file_names.len(),
max_request_payload_size_bytes, max_concurrent_uploads, "Beginning upload"
);
let mut client = write::Client::new(connection);
// first pass is to check that all the files exist and can be
// opened and if not fail fast.
let file_open_errors: Vec<_> = file_names
.iter()
.filter_map(|file_name| {
File::open(file_name)
.context(ReadingFileSnafu { file_name })
.err()
})
.collect();
ensure!(
file_open_errors.is_empty(),
ReadingFilesSnafu {
sources: file_open_errors
}
);
// if everything looked good, go through and read the files out
// them potentially in parallel.
let lp_stream = futures_util::stream::iter(file_names)
.map(|file_name| tokio::task::spawn(slurp_file(file_name)))
// Since the contents of each file are buffered into a string,
// limit the number that are open at once to the maximum
// possible uploads
.buffered(max_concurrent_uploads.into())
// warn and skip any errors
.filter_map(|res| async move {
match res {
Ok(Ok(lp_data)) => Some(lp_data),
Ok(Err(e)) => {
eprintln!("WARNING: ignoring error : {}", e);
None
}
Err(e) => {
eprintln!("WARNING: ignoring task fail: {}", e);
None
}
}
});
let mut client = write::Client::new(connection)
.with_max_concurrent_uploads(max_concurrent_uploads)
.with_max_request_payload_size_bytes(Some(max_request_payload_size_bytes));
let total_bytes = client
.write_lp(namespace, lp_data)
.write_lp_stream(namespace, lp_stream)
.await
.context(ClientSnafu)?;
println!("{} Bytes OK", total_bytes);
let elapsed = Instant::now() - start;
let mb = (total_bytes as f64) / (1024.0 * 1024.0);
let mb_per_sec = (mb / (elapsed.as_millis() as f64)) * (1000.0);
println!("{total_bytes} Bytes OK in {elapsed:?}. {mb_per_sec:.2} MB/sec");
Ok(())
}
/// Reads the contents of `file_name` into a string
///
/// .parquet files --> IOx created parquet files (converted to line protocol)
/// .gz --> treated as gzipped line protocol
/// .lp (or anything else) --> treated as raw line protocol
///
async fn slurp_file(file_name: PathBuf) -> Result<String> {
    // Borrow so the path can be reused in multiple snafu contexts below.
    let file_name = &file_name;
    // Dispatch on the lowercased file extension; files with no extension
    // fall through to the plain line protocol branch.
    let extension = file_name
        .extension()
        .map(|extension| extension.to_ascii_lowercase());
    match extension {
        // Transform parquet to line protocol prior to upload
        // Not the most efficient process, but it is expedient
        Some(extension) if extension.to_string_lossy() == "parquet" => {
            let mut lp_data = vec![];
            // Converts the parquet file contents to line protocol bytes.
            parquet_to_line_protocol::convert_file(file_name, &mut lp_data)
                .await
                .context(ConversionSnafu)?;
            // Line protocol is text; reject non-UTF-8 conversion output.
            let lp_data = String::from_utf8(lp_data).context(InvalidUtf8Snafu)?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol from parquet file"
            );
            Ok(lp_data)
        }
        // decompress as gz
        Some(extension) if extension.to_string_lossy() == "gz" => {
            let mut lp_data = String::new();
            let reader =
                BufReader::new(File::open(&file_name).context(ReadingFileSnafu { file_name })?);
            // Decompress the entire gzipped stream into memory as a string.
            flate2::read::GzDecoder::new(reader)
                .read_to_string(&mut lp_data)
                .context(GzSnafu { file_name })?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol from gzipped line protocol file"
            );
            Ok(lp_data)
        }
        // anything else, treat as line protocol
        Some(_) | None => {
            let lp_data =
                std::fs::read_to_string(file_name).context(ReadingFileSnafu { file_name })?;
            info!(
                ?file_name,
                file_size_bytes = lp_data.len(),
                "Buffered line protocol file"
            );
            Ok(lp_data)
        }
    }
}
#[cfg(test)]
mod test {
    use clap::Parser;
    use influxdb_iox_client::write::DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES;
    use super::*;

    /// Guards against the CLI flag default (`-b` / `--max-request-payload-size-bytes`)
    /// drifting away from the client library's own default value.
    #[test]
    fn command_default_is_same_as_client_default() {
        // Parse with only positional args so the flag takes its default.
        let config = Config::try_parse_from(vec!["my_db", "file1"]).unwrap();
        assert_eq!(
            Some(config.max_request_payload_size_bytes),
            DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES
        );
    }
}

View File

@ -6,7 +6,6 @@ use predicates::prelude::*;
use serde_json::Value;
use std::time::{Duration, Instant};
use tempfile::tempdir;
use test_helpers::make_temp_file;
use test_helpers_end_to_end::{
maybe_skip_integration, AddAddrEnv, BindAddresses, MiniCluster, ServerType, Step, StepTest,
StepTestState,
@ -526,9 +525,6 @@ async fn write_and_query() {
vec![
Step::Custom(Box::new(|state: &mut StepTestState| {
async {
// write line protocol to a temp file
let lp_file = make_temp_file("m,tag=1 v=2 12345");
let lp_file_path = lp_file.path().to_string_lossy().to_string();
let router_addr = state.cluster().router().router_http_base().to_string();
let namespace = state.cluster().namespace();
@ -537,53 +533,48 @@ async fn write_and_query() {
// Validate the output of the schema CLI command
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("-v")
.arg("-h")
.arg(&router_addr)
.arg("write")
.arg(&namespace)
.arg(&lp_file_path)
// raw line protocol ('h2o_temperature' measurement)
.arg("../test_fixtures/lineproto/air_and_water.lp")
// gzipped line protocol ('m0')
.arg("../test_fixtures/lineproto/read_filter.lp.gz")
// iox formatted parquet ('cpu' measurement)
.arg("../test_fixtures/cpu.parquet")
.assert()
.success()
.stdout(predicate::str::contains("17 Bytes OK"));
// this number is the total size of
// uncompressed line protocol stored in all
// three files
.stdout(predicate::str::contains("1137058 Bytes OK"));
}
.boxed()
})),
Step::Custom(Box::new(|state: &mut StepTestState| {
async {
let querier_addr = state.cluster().querier().querier_grpc_base().to_string();
let namespace = state.cluster().namespace();
// data from 'air_and_water.lp'
wait_for_query_result(
state,
"SELECT * from h2o_temperature order by time desc limit 10",
"| 51.3 | coyote_creek | CA | 55.1 | 1970-01-01T00:00:01.568756160Z |"
).await;
let max_wait_time = Duration::from_secs(10);
let expected = "| 1 | 1970-01-01T00:00:00.000012345Z | 2 |";
println!("Waiting for {expected}");
// data from 'read_filter.lp.gz'
wait_for_query_result(
state,
"SELECT * from m0 order by time desc limit 10;",
"| value1 | value9 | value9 | value49 | value0 | 2021-04-26T13:47:39.727574Z | 1 |"
).await;
// Validate the output of running the query CLI command appears after at most max_wait_time
let end = Instant::now() + max_wait_time;
while Instant::now() < end {
let maybe_result = Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("-h")
.arg(&querier_addr)
.arg("query")
.arg(&namespace)
.arg("SELECT * from m")
.assert()
.success()
.try_stdout(predicate::str::contains(expected));
match maybe_result {
Err(e) => {
println!("Got err: {}, retrying", e);
}
Ok(r) => {
println!("Success: {:?}", r);
return;
}
}
// sleep and try again
tokio::time::sleep(Duration::from_millis(500)).await
}
panic!("Did not find expected output in allotted time");
// data from 'cpu.parquet'
wait_for_query_result(
state,
"SELECT * from cpu where cpu = 'cpu2' order by time desc limit 10",
"cpu2 | MacBook-Pro-8.hsd1.ma.comcast.net | 2022-09-30T12:55:00Z"
).await;
}
.boxed()
})),
@ -593,6 +584,53 @@ async fn write_and_query() {
.await
}
/// Runs the specified query in a loop for up to 10 seconds, waiting
/// for the specified output to appear.
///
/// Panics if `expected` does not appear in the CLI query output
/// within the allotted time.
async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, expected: &str) {
    let querier_addr = state.cluster().querier().querier_grpc_base().to_string();
    let namespace = state.cluster().namespace();

    let max_wait_time = Duration::from_secs(10);
    println!("Waiting for {expected}");

    // Validate the output of running the query CLI command appears after at most max_wait_time
    let end = Instant::now() + max_wait_time;
    while Instant::now() < end {
        let assert = Command::cargo_bin("influxdb_iox")
            .unwrap()
            .arg("-h")
            .arg(&querier_addr)
            .arg("query")
            .arg(&namespace)
            .arg(query_sql)
            .assert();

        // NOTE: previously a failed command hit `continue` and skipped the
        // sleep below, retrying in a tight loop. Fall through instead so
        // every retry (command failure OR output mismatch) backs off.
        match assert.try_success() {
            Err(e) => {
                println!("Got err running command: {}, retrying", e);
            }
            Ok(a) => match a.try_stdout(predicate::str::contains(expected)) {
                Err(e) => {
                    println!("No match: {}, retrying", e);
                }
                Ok(r) => {
                    println!("Success: {:?}", r);
                    return;
                }
            },
        }

        // sleep and try again
        tokio::time::sleep(Duration::from_secs(1)).await
    }
    panic!(
        "Did not find expected output {} within {:?}",
        expected, max_wait_time
    );
}
/// Test the schema cli command
#[tokio::test]
async fn namespaces_cli() {

View File

@ -52,7 +52,6 @@ async fn ingester_flight_api() {
partition_id,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None
})
},
);

View File

@ -7,7 +7,8 @@ use futures::FutureExt;
use predicates::prelude::*;
use test_helpers::assert_contains;
use test_helpers_end_to_end::{
maybe_skip_integration, run_query, MiniCluster, Step, StepTest, StepTestState, TestConfig,
maybe_skip_integration, run_query, try_run_query, GrpcRequestBuilder, MiniCluster, Step,
StepTest, StepTestState, TestConfig,
};
#[tokio::test]
@ -454,6 +455,87 @@ async fn issue_4631_b() {
.await
}
#[tokio::test]
async fn oom_protection() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let router_config = TestConfig::new_router(&database_url);
let ingester_config = TestConfig::new_ingester(&router_config);
let querier_config =
TestConfig::new_querier(&ingester_config).with_querier_max_table_query_bytes(1);
let mut cluster = MiniCluster::new()
.with_router(router_config)
.await
.with_ingester(ingester_config)
.await
.with_querier(querier_config)
.await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!("{},tag1=A,tag2=B val=42i 123457", table_name)),
Step::WaitForReadable,
Step::AssertNotPersisted,
// SQL query
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let sql = format!("select * from {}", table_name);
let err = try_run_query(
sql,
state.cluster().namespace(),
state.cluster().querier().querier_grpc_connection(),
)
.await
.unwrap_err();
if let influxdb_iox_client::flight::Error::GrpcError(status) = err {
assert_eq!(
status.code(),
tonic::Code::ResourceExhausted,
"Wrong status code: {}\n\nStatus:\n{}",
status.code(),
status,
);
} else {
panic!("Not a gRPC error: {err}");
}
}
.boxed()
})),
// InfluxRPC/storage query
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut storage_client = state.cluster().querier_storage_client();
let read_filter_request = GrpcRequestBuilder::new()
.source(state.cluster())
.build_read_filter();
let status = storage_client
.read_filter(read_filter_request)
.await
.unwrap_err();
assert_eq!(
status.code(),
tonic::Code::ResourceExhausted,
"Wrong status code: {}\n\nStatus:\n{}",
status.code(),
status,
);
}
.boxed()
})),
],
)
.run()
.await
}
/// This structure holds information for tests that need to force a parquet file to be persisted
struct ForcePersistenceSetup {
// Set up a cluster that will will persist quickly

View File

@ -13,6 +13,7 @@ format = ["arrow", "arrow_util"]
# Workspace dependencies, in alphabetical order
arrow_util = { path = "../arrow_util", optional = true }
client_util = { path = "../client_util" }
influxdb_line_protocol = { path = "../influxdb_line_protocol"}
generated_types = { path = "../generated_types", default-features = false, features = ["data_types_conversions"] }
# Crates.io dependencies, in alphabetical order
@ -23,9 +24,7 @@ futures-util = { version = "0.3", optional = true }
prost = "0.11"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] }
tokio-stream = "0.1.11"
thiserror = "1.0.37"
tonic = { version = "0.8" }
[dev-dependencies] # In alphabetical order
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] }
mockito = "0.31"

View File

@ -1,15 +1,16 @@
/// Re-export of the protobuf write types from the `generated_types`
/// crate (`influxdata.pbdata.v1`) used by this client.
pub mod generated_types {
    pub use generated_types::influxdata::pbdata::v1::*;
}
use std::{fmt::Debug, num::NonZeroUsize, sync::Arc};
use client_util::{connection::HttpConnection, namespace_translation::split_namespace};
use futures_util::{future::BoxFuture, FutureExt, Stream, StreamExt, TryStreamExt};
use crate::{
connection::Connection,
error::{translate_response, Error},
};
use reqwest::Method;
use reqwest::{Body, Method};
/// The default value for the maximum size of each request, in bytes
pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option<usize> = Some(1024 * 1024);
/// An IOx Write API client.
///
@ -37,18 +38,67 @@ use reqwest::Method;
/// ```
#[derive(Debug, Clone)]
pub struct Client {
inner: HttpConnection,
/// The inner client used to actually make requests.
///
/// Uses a trait for test mocking.
///
/// Does not expose the trait in the `Client` type to avoid
/// exposing an internal implementation detail (the trait) in the
/// public interface.
inner: Arc<dyn RequestMaker>,
/// If `Some`, restricts the maximum amount of line protocol
/// sent per request to this many bytes. If `None`, does not restrict
/// the amount sent per request. Defaults to `Some(1MB)`
///
/// Splitting the upload size consumes a non trivial amount of CPU
/// to find line protocol boundaries. This can be disabled by
/// setting `max_request_payload_size_bytes` to `None`.
max_request_payload_size_bytes: Option<usize>,
/// Makes this many concurrent requests at a time. Defaults to 1
max_concurrent_uploads: NonZeroUsize,
}
impl Client {
/// Creates a new client with the provided connection
pub fn new(connection: Connection) -> Self {
Self::new_with_maker(Arc::new(connection.into_http_connection()))
}
/// Creates a new client with the provided request maker
fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
Self {
inner: connection.into_http_connection(),
inner,
max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES,
max_concurrent_uploads: NonZeroUsize::new(1).unwrap(),
}
}
/// Write the [LineProtocol] formatted data in `lp_data` to
/// Override the default of sending 1MB of line protocol per request.
/// If `Some` is specified, restricts the maximum amount of line protocol
/// sent per request to this many bytes. If `None`, does not restrict the amount of
/// line protocol sent per request.
pub fn with_max_request_payload_size_bytes(
self,
max_request_payload_size_bytes: Option<usize>,
) -> Self {
Self {
max_request_payload_size_bytes,
..self
}
}
/// The client makes this many concurrent uploads at a
/// time. Defaults to 1.
pub fn with_max_concurrent_uploads(self, max_concurrent_uploads: NonZeroUsize) -> Self {
Self {
max_concurrent_uploads,
..self
}
}
/// Write the [LineProtocol] formatted string in `lp_data` to
/// namespace `namespace`.
///
/// Returns the number of bytes which were written to the database
@ -59,11 +109,24 @@ impl Client {
namespace: impl AsRef<str> + Send,
lp_data: impl Into<String> + Send,
) -> Result<usize, Error> {
let lp_data = lp_data.into();
let data_len = lp_data.len();
let sources = futures_util::stream::iter([lp_data.into()]);
let write_url = format!("{}api/v2/write", self.inner.uri());
self.write_lp_stream(namespace, sources).await
}
/// Write the stream of [LineProtocol] formatted strings in
/// `sources` to namespace `namespace`. It is assumed that
/// individual lines (points) do not cross these strings
///
/// Returns the number of bytes, in total, which were written to
/// the database
///
/// [LineProtocol]: https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#data-types-and-format
pub async fn write_lp_stream(
&mut self,
namespace: impl AsRef<str> + Send,
sources: impl Stream<Item = String> + Send,
) -> Result<usize, Error> {
let (org_id, bucket_id) = split_namespace(namespace.as_ref()).map_err(|e| {
Error::invalid_argument(
"namespace",
@ -71,47 +134,302 @@ impl Client {
)
})?;
let response = self
.inner
.client()
.request(Method::POST, &write_url)
.query(&[("bucket", bucket_id), ("org", org_id)])
.body(lp_data)
.send()
let max_concurrent_uploads: usize = self.max_concurrent_uploads.into();
let max_request_payload_size_bytes = self.max_request_payload_size_bytes;
// make a stream and process in parallel
let results = sources
// split each input source in parallel, if possible
.flat_map(|source| {
split_lp(
source,
max_request_payload_size_bytes,
max_concurrent_uploads,
)
})
// do the actual write
.map(|source| {
let org_id = org_id.to_string();
let bucket_id = bucket_id.to_string();
let inner = Arc::clone(&self.inner);
tokio::task::spawn(
async move { inner.write_source(org_id, bucket_id, source).await },
)
})
// Do the uploads in parallel
.buffered(max_concurrent_uploads)
.try_collect::<Vec<_>>()
// handle panics in tasks
.await
.map_err(Error::client)?;
.map_err(Error::client)?
// find / return any errors
.into_iter()
.collect::<Result<Vec<_>, Error>>()?;
translate_response(response).await?;
Ok(results.into_iter().sum())
}
}
Ok(data_len)
/// Something that knows how to send http data. Exists so it can be
/// mocked out for testing.
trait RequestMaker: Debug + Send + Sync {
    /// Writes `body` to the specified org and bucket, returning the
    /// number of bytes that were written.
    ///
    /// Returns a boxed future (rather than using `async fn`) to avoid
    /// pulling in `async_trait`.
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>>;
}
impl RequestMaker for HttpConnection {
    /// POSTs `body` to the `api/v2/write` endpoint for the given org
    /// and bucket, returning the number of bytes sent.
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>> {
        let write_url = format!("{}api/v2/write", self.uri());

        async move {
            // Measure the payload while it is still a `String`:
            // `Body::as_bytes()` returns `None` for non-buffered bodies,
            // and the previous `unwrap_or(0)` would then silently report
            // zero bytes written.
            let data_len = body.len();
            let body: Body = body.into();

            let response = self
                .client()
                .request(Method::POST, &write_url)
                .query(&[("bucket", bucket_id), ("org", org_id)])
                .body(body)
                .send()
                .await
                .map_err(Error::client)?;

            translate_response(response).await?;
            Ok(data_len)
        }
        .boxed()
    }
}
/// Splits the input line protocol into one or more chunks of at most
/// `max_chunk_size` bytes, breaking only on line boundaries, in a
/// separate tokio task. If `max_chunk_size` is `None` the input is
/// forwarded unmodified as a single chunk.
fn split_lp(
    input: String,
    max_chunk_size: Option<usize>,
    max_concurrent_uploads: usize,
) -> impl Stream<Item = String> {
    // Channel depth bounds how many chunks can be buffered ahead of the uploads
    let (tx, rx) = tokio::sync::mpsc::channel(max_concurrent_uploads);

    tokio::task::spawn(async move {
        match max_chunk_size {
            None => {
                // ignore errors (means the receiver hung up but there is nothing to communicate)
                tx.send(input).await.ok();
            }
            Some(max_chunk_size) => {
                // use the actual line protocol parser to split on valid boundaries
                let mut acc = LineAccumulator::new(max_chunk_size);
                for l in influxdb_line_protocol::split_lines(&input) {
                    if let Some(chunk) = acc.push(l) {
                        // abort if the receiver has hung up
                        if tx.send(chunk).await.is_err() {
                            return;
                        }
                    }
                }
                // emit whatever partial chunk remains at end of input
                if let Some(chunk) = acc.flush() {
                    tx.send(chunk).await.ok();
                }
            }
        }
    });

    tokio_stream::wrappers::ReceiverStream::new(rx)
}
/// Accumulates individual line protocol lines into newline-joined
/// chunks of (approximately) at most `max_chunk_size` bytes. A single
/// line longer than the limit still becomes its own, oversized chunk.
#[derive(Debug)]
struct LineAccumulator {
    /// Chunk currently being filled
    current_chunk: String,
    /// Upper bound, in bytes, targeted for each emitted chunk
    max_chunk_size: usize,
}

impl LineAccumulator {
    /// Creates an empty accumulator targeting `max_chunk_size` byte chunks
    fn new(max_chunk_size: usize) -> Self {
        Self {
            current_chunk: String::with_capacity(max_chunk_size),
            max_chunk_size,
        }
    }

    /// Appends line `l` to the chunk under construction. If adding the
    /// line (plus its newline separator) would exceed the size limit,
    /// the completed chunk is returned and `l` starts a fresh one.
    fn push(&mut self, l: &str) -> Option<String> {
        let would_overflow = self.current_chunk.len() + l.len() + 1 > self.max_chunk_size;
        let completed = if would_overflow { self.flush() } else { None };

        if !self.current_chunk.is_empty() {
            self.current_chunk.push('\n');
        }
        self.current_chunk.push_str(l);

        completed
    }

    /// Takes the chunk built so far (if non-empty), leaving a fresh,
    /// pre-sized empty chunk in its place.
    fn flush(&mut self) -> Option<String> {
        if self.current_chunk.is_empty() {
            return None;
        }
        let replacement = String::with_capacity(self.max_chunk_size);
        Some(std::mem::replace(&mut self.current_chunk, replacement))
    }
}
#[cfg(test)]
mod tests {
use std::sync::Mutex;
use super::*;
use crate::connection::Builder;
#[tokio::test]
/// Ensure the basic plumbing is hooked up correctly
async fn basic() {
let url = mockito::server_url();
let connection = Builder::new().build(&url).await.unwrap();
async fn test() {
let mock = Arc::new(MockRequestMaker::new());
let namespace = "orgname_bucketname";
let data = "m,t=foo f=4";
let m = mockito::mock("POST", "/api/v2/write?bucket=bucketname&org=orgname")
.with_status(201)
.match_body(data)
.create();
let expected = vec![MockRequest {
org_id: "orgname".into(),
bucket_id: "bucketname".into(),
body: data.into(),
}];
let res = Client::new(connection).write_lp(namespace, data).await;
m.assert();
let num_bytes = res.expect("Error making write request");
let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _)
.write_lp(namespace, data)
.await
.unwrap();
assert_eq!(expected, mock.requests());
assert_eq!(num_bytes, 11);
}
#[tokio::test]
async fn test_max_request_payload_size() {
    // Input larger than the configured payload limit must be split
    // into multiple requests on line protocol boundaries.
    let maker = Arc::new(MockRequestMaker::new());
    let namespace = "orgname_bucketname";
    let data = "m,t=foo f=4\n\
                m,t=bar f=3\n\
                m,t=fooddddddd f=4";

    // expect the data to be broken up into two chunks:
    let expected_requests = vec![
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=foo f=4\nm,t=bar f=3".into(),
        },
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=fooddddddd f=4".into(),
        },
    ];

    // enough to get first two lines, but not last
    let mut client = Client::new_with_maker(Arc::clone(&maker) as _)
        .with_max_request_payload_size_bytes(Some(30));

    let num_bytes = client.write_lp(namespace, data).await.unwrap();

    assert_eq!(expected_requests, maker.requests());
    assert_eq!(num_bytes, 41);
}
#[tokio::test]
async fn test_write_lp_stream() {
    // Each element of the input stream is uploaded as its own request.
    let maker = Arc::new(MockRequestMaker::new());
    let namespace = "orgname_bucketname";
    let sources = futures_util::stream::iter(
        vec!["m,t=foo f=4", "m,t=bar f=3"]
            .into_iter()
            .map(String::from),
    );

    // expect the data to come in two chunks
    let expected_requests = vec![
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=foo f=4".into(),
        },
        MockRequest {
            org_id: "orgname".into(),
            bucket_id: "bucketname".into(),
            body: "m,t=bar f=3".into(),
        },
    ];

    let mut client = Client::new_with_maker(Arc::clone(&maker) as _);
    let num_bytes = client.write_lp_stream(namespace, sources).await.unwrap();

    assert_eq!(expected_requests, maker.requests());
    assert_eq!(num_bytes, 22);
}
/// Record of one write request captured by the mock request maker,
/// used to assert what the client actually sent.
#[derive(Debug, Clone, PartialEq)]
struct MockRequest {
    // Organization the request targeted
    org_id: String,
    // Bucket the request targeted
    bucket_id: String,
    // Line protocol payload of the request
    body: String,
}
/// Test double that records every request made through it instead of
/// performing any network I/O.
#[derive(Debug)]
struct MockRequestMaker {
    // All requests made so far, in order
    requests: Mutex<Vec<MockRequest>>,
}

impl MockRequestMaker {
    fn new() -> Self {
        Self {
            requests: Mutex::new(Vec::new()),
        }
    }

    /// Returns a copy of the requests that were made using this mock
    fn requests(&self) -> Vec<MockRequest> {
        let guard = self.requests.lock().unwrap();
        guard.clone()
    }
}
impl RequestMaker for MockRequestMaker {
    /// Records the request rather than sending it, reporting the body
    /// length as the number of bytes "written".
    fn write_source(
        &self,
        org_id: String,
        bucket_id: String,
        body: String,
    ) -> BoxFuture<'_, Result<usize, Error>> {
        let recorded = MockRequest {
            org_id,
            bucket_id,
            body,
        };
        let sz = recorded.body.len();
        self.requests.lock().unwrap().push(recorded);

        async move { Ok(sz) }.boxed()
    }
}
}

View File

@ -14,7 +14,7 @@ ffi = ["libc"]
bytes = "1.2"
libc = { version = "0.2", optional = true }
nom = { version = "7", default-features = false, features = ["std"] }
smallvec = { version = "1.9.0", features = ["union"] }
smallvec = { version = "1.10.0", features = ["union"] }
snafu = "0.7"
observability_deps = { path = "../observability_deps" }
workspace-hack = { path = "../workspace-hack"}

View File

@ -529,7 +529,7 @@ pub fn parse_lines(input: &str) -> impl Iterator<Item = Result<ParsedLine<'_>>>
/// logic duplication for scanning fields, duplicating it also means
/// we can be more sure of the compatibility of the rust parser and
/// the canonical Go parser.
fn split_lines(input: &str) -> impl Iterator<Item = &str> {
pub fn split_lines(input: &str) -> impl Iterator<Item = &str> {
// NB: This is ported as closely as possibly from the original Go code:
let mut quoted = false;
let mut fields = false;

View File

@ -4,8 +4,8 @@ version = "0.1.0"
edition = "2021"
[dependencies]
sqlparser = "0.24.0"
snafu = "0.7.1"
sqlparser = "0.25.0"
snafu = "0.7.2"
generated_types = { path = "../generated_types" }
workspace-hack = { path = "../workspace-hack"}

View File

@ -24,7 +24,7 @@ iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
mutable_batch = { path = "../mutable_batch"}
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }
@ -45,6 +45,7 @@ write_buffer = { path = "../write_buffer" }
write_summary = { path = "../write_summary" }
tokio-util = { version = "0.7.4" }
trace = { path = "../trace" }
rand = "0.8.5"
[dev-dependencies]
assert_matches = "1.5.0"
@ -52,4 +53,4 @@ bitflags = {version = "1.3.2"}
once_cell = "1"
paste = "1.0.9"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
tokio-stream = {version = "0.1.10", default_features = false }
tokio-stream = {version = "0.1.11", default_features = false }

View File

@ -18,7 +18,7 @@ use crate::{data::partition::PersistingBatch, query::QueryableBatch};
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
pub(crate) enum Error {
#[snafu(display("Error while building logical plan for Ingester's compaction"))]
LogicalPlan {
source: iox_query::frontend::reorg::Error,
@ -86,11 +86,8 @@ pub(crate) async fn compact_persisting_batch(
namespace_id: i64,
partition_info: &PartitionInfo,
batch: Arc<PersistingBatch>,
) -> Result<Option<CompactedStream>> {
// Nothing to compact
if batch.data.data.is_empty() {
return Ok(None);
}
) -> Result<CompactedStream> {
assert!(!batch.data.data.is_empty());
let namespace_name = &partition_info.namespace_name;
let table_name = &partition_info.table_name;
@ -141,11 +138,11 @@ pub(crate) async fn compact_persisting_batch(
sort_key: Some(metadata_sort_key),
};
Ok(Some(CompactedStream {
Ok(CompactedStream {
stream,
iox_metadata,
sort_key_update,
}))
})
}
/// Compact a given Queryable Batch
@ -192,8 +189,8 @@ mod tests {
create_batches_with_influxtype_same_columns_different_type,
create_one_record_batch_with_influxtype_duplicates,
create_one_record_batch_with_influxtype_no_duplicates,
create_one_row_record_batch_with_influxtype, create_tombstone, make_meta,
make_persisting_batch, make_queryable_batch, make_queryable_batch_with_deletes,
create_one_row_record_batch_with_influxtype, make_meta, make_persisting_batch,
make_queryable_batch,
};
// this test was added to guard against https://github.com/influxdata/influxdb_iox/issues/3782
@ -226,7 +223,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -254,7 +250,6 @@ mod tests {
let CompactedStream { stream, .. } =
compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -297,7 +292,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -328,7 +322,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -394,7 +387,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -426,7 +418,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -494,7 +485,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -527,7 +517,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -595,7 +584,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -629,7 +617,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -700,7 +687,6 @@ mod tests {
partition_id,
uuid,
batches,
vec![],
);
// verify PK
@ -739,7 +725,6 @@ mod tests {
sort_key_update,
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch)
.await
.unwrap()
.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
@ -825,54 +810,6 @@ mod tests {
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_one_batch_no_dupilcates_with_deletes() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
// verify PK
let schema = compact_batch.schema();
let pk = schema.primary_key();
let expected_pk = vec!["tag1", "time"];
assert_eq!(expected_pk, pk);
let sort_key = compute_sort_key(
&schema,
compact_batch.data.iter().map(|sb| sb.data.as_ref()),
);
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
// compact
let exc = Executor::new(1);
let stream = compact(&exc, compact_batch, sort_key).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify no empty record batches - bug #3782
assert_eq!(output_batches.len(), 2);
assert_eq!(output_batches[0].num_rows(), 1);
assert_eq!(output_batches[1].num_rows(), 1);
// verify compacted data
// row with "tag1=UT" no longer available
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_one_batch_with_duplicates() {
// create input data
@ -1019,23 +956,12 @@ mod tests {
}
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_deletes(
) {
async fn test_compact_many_batches_different_columns_different_order_with_duplicates() {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
)];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let compact_batch = make_queryable_batch("test_table", 0, 1, batches);
// verify PK
let schema = compact_batch.schema();
@ -1058,7 +984,6 @@ mod tests {
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") are also removed
// CORRECT RESULT
let expected = vec![
"+-----------+------+------+--------------------------------+",
@ -1067,73 +992,15 @@ mod tests {
"| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
"| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 1000 | | CT | 1970-01-01T00:00:00.000001Z |",
"| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
"| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"+-----------+------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
}
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes(
) {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![
create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
),
create_tombstone(
1, 1, 1, 101, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag1!=MT", // delete predicate
),
];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
// verify PK
let schema = compact_batch.schema();
let pk = schema.primary_key();
let expected_pk = vec!["tag1", "tag2", "time"];
assert_eq!(expected_pk, pk);
let sort_key = compute_sort_key(
&schema,
compact_batch.data.iter().map(|sb| sb.data.as_ref()),
);
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
// compact
let exc = Executor::new(1);
let stream = compact(&exc, compact_batch, sort_key).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") and ("tag1!=MT") are also removed
let expected = vec![
"+-----------+------+------+--------------------------------+",
"| field_int | tag1 | tag2 | time |",
"+-----------+------+------+--------------------------------+",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |",
"+-----------+------+------+--------------------------------+",
];
@ -1142,31 +1009,12 @@ mod tests {
// BUG
#[tokio::test]
async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes_2(
) {
async fn test_compact_many_batches_different_columns_different_order_with_duplicates2() {
// create many-batches input data
let batches = create_batches_with_influxtype_different_columns_different_order().await;
let tombstones = vec![
create_tombstone(
1,
1,
1,
100, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag2=CT and field_int=1000", // delete predicate
),
create_tombstone(
1, 1, 1, 101, // delete's seq_number
0, // min time of data to get deleted
200000, // max time of data to get deleted
"tag1=MT", // delete predicate
),
];
// build queryable batch from the input batches
let compact_batch =
make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let compact_batch = make_queryable_batch("test_table", 0, 1, batches);
// verify PK
let schema = compact_batch.schema();
@ -1189,29 +1037,22 @@ mod tests {
// verify compacted data
// data is sorted and all duplicates are removed
// all rows with ("tag2=CT and field_int=1000") and ("tag1=MT") are also removed
// CORRECT RESULT
// let expected = vec![
// "+-----------+------+------+--------------------------------+",
// "| field_int | tag1 | tag2 | time |",
// "+-----------+------+------+--------------------------------+",
// "| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
// "| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
// "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
// "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
// "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
// "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
// "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
// "+-----------+------+------+--------------------------------+",
// ];
// current WRONMG result: "tag1 is null" is also eliminated
let expected = vec![
"+-----------+------+------+--------------------------------+",
"| field_int | tag1 | tag2 | time |",
"+-----------+------+------+--------------------------------+",
"| 5 | | AL | 1970-01-01T00:00:00.000005Z |",
"| 10 | | AL | 1970-01-01T00:00:00.000007Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 1000 | | CT | 1970-01-01T00:00:00.000001Z |",
"| 100 | | MA | 1970-01-01T00:00:00.000000050Z |",
"| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |",
"| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |",
"| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |",
"| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |",
"| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |",
"+-----------+------+------+--------------------------------+",
];

View File

@ -1,15 +1,12 @@
//! Data for the lifecycle of the Ingester
use std::{collections::BTreeMap, pin::Pin, sync::Arc};
use std::{collections::BTreeMap, sync::Arc};
use arrow::{error::ArrowError, record_batch::RecordBatch};
use arrow_util::optimize::{optimize_record_batch, optimize_schema};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{PartitionId, SequenceNumber, ShardId, ShardIndex};
use datafusion::physical_plan::SendableRecordBatchStream;
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId};
use dml::DmlOperation;
use futures::{Stream, StreamExt};
use iox_catalog::interface::{get_table_schema_by_id, Catalog};
use iox_query::exec::Executor;
use iox_time::SystemProvider;
@ -25,16 +22,12 @@ use crate::{
lifecycle::LifecycleHandle,
};
pub mod namespace;
pub(crate) mod namespace;
pub mod partition;
mod query_dedup;
pub mod shard;
pub mod table;
pub(crate) mod shard;
pub(crate) mod table;
use self::{
partition::{resolver::PartitionProvider, PartitionStatus},
shard::ShardData,
};
use self::{partition::resolver::PartitionProvider, shard::ShardData, table::TableName};
#[cfg(test)]
mod triggers;
@ -51,9 +44,6 @@ pub enum Error {
#[snafu(display("Table {} not found in buffer", table_name))]
TableNotFound { table_name: String },
#[snafu(display("Table must be specified in delete"))]
TableNotPresent,
#[snafu(display("Error accessing catalog: {}", source))]
Catalog {
source: iox_catalog::interface::Error,
@ -186,7 +176,7 @@ impl IngesterData {
.get(&shard_id)
.context(ShardNotFoundSnafu { shard_id })?;
shard_data
.buffer_operation(dml_operation, &self.catalog, lifecycle_handle, &self.exec)
.buffer_operation(dml_operation, &self.catalog, lifecycle_handle)
.await
}
@ -220,7 +210,13 @@ impl IngesterData {
#[async_trait]
pub trait Persister: Send + Sync + 'static {
/// Persits the partition ID. Will retry forever until it succeeds.
async fn persist(&self, partition_id: PartitionId);
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
);
/// Updates the shard's `min_unpersisted_sequence_number` in the catalog.
/// This number represents the minimum that might be unpersisted, which is the
@ -235,7 +231,69 @@ pub trait Persister: Send + Sync + 'static {
#[async_trait]
impl Persister for IngesterData {
async fn persist(&self, partition_id: PartitionId) {
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
) {
// lookup the state from the ingester data. If something isn't found,
// it's unexpected. Crash so someone can take a look.
let shard_data = self
.shards
.get(&shard_id)
.unwrap_or_else(|| panic!("shard state for {shard_id} not in ingester data"));
let namespace = shard_data
.namespace_by_id(namespace_id)
.unwrap_or_else(|| panic!("namespace {namespace_id} not in shard {shard_id} state"));
let partition_key;
let batch;
{
let table_data = namespace.table_id(table_id).unwrap_or_else(|| {
panic!("table {table_id} in namespace {namespace_id} not in shard {shard_id} state")
});
let mut guard = table_data.write().await;
let partition = guard.get_partition(partition_id).unwrap_or_else(|| {
panic!(
"partition {partition_id} in table {table_id} in namespace {namespace_id} not in shard {shard_id} state"
)
});
partition_key = partition.partition_key().clone();
batch = partition.snapshot_to_persisting_batch();
};
debug!(%shard_id, %namespace_id, %table_id, %partition_id, %partition_key, "persisting partition");
// Check if there is any data to persist.
let batch = match batch {
Some(v) if !v.data.data.is_empty() => v,
_ => {
warn!(
%shard_id,
%namespace_id,
%table_id,
%partition_id,
%partition_key,
"partition marked for persistence contains no data"
);
return;
}
};
// lookup column IDs from catalog
// TODO: this can be removed once the ingester uses column IDs internally as well
let table_schema = Backoff::new(&self.backoff_config)
.retry_all_errors("get table schema", || async {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.as_mut()).await
})
.await
.expect("retry forever");
// lookup the partition_info from the catalog
let partition_info = Backoff::new(&self.backoff_config)
.retry_all_errors("get partition_info_by_id", || async {
@ -243,217 +301,159 @@ impl Persister for IngesterData {
repos.partitions().partition_info_by_id(partition_id).await
})
.await
.expect("retry forever");
.expect("retry forever").unwrap_or_else(|| panic!("partition {partition_id} in table {table_id} in namespace {namespace_id} in shard {shard_id} has no partition info in catalog"));
// lookup the state from the ingester data. If something isn't found, it's unexpected. Crash
// so someone can take a look.
let partition_info = partition_info
.unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id));
let shard_data = self
.shards
.get(&partition_info.partition.shard_id)
.unwrap_or_else(|| {
panic!(
"shard state for {} not in ingester data",
partition_info.partition.shard_id
)
}); //{
let namespace = shard_data
.namespace(&partition_info.namespace_name)
.unwrap_or_else(|| {
panic!(
"namespace {} not in shard {} state",
partition_info.namespace_name, partition_info.partition.shard_id
)
});
debug!(?partition_id, ?partition_info, "persisting partition");
// do the CPU intensive work of compaction, de-duplication and sorting
let CompactedStream {
stream: record_stream,
iox_metadata,
sort_key_update,
} = compact_persisting_batch(
Arc::new(SystemProvider::new()),
&self.exec,
namespace.namespace_id().get(),
&partition_info,
Arc::clone(&batch),
)
.await
.expect("unable to compact persisting batch");
// lookup column IDs from catalog
// TODO: this can be removed once the ingester uses column IDs internally as well
let table_schema = Backoff::new(&self.backoff_config)
.retry_all_errors("get table schema", || async {
let mut repos = self.catalog.repositories().await;
let table = repos
.tables()
.get_by_namespace_and_name(namespace.namespace_id(), &partition_info.table_name)
.await?
.expect("table not found in catalog");
get_table_schema_by_id(table.id, repos.as_mut()).await
})
// Save the compacted data to a parquet file in object storage.
//
// This call retries until it completes.
let (md, file_size) = self
.store
.upload(record_stream, &iox_metadata)
.await
.expect("retry forever");
.expect("unexpected fatal persist error");
let persisting_batch = namespace
.snapshot_to_persisting(
&partition_info.table_name,
&partition_info.partition.partition_key,
)
.await;
if let Some(persisting_batch) = persisting_batch {
// do the CPU intensive work of compaction, de-duplication and sorting
let compacted_stream = match compact_persisting_batch(
Arc::new(SystemProvider::new()),
&self.exec,
namespace.namespace_id().get(),
&partition_info,
Arc::clone(&persisting_batch),
)
.await
{
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to compact persisting batch with error: {:?}", e);
}
Ok(Some(r)) => r,
Ok(None) => {
warn!("persist called with no data");
return;
}
};
let CompactedStream {
stream: record_stream,
iox_metadata,
sort_key_update,
} = compacted_stream;
// Save the compacted data to a parquet file in object storage.
//
// This call retries until it completes.
let (md, file_size) = self
.store
.upload(record_stream, &iox_metadata)
.await
.expect("unexpected fatal persist error");
// Update the sort key in the catalog if there are
// additional columns BEFORE adding parquet file to the
// catalog. If the order is reversed, the querier or
// compactor may see a parquet file with an inconsistent
// sort key. https://github.com/influxdata/influxdb_iox/issues/5090
if let Some(new_sort_key) = sort_key_update {
let sort_key = new_sort_key.to_columns().collect::<Vec<_>>();
Backoff::new(&self.backoff_config)
.retry_all_errors("update_sort_key", || async {
let mut repos = self.catalog.repositories().await;
let _partition = repos
.partitions()
.update_sort_key(partition_id, &sort_key)
.await?;
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
debug!(
?partition_id,
table = partition_info.table_name,
?new_sort_key,
"adjusted sort key during batch compact & persist"
);
}
// Add the parquet file to the catalog until succeed
let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| {
table_schema.columns.get(name).expect("Unknown column").id
});
// Assert partitions are persisted in-order.
//
// It is an invariant that partitions are persisted in order so that
// both the per-shard, and per-partition watermarks are correctly
// advanced and accurate.
if let Some(last_persist) = partition_info.partition.persisted_sequence_number {
assert!(
parquet_file.max_sequence_number > last_persist,
"out of order partition persistence, persisting {}, previously persisted {}",
parquet_file.max_sequence_number.get(),
last_persist.get(),
);
}
// Add the parquet file to the catalog.
//
// This has the effect of allowing the queriers to "discover" the
// parquet file by polling / querying the catalog.
// Update the sort key in the catalog if there are
// additional columns BEFORE adding parquet file to the
// catalog. If the order is reversed, the querier or
// compactor may see a parquet file with an inconsistent
// sort key. https://github.com/influxdata/influxdb_iox/issues/5090
if let Some(new_sort_key) = sort_key_update {
let sort_key = new_sort_key.to_columns().collect::<Vec<_>>();
Backoff::new(&self.backoff_config)
.retry_all_errors("add parquet file to catalog", || async {
.retry_all_errors("update_sort_key", || async {
let mut repos = self.catalog.repositories().await;
let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?;
debug!(
?partition_id,
table_id=?parquet_file.table_id,
parquet_file_id=?parquet_file.id,
table_name=%iox_metadata.table_name,
"parquet file written to catalog"
);
let _partition = repos
.partitions()
.update_sort_key(partition_id, &sort_key)
.await?;
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
// Update the per-partition persistence watermark, so that new
// ingester instances skip the just-persisted ops during replay.
//
// This could be transactional with the above parquet insert to
// maintain catalog consistency, though in practice it is an
// unnecessary overhead - the system can tolerate replaying the ops
// that lead to this parquet file being generated, and tolerate
// creating a parquet file containing duplicate data (remedied by
// compaction).
//
// This means it is possible to observe a parquet file with a
// max_persisted_sequence_number >
// partition.persisted_sequence_number, either in-between these
// catalog updates, or for however long it takes a crashed ingester
// to restart and replay the ops, and re-persist a file containing
// the same (or subset of) data.
//
// The above is also true of the per-shard persist marker that
// governs the ingester's replay start point, which is
// non-transactionally updated after all partitions have persisted.
Backoff::new(&self.backoff_config)
.retry_all_errors("set partition persist marker", || async {
self.catalog
.repositories()
.await
.partitions()
.update_persisted_sequence_number(
parquet_file.partition_id,
parquet_file.max_sequence_number,
)
.await
})
.await
.expect("retry forever");
// Record metrics
let attributes = Attributes::from([(
"shard_id",
format!("{}", partition_info.partition.shard_id).into(),
)]);
self.persisted_file_size_bytes
.recorder(attributes)
.record(file_size as u64);
// and remove the persisted data from memory
namespace
.mark_persisted(
&partition_info.table_name,
&partition_info.partition.partition_key,
iox_metadata.max_sequence_number,
)
.await;
debug!(
?partition_id,
table_name=%partition_info.table_name,
partition_key=%partition_info.partition.partition_key,
max_sequence_number=%iox_metadata.max_sequence_number.get(),
"marked partition as persisted"
table = partition_info.table_name,
?new_sort_key,
"adjusted sort key during batch compact & persist"
);
}
// Add the parquet file to the catalog until succeed
let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| {
table_schema.columns.get(name).expect("Unknown column").id
});
// Assert partitions are persisted in-order.
//
// It is an invariant that partitions are persisted in order so that
// both the per-shard, and per-partition watermarks are correctly
// advanced and accurate.
if let Some(last_persist) = partition_info.partition.persisted_sequence_number {
assert!(
parquet_file.max_sequence_number > last_persist,
"out of order partition persistence, persisting {}, previously persisted {}",
parquet_file.max_sequence_number.get(),
last_persist.get(),
);
}
// Add the parquet file to the catalog.
//
// This has the effect of allowing the queriers to "discover" the
// parquet file by polling / querying the catalog.
Backoff::new(&self.backoff_config)
.retry_all_errors("add parquet file to catalog", || async {
let mut repos = self.catalog.repositories().await;
let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?;
debug!(
?partition_id,
table_id=?parquet_file.table_id,
parquet_file_id=?parquet_file.id,
table_name=%iox_metadata.table_name,
"parquet file written to catalog"
);
// compiler insisted on getting told the type of the error :shrug:
Ok(()) as Result<(), iox_catalog::interface::Error>
})
.await
.expect("retry forever");
// Update the per-partition persistence watermark, so that new
// ingester instances skip the just-persisted ops during replay.
//
// This could be transactional with the above parquet insert to
// maintain catalog consistency, though in practice it is an
// unnecessary overhead - the system can tolerate replaying the ops
// that lead to this parquet file being generated, and tolerate
// creating a parquet file containing duplicate data (remedied by
// compaction).
//
// This means it is possible to observe a parquet file with a
// max_persisted_sequence_number >
// partition.persisted_sequence_number, either in-between these
// catalog updates, or for however long it takes a crashed ingester
// to restart and replay the ops, and re-persist a file containing
// the same (or subset of) data.
//
// The above is also true of the per-shard persist marker that
// governs the ingester's replay start point, which is
// non-transactionally updated after all partitions have persisted.
Backoff::new(&self.backoff_config)
.retry_all_errors("set partition persist marker", || async {
self.catalog
.repositories()
.await
.partitions()
.update_persisted_sequence_number(
parquet_file.partition_id,
parquet_file.max_sequence_number,
)
.await
})
.await
.expect("retry forever");
// Record metrics
let attributes = Attributes::from([(
"shard_id",
format!("{}", partition_info.partition.shard_id).into(),
)]);
self.persisted_file_size_bytes
.recorder(attributes)
.record(file_size as u64);
// and remove the persisted data from memory
let table_name = TableName::from(&partition_info.table_name);
namespace
.mark_persisted(
&table_name,
&partition_info.partition.partition_key,
iox_metadata.max_sequence_number,
)
.await;
debug!(
?partition_id,
%table_name,
partition_key=%partition_info.partition.partition_key,
max_sequence_number=%iox_metadata.max_sequence_number.get(),
"marked partition as persisted"
);
}
async fn update_min_unpersisted_sequence_number(
@ -475,172 +475,24 @@ impl Persister for IngesterData {
}
}
/// Stream of snapshots.
///
/// Every snapshot is a dedicated [`SendableRecordBatchStream`].
pub(crate) type SnapshotStream =
Pin<Box<dyn Stream<Item = Result<SendableRecordBatchStream, ArrowError>> + Send>>;
/// Response data for a single partition.
pub(crate) struct IngesterQueryPartition {
/// Stream of snapshots.
snapshots: SnapshotStream,
/// Partition ID.
id: PartitionId,
/// Partition persistence status.
status: PartitionStatus,
}
impl std::fmt::Debug for IngesterQueryPartition {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("IngesterQueryPartition")
.field("snapshots", &"<SNAPSHOT STREAM>")
.field("id", &self.id)
.field("status", &self.status)
.finish()
}
}
impl IngesterQueryPartition {
pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self {
Self {
snapshots,
id,
status,
}
}
}
/// Stream of partitions in this response.
pub(crate) type IngesterQueryPartitionStream =
Pin<Box<dyn Stream<Item = Result<IngesterQueryPartition, ArrowError>> + Send>>;
/// Response streams for querier<>ingester requests.
///
/// The data structure is constructed to allow lazy/streaming data generation. For easier
/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method.
pub struct IngesterQueryResponse {
/// Stream of partitions.
partitions: IngesterQueryPartitionStream,
}
impl std::fmt::Debug for IngesterQueryResponse {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("IngesterQueryResponse")
.field("partitions", &"<PARTITION STREAM>")
.finish()
}
}
impl IngesterQueryResponse {
/// Make a response
pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self {
Self { partitions }
}
/// Flattens the data according to the wire protocol.
pub fn flatten(self) -> FlatIngesterQueryResponseStream {
self.partitions
.flat_map(|partition_res| match partition_res {
Ok(partition) => {
let head = futures::stream::once(async move {
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: partition.id,
status: partition.status,
})
});
let tail = partition
.snapshots
.flat_map(|snapshot_res| match snapshot_res {
Ok(snapshot) => {
let schema = Arc::new(optimize_schema(&snapshot.schema()));
let schema_captured = Arc::clone(&schema);
let head = futures::stream::once(async {
Ok(FlatIngesterQueryResponse::StartSnapshot {
schema: schema_captured,
})
});
let tail = snapshot.map(move |batch_res| match batch_res {
Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch {
batch: optimize_record_batch(&batch, Arc::clone(&schema))?,
}),
Err(e) => Err(e),
});
head.chain(tail).boxed()
}
Err(e) => futures::stream::once(async { Err(e) }).boxed(),
});
head.chain(tail).boxed()
}
Err(e) => futures::stream::once(async { Err(e) }).boxed(),
})
.boxed()
}
}
/// Flattened version of [`IngesterQueryResponse`].
pub(crate) type FlatIngesterQueryResponseStream =
Pin<Box<dyn Stream<Item = Result<FlatIngesterQueryResponse, ArrowError>> + Send>>;
/// Element within the flat wire protocol.
#[derive(Debug, PartialEq)]
pub enum FlatIngesterQueryResponse {
/// Start a new partition.
StartPartition {
/// Partition ID.
partition_id: PartitionId,
/// Partition persistence status.
status: PartitionStatus,
},
/// Start a new snapshot.
///
/// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition)
/// message.
StartSnapshot {
/// Snapshot schema.
schema: Arc<arrow::datatypes::Schema>,
},
/// Add a record batch to the snapshot that was announced by the last
/// [`StartSnapshot`](Self::StartSnapshot) message.
RecordBatch {
/// Record batch.
batch: RecordBatch,
},
}
#[cfg(test)]
mod tests {
use std::{
ops::DerefMut,
sync::Arc,
task::{Context, Poll},
time::Duration,
};
use std::{ops::DerefMut, sync::Arc, time::Duration};
use arrow::datatypes::SchemaRef;
use assert_matches::assert_matches;
use data_types::{
ColumnId, ColumnSet, CompactionLevel, DeletePredicate, NamespaceSchema, NonEmptyString,
ParquetFileParams, Sequence, Timestamp, TimestampRange,
};
use datafusion::physical_plan::RecordBatchStream;
use dml::{DmlDelete, DmlMeta, DmlWrite};
use futures::TryStreamExt;
use iox_catalog::{mem::MemCatalog, validate_or_insert_schema};
use iox_time::Time;
use metric::{MetricObserver, Observation};
use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch};
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
use schema::selection::Selection;
use uuid::Uuid;
use super::*;
@ -804,17 +656,20 @@ mod tests {
// limits)
assert!(!should_pause);
let partition_id = {
let (table_id, partition_id) = {
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let mem_table = n.table_data("mem").unwrap();
assert!(n.table_data("mem").is_some());
let n = sd.namespace(&"foo".into()).unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
assert!(n.table_data(&"mem".into()).is_some());
let mem_table = mem_table.write().await;
let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap();
p.id()
let p = mem_table
.get_partition_by_key(&"1970-01-01".into())
.unwrap();
(mem_table.table_id(), p.partition_id())
};
data.persist(partition_id).await;
data.persist(shard1.id, namespace.id, table_id, partition_id)
.await;
// verify that a file got put into object store
let file_paths: Vec<_> = object_store
@ -945,17 +800,20 @@ mod tests {
assert_progress(&data, shard_index, expected_progress).await;
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let n = sd.namespace(&"foo".into()).unwrap();
let partition_id;
let table_id;
{
let mem_table = n.table_data("mem").unwrap();
assert!(n.table_data("cpu").is_some());
let mem_table = mem_table.write().await;
let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
assert!(n.table_data(&"cpu".into()).is_some());
let mem_table = mem_table.write().await;
table_id = mem_table.table_id();
partition_id = p.id();
let p = mem_table
.get_partition_by_key(&"1970-01-01".into())
.unwrap();
partition_id = p.partition_id();
}
{
// verify the partition doesn't have a sort key before any data has been persisted
@ -969,7 +827,8 @@ mod tests {
assert!(partition_info.partition.sort_key.is_empty());
}
data.persist(partition_id).await;
data.persist(shard1.id, namespace.id, table_id, partition_id)
.await;
// verify that a file got put into object store
let file_paths: Vec<_> = object_store
@ -1061,7 +920,7 @@ mod tests {
.unwrap();
assert_eq!(partition_info.partition.sort_key, vec!["time"]);
let mem_table = n.table_data("mem").unwrap();
let mem_table = n.table_data(&"mem".into()).unwrap();
let mem_table = mem_table.read().await;
// verify that the parquet_max_sequence_number got updated
@ -1177,7 +1036,7 @@ mod tests {
// Get the namespace
let sd = data.shards.get(&shard1.id).unwrap();
let n = sd.namespace("foo").unwrap();
let n = sd.namespace(&"foo".into()).unwrap();
let expected_progress = ShardProgress::new().with_buffered(SequenceNumber::new(1));
assert_progress(&data, shard_index, expected_progress).await;
@ -1336,23 +1195,28 @@ mod tests {
Arc::clone(&metrics),
Arc::new(SystemProvider::new()),
);
let exec = Executor::new(1);
let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog)));
let data = NamespaceData::new(namespace.id, shard.id, partition_provider, &*metrics);
let data = NamespaceData::new(
namespace.id,
"foo".into(),
shard.id,
partition_provider,
&*metrics,
);
// w1 should be ignored because the per-partition replay offset is set
// to 1 already, so it shouldn't be buffered and the buffer should
// remain empty.
let should_pause = data
.buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle(), &exec)
.buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle())
.await
.unwrap();
{
let table_data = data.table_data("mem").unwrap();
let table_data = data.table_data(&"mem".into()).unwrap();
let table = table_data.read().await;
let p = table.partition_data.get(&"1970-01-01".into()).unwrap();
let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap();
assert_eq!(
p.max_persisted_sequence_number(),
Some(SequenceNumber::new(1))
@ -1362,13 +1226,13 @@ mod tests {
assert!(!should_pause);
// w2 should be in the buffer
data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle(), &exec)
data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle())
.await
.unwrap();
let table_data = data.table_data("mem").unwrap();
let table_data = data.table_data(&"mem".into()).unwrap();
let table = table_data.read().await;
let partition = table.partition_data.get(&"1970-01-01".into()).unwrap();
let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap();
assert_eq!(
partition.data.buffer.as_ref().unwrap().min_sequence_number,
SequenceNumber::new(2)
@ -1454,19 +1318,6 @@ mod tests {
.await
.unwrap();
assert_eq!(
data.shard(shard1.id)
.unwrap()
.namespace(&namespace.name)
.unwrap()
.table_data("mem")
.unwrap()
.read()
.await
.tombstone_max_sequence_number(),
None,
);
let predicate = DeletePredicate {
range: TimestampRange::new(1, 2),
exprs: vec![],
@ -1485,19 +1336,6 @@ mod tests {
data.buffer_operation(shard1.id, DmlOperation::Delete(d1), &manager.handle())
.await
.unwrap();
assert_eq!(
data.shard(shard1.id)
.unwrap()
.namespace(&namespace.name)
.unwrap()
.table_data("mem")
.unwrap()
.read()
.await
.tombstone_max_sequence_number(),
Some(SequenceNumber::new(2)),
);
}
/// Verifies that the progress in data is the same as expected_progress
@ -1513,132 +1351,4 @@ mod tests {
assert_eq!(progresses, expected_progresses);
}
#[tokio::test]
async fn test_ingester_query_response_flatten() {
let batch_1_1 = lp_to_batch("table x=1 0");
let batch_1_2 = lp_to_batch("table x=2 1");
let batch_2 = lp_to_batch("table y=1 10");
let batch_3 = lp_to_batch("table z=1 10");
let schema_1 = batch_1_1.schema();
let schema_2 = batch_2.schema();
let schema_3 = batch_3.schema();
let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([
Ok(IngesterQueryPartition::new(
Box::pin(futures::stream::iter([
Ok(Box::pin(TestRecordBatchStream::new(
vec![
Ok(batch_1_1.clone()),
Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
Ok(batch_1_2.clone()),
],
Arc::clone(&schema_1),
)) as _),
Err(ArrowError::InvalidArgumentError("invalid arg".into())),
Ok(Box::pin(TestRecordBatchStream::new(
vec![Ok(batch_2.clone())],
Arc::clone(&schema_2),
)) as _),
Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _),
])),
PartitionId::new(2),
PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: Some(SequenceNumber::new(1)),
},
)),
Err(ArrowError::IoError("some io error".into())),
Ok(IngesterQueryPartition::new(
Box::pin(futures::stream::iter([])),
PartitionId::new(1),
PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
)),
])));
let actual: Vec<_> = response.flatten().collect().await;
let expected = vec![
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: PartitionId::new(2),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: Some(SequenceNumber::new(1)),
},
}),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }),
Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }),
Err(ArrowError::InvalidArgumentError("invalid arg".into())),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }),
Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }),
Err(ArrowError::IoError("some io error".into())),
Ok(FlatIngesterQueryResponse::StartPartition {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
];
assert_eq!(actual.len(), expected.len());
for (actual, expected) in actual.into_iter().zip(expected) {
match (actual, expected) {
(Ok(actual), Ok(expected)) => {
assert_eq!(actual, expected);
}
(Err(_), Err(_)) => {
// cannot compare `ArrowError`, but it's unlikely that someone changed the error
}
(Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"),
(Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"),
}
}
}
fn lp_to_batch(lp: &str) -> RecordBatch {
lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap()
}
pub struct TestRecordBatchStream {
schema: SchemaRef,
batches: Vec<Result<RecordBatch, ArrowError>>,
}
impl TestRecordBatchStream {
pub fn new(batches: Vec<Result<RecordBatch, ArrowError>>, schema: SchemaRef) -> Self {
Self { schema, batches }
}
}
impl RecordBatchStream for TestRecordBatchStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
}
impl futures::Stream for TestRecordBatchStream {
type Item = Result<RecordBatch, ArrowError>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.batches.is_empty() {
Poll::Ready(None)
} else {
Poll::Ready(Some(self.batches.remove(0)))
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.batches.len(), Some(self.batches.len()))
}
}
}

View File

@ -1,36 +1,91 @@
//! Namespace level data buffer structures.
use std::{
collections::{btree_map::Entry, BTreeMap},
sync::Arc,
};
use std::{collections::HashMap, sync::Arc};
use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId};
use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId};
use dml::DmlOperation;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use metric::U64Counter;
use observability_deps::tracing::warn;
use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use write_summary::ShardProgress;
#[cfg(test)]
use super::triggers::TestTriggers;
use super::{
partition::{resolver::PartitionProvider, PersistingBatch},
table::TableData,
partition::resolver::PartitionProvider,
table::{TableData, TableName},
};
use crate::lifecycle::LifecycleHandle;
/// A double-referenced map where [`TableData`] can be looked up by name, or ID.
#[derive(Debug, Default)]
struct DoubleRef {
// TODO(4880): this can be removed when IDs are sent over the wire.
by_name: HashMap<TableName, Arc<tokio::sync::RwLock<TableData>>>,
by_id: HashMap<TableId, Arc<tokio::sync::RwLock<TableData>>>,
}
impl DoubleRef {
fn insert(&mut self, t: TableData) -> Arc<tokio::sync::RwLock<TableData>> {
let name = t.table_name().clone();
let id = t.table_id();
let t = Arc::new(tokio::sync::RwLock::new(t));
self.by_name.insert(name, Arc::clone(&t));
self.by_id.insert(id, Arc::clone(&t));
t
}
fn by_name(&self, name: &TableName) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
self.by_name.get(name).map(Arc::clone)
}
fn by_id(&self, id: TableId) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
self.by_id.get(&id).map(Arc::clone)
}
}
/// The string name / identifier of a Namespace.
///
/// A reference-counted, cheap clone-able string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) struct NamespaceName(Arc<str>);
impl<T> From<T> for NamespaceName
where
T: AsRef<str>,
{
fn from(v: T) -> Self {
Self(Arc::from(v.as_ref()))
}
}
impl std::ops::Deref for NamespaceName {
type Target = str;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl std::fmt::Display for NamespaceName {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Data of a Namespace that belongs to a given Shard
#[derive(Debug)]
pub(crate) struct NamespaceData {
namespace_id: NamespaceId,
namespace_name: NamespaceName,
/// The catalog ID of the shard this namespace is being populated from.
shard_id: ShardId,
tables: RwLock<BTreeMap<String, Arc<tokio::sync::RwLock<TableData>>>>,
tables: RwLock<DoubleRef>,
table_count: U64Counter,
/// The resolver of `(shard_id, table_id, partition_key)` to
@ -87,8 +142,9 @@ pub(crate) struct NamespaceData {
impl NamespaceData {
/// Initialize new tables with default partition template of daily
pub fn new(
pub(super) fn new(
namespace_id: NamespaceId,
namespace_name: NamespaceName,
shard_id: ShardId,
partition_provider: Arc<dyn PartitionProvider>,
metrics: &metric::Registry,
@ -102,6 +158,7 @@ impl NamespaceData {
Self {
namespace_id,
namespace_name,
shard_id,
tables: Default::default(),
table_count,
@ -120,7 +177,6 @@ impl NamespaceData {
dml_operation: DmlOperation,
catalog: &Arc<dyn Catalog>,
lifecycle_handle: &dyn LifecycleHandle,
executor: &Executor,
) -> Result<bool, super::Error> {
let sequence_number = dml_operation
.meta()
@ -146,6 +202,7 @@ impl NamespaceData {
.clone();
for (t, b) in write.into_tables() {
let t = TableName::from(t);
let table_data = match self.table_data(&t) {
Some(t) => t,
None => self.insert_table(&t, catalog).await?,
@ -171,19 +228,17 @@ impl NamespaceData {
Ok(pause_writes)
}
DmlOperation::Delete(delete) => {
let table_name = delete.table_name().context(super::TableNotPresentSnafu)?;
let table_data = match self.table_data(table_name) {
Some(t) => t,
None => self.insert_table(table_name, catalog).await?,
};
// Deprecated delete support:
// https://github.com/influxdata/influxdb_iox/issues/5825
warn!(
shard_id=%self.shard_id,
namespace_name=%self.namespace_name,
namespace_id=%self.namespace_id,
table_name=?delete.table_name(),
sequence_number=?delete.meta().sequence(),
"discarding unsupported delete op"
);
let mut table_data = table_data.write().await;
table_data
.buffer_delete(delete.predicate(), sequence_number, &**catalog, executor)
.await?;
// don't pause writes since deletes don't count towards memory limits
Ok(false)
}
}
@ -194,16 +249,16 @@ impl NamespaceData {
#[cfg(test)] // Only used in tests
pub(crate) async fn snapshot(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
) -> Option<(
Vec<Arc<super::partition::SnapshotBatch>>,
Option<Arc<PersistingBatch>>,
Option<Arc<super::partition::PersistingBatch>>,
)> {
if let Some(t) = self.table_data(table_name) {
let mut t = t.write().await;
return t.partition_data.get_mut(partition_key).map(|p| {
return t.get_partition_by_key_mut(partition_key).map(|p| {
p.data
.generate_snapshot()
.expect("snapshot on mutable batch should never fail");
@ -217,17 +272,17 @@ impl NamespaceData {
/// Snapshots the mutable buffer for the partition, which clears it out and then moves all
/// snapshots over to a persisting batch, which is returned. If there is no data to snapshot
/// or persist, None will be returned.
#[cfg(test)] // Only used in tests
pub(crate) async fn snapshot_to_persisting(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
) -> Option<Arc<PersistingBatch>> {
) -> Option<Arc<super::partition::PersistingBatch>> {
if let Some(table_data) = self.table_data(table_name) {
let mut table_data = table_data.write().await;
return table_data
.partition_data
.get_mut(partition_key)
.get_partition_by_key_mut(partition_key)
.and_then(|partition_data| partition_data.snapshot_to_persisting_batch());
}
@ -237,45 +292,55 @@ impl NamespaceData {
/// Gets the buffered table data
pub(crate) fn table_data(
&self,
table_name: &str,
table_name: &TableName,
) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
let t = self.tables.read();
t.get(table_name).cloned()
t.by_name(table_name)
}
/// Return the table data by ID.
pub(crate) fn table_id(
    &self,
    table_id: TableId,
) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
    // The map read guard is a temporary, dropped at the end of this
    // expression; the returned Arc (if any) is an owned clone.
    self.tables.read().by_id(table_id)
}
/// Inserts the table or returns it if it happens to be inserted by some other thread
async fn insert_table(
&self,
table_name: &str,
table_name: &TableName,
catalog: &Arc<dyn Catalog>,
) -> Result<Arc<tokio::sync::RwLock<TableData>>, super::Error> {
let mut repos = catalog.repositories().await;
let info = repos
.tables()
.get_table_persist_info(self.shard_id, self.namespace_id, table_name)
.await
.context(super::CatalogSnafu)?
.context(super::TableNotFoundSnafu { table_name })?;
.ok_or_else(|| super::Error::TableNotFound {
table_name: table_name.to_string(),
})?;
let mut t = self.tables.write();
let data = match t.entry(table_name.to_string()) {
Entry::Vacant(v) => {
let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new(
Ok(match t.by_name(table_name) {
Some(v) => v,
None => {
self.table_count.inc(1);
// Insert the table and then return a ref to it.
t.insert(TableData::new(
info.table_id,
table_name,
table_name.clone(),
self.shard_id,
self.namespace_id,
info.tombstone_max_sequence_number,
Arc::clone(&self.partition_provider),
))));
self.table_count.inc(1);
Arc::clone(v)
))
}
Entry::Occupied(v) => Arc::clone(v.get()),
};
Ok(data)
})
}
/// Walks down the table and partition and clears the persisting batch. The sequence number is
@ -283,13 +348,13 @@ impl NamespaceData {
/// data buffer.
pub(super) async fn mark_persisted(
&self,
table_name: &str,
table_name: &TableName,
partition_key: &PartitionKey,
sequence_number: SequenceNumber,
) {
if let Some(t) = self.table_data(table_name) {
let mut t = t.write().await;
let partition = t.partition_data.get_mut(partition_key);
let partition = t.get_partition_by_key_mut(partition_key);
if let Some(p) = partition {
p.mark_persisted(sequence_number);
@ -299,7 +364,7 @@ impl NamespaceData {
/// Return progress from this Namespace
pub(super) async fn progress(&self) -> ShardProgress {
let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect();
let tables: Vec<_> = self.tables.read().by_id.values().map(Arc::clone).collect();
// Consolidate progress across partitions.
let mut progress = ShardProgress::new()
@ -323,6 +388,12 @@ impl NamespaceData {
/// Return a reference to the counter metric tracking the number of tables
/// buffered in this namespace.
pub(super) fn table_count(&self) -> &U64Counter {
    &self.table_count
}
/// Returns the [`NamespaceName`] for this namespace.
///
/// Compiled only in test builds (`#[cfg(test)]`).
#[cfg(test)]
pub(crate) fn namespace_name(&self) -> &NamespaceName {
    &self.namespace_name
}
}
/// RAAI struct that sets buffering sequence number on creation and clears it on free
@ -357,3 +428,92 @@ impl<'a> Drop for ScopedSequenceNumber<'a> {
*buffering_sequence_number = None;
}
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use data_types::{PartitionId, ShardIndex};
    use metric::{Attributes, Metric};

    use crate::{
        data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
        lifecycle::mock_handle::MockLifecycleHandle,
        test_util::{make_write_op, populate_catalog},
    };

    use super::*;

    // Fixed identifiers shared by the tests below.
    const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
    const TABLE_NAME: &str = "bananas";
    const NAMESPACE_NAME: &str = "platanos";

    /// Assert that a table buffered in a [`NamespaceData`] becomes reachable
    /// both by name and by catalog ID, and that the table-count metric is
    /// incremented exactly once for it.
    #[tokio::test]
    async fn test_namespace_double_ref() {
        let metrics = Arc::new(metric::Registry::default());
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        // Populate the catalog with the shard / namespace / table
        let (shard_id, ns_id, table_id) =
            populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

        // Configure the mock partition provider to return a partition for this
        // table ID.
        let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
            PartitionData::new(
                PartitionId::new(0),
                PartitionKey::from("banana-split"),
                shard_id,
                ns_id,
                table_id,
                TABLE_NAME.into(),
                // Sort key is known (and absent) - no deferred catalog fetch.
                SortKeyState::Provided(None),
                None,
            ),
        ));

        let ns = NamespaceData::new(
            ns_id,
            NAMESPACE_NAME.into(),
            shard_id,
            partition_provider,
            &*metrics,
        );

        // Assert the namespace name was stored
        assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME);

        // Assert the namespace does not contain the test data
        assert!(ns.table_data(&TABLE_NAME.into()).is_none());
        assert!(ns.table_id(table_id).is_none());

        // Write some test data
        ns.buffer_operation(
            DmlOperation::Write(make_write_op(
                &PartitionKey::from("banana-split"),
                SHARD_INDEX,
                NAMESPACE_NAME,
                0,
                r#"bananas,city=Medford day="sun",temp=55 22"#,
            )),
            &catalog,
            &MockLifecycleHandle::default(),
        )
        .await
        .expect("buffer op should succeed");

        // Both forms of referencing the table should succeed
        assert!(ns.table_data(&TABLE_NAME.into()).is_some());
        assert!(ns.table_id(table_id).is_some());

        // And the table counter metric should increase
        let tables = metrics
            .get_instrument::<Metric<U64Counter>>("ingester_tables_total")
            .expect("failed to read metric")
            .get_observer(&Attributes::from([]))
            .expect("failed to get observer")
            .fetch();
        assert_eq!(tables, 1);
    }
}

View File

@ -3,18 +3,21 @@
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use data_types::{
NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, Tombstone,
};
use iox_query::exec::Executor;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use observability_deps::tracing::*;
use schema::{selection::Selection, sort::SortKey};
use snafu::ResultExt;
use uuid::Uuid;
use write_summary::ShardProgress;
use self::buffer::{BufferBatch, DataBuffer};
use crate::{data::query_dedup::query, query::QueryableBatch};
use self::{
buffer::{BufferBatch, DataBuffer},
resolver::DeferredSortKey,
};
use crate::{querier_handler::PartitionStatus, query::QueryableBatch};
use super::table::TableName;
mod buffer;
pub mod resolver;
@ -28,20 +31,6 @@ pub(crate) struct UnpersistedPartitionData {
pub(crate) partition_status: PartitionStatus,
}
/// Status of a partition that has unpersisted data.
///
/// Note that this structure is specific to a partition (which itself is bound to a table and
/// shard)!
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_copy_implementations)]
pub struct PartitionStatus {
/// Max sequence number persisted
pub parquet_max_sequence_number: Option<SequenceNumber>,
/// Max sequence number for a tombstone
pub tombstone_max_sequence_number: Option<SequenceNumber>,
}
/// PersistingBatch contains all needed info and data for creating
/// a parquet file for given set of SnapshotBatches
#[derive(Debug, PartialEq, Clone)]
@ -132,7 +121,28 @@ impl SnapshotBatch {
}
}
/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard
/// The load state of the [`SortKey`] for a given partition.
#[derive(Debug)]
pub(crate) enum SortKeyState {
    /// The [`SortKey`] has not yet been fetched from the catalog, and will be
    /// lazy loaded (or loaded in the background) by a call to
    /// [`DeferredSortKey::get()`].
    Deferred(DeferredSortKey),
    /// The sort key is known and specified, or known to be absent (`None`).
    Provided(Option<SortKey>),
}
impl SortKeyState {
    /// Resolve the sort key, waiting on the lazy / background catalog fetch
    /// when in the [`Self::Deferred`] state.
    async fn get(&self) -> Option<SortKey> {
        match self {
            Self::Provided(key) => key.clone(),
            Self::Deferred(deferred) => deferred.get().await,
        }
    }
}
/// Data of an IOx Partition of a given Table of a Namespace that belongs to a
/// given Shard
#[derive(Debug)]
pub struct PartitionData {
/// The catalog ID of the partition this buffer is for.
@ -140,12 +150,23 @@ pub struct PartitionData {
/// The string partition key for this partition.
partition_key: PartitionKey,
/// The sort key of this partition.
///
/// This can known, in which case this field will contain a
/// [`SortKeyState::Provided`] with the [`SortKey`], or unknown with a value
/// of [`SortKeyState::Deferred`] causing it to be loaded from the catalog
/// (potentially) in the background or at read time.
///
/// Callers should use [`Self::sort_key()`] to be abstracted away from these
/// fetch details.
sort_key: SortKeyState,
/// The shard, namespace & table IDs for this partition.
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
/// The name of the table this partition is part of.
table_name: Arc<str>,
table_name: TableName,
pub(super) data: DataBuffer,
@ -156,18 +177,21 @@ pub struct PartitionData {
impl PartitionData {
/// Initialize a new partition data buffer
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
id: PartitionId,
partition_key: PartitionKey,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
sort_key: SortKeyState,
max_persisted_sequence_number: Option<SequenceNumber>,
) -> Self {
Self {
id,
partition_key,
sort_key,
shard_id,
namespace_id,
table_id,
@ -209,100 +233,36 @@ impl PartitionData {
sequence_number: SequenceNumber,
mb: MutableBatch,
) -> Result<(), super::Error> {
match &mut self.data.buffer {
let (min_sequence_number, max_sequence_number) = match &mut self.data.buffer {
Some(buf) => {
buf.max_sequence_number = sequence_number.max(buf.max_sequence_number);
buf.data.extend_from(&mb).context(super::BufferWriteSnafu)?;
(buf.min_sequence_number, buf.max_sequence_number)
}
None => {
self.data.buffer = Some(BufferBatch {
min_sequence_number: sequence_number,
max_sequence_number: sequence_number,
data: mb,
})
});
(sequence_number, sequence_number)
}
}
};
trace!(
min_sequence_number=?min_sequence_number,
max_sequence_number=?max_sequence_number,
"buffered write"
);
Ok(())
}
/// Buffers a new tombstone:
/// . All the data in the `buffer` and `snapshots` will be replaced with one
/// tombstone-applied snapshot
/// . The tombstone is only added in the `deletes_during_persisting` if the `persisting`
/// exists
pub(super) async fn buffer_tombstone(&mut self, executor: &Executor, tombstone: Tombstone) {
self.data.add_tombstone(tombstone.clone());
// ----------------------------------------------------------
// First apply the tombstone on all in-memory & non-persisting data
// Make a QueryableBatch for all buffer + snapshots + the given tombstone
let max_sequence_number = tombstone.sequence_number;
let query_batch = match self.data.snapshot_to_queryable_batch(
&self.table_name,
self.id,
Some(tombstone.clone()),
) {
Some(query_batch) if !query_batch.is_empty() => query_batch,
_ => {
// No need to proceed further
return;
}
};
let (min_sequence_number, _) = query_batch.min_max_sequence_numbers();
assert!(min_sequence_number <= max_sequence_number);
// Run query on the QueryableBatch to apply the tombstone.
let stream = match query(executor, Arc::new(query_batch)).await {
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to apply tombstones on snapshots: {:?}", e);
}
Ok(stream) => stream,
};
let record_batches = match datafusion::physical_plan::common::collect(stream).await {
Err(e) => {
// this should never error out. if it does, we need to crash hard so
// someone can take a look.
panic!("unable to collect record batches: {:?}", e);
}
Ok(batches) => batches,
};
// Merge all result record batches into one record batch
// and make a snapshot for it
let snapshot = if !record_batches.is_empty() {
let record_batch =
arrow::compute::concat_batches(&record_batches[0].schema(), &record_batches)
.unwrap_or_else(|e| {
panic!("unable to concat record batches: {:?}", e);
});
let snapshot = SnapshotBatch {
min_sequence_number,
max_sequence_number,
data: Arc::new(record_batch),
};
Some(Arc::new(snapshot))
} else {
None
};
// ----------------------------------------------------------
// Add the tombstone-applied data back in as one snapshot
if let Some(snapshot) = snapshot {
self.data.snapshots.push(snapshot);
}
}
/// Return the progress from this Partition
pub(super) fn progress(&self) -> ShardProgress {
self.data.progress()
}
pub(super) fn id(&self) -> PartitionId {
pub(super) fn partition_id(&self) -> PartitionId {
self.id
}
@ -347,6 +307,13 @@ impl PartitionData {
pub fn namespace_id(&self) -> NamespaceId {
self.namespace_id
}
/// Return the [`SortKey`] for this partition.
///
/// NOTE: this MAY involve querying the catalog with unbounded retries when
/// the key is in the [`SortKeyState::Deferred`] state.
pub async fn sort_key(&self) -> Option<SortKey> {
    self.sort_key.get().await
}
}
#[cfg(test)]
@ -355,7 +322,6 @@ mod tests {
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use super::*;
use crate::test_util::create_tombstone;
#[test]
fn snapshot_buffer_different_but_compatible_schemas() {
@ -366,6 +332,7 @@ mod tests {
NamespaceId::new(42),
TableId::new(1),
"foo".into(),
SortKeyState::Provided(None),
None,
);
@ -401,7 +368,7 @@ mod tests {
// Test writes on a single partition
#[tokio::test]
async fn writes_and_deletes() {
async fn writes() {
// Make a partition with empty DataBuffer
let s_id = 1;
let t_id = 1;
@ -413,9 +380,9 @@ mod tests {
NamespaceId::new(42),
TableId::new(t_id),
"restaurant".into(),
SortKeyState::Provided(None),
None,
);
let exec = Executor::new(1);
// ------------------------------------------
// Fill `buffer`
@ -438,42 +405,8 @@ mod tests {
SequenceNumber::new(2)
);
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// ------------------------------------------
// Delete
// --- seq_num: 3
let ts = create_tombstone(
1, // tombstone id
t_id, // table id
s_id, // shard id
3, // delete's seq_number
0, // min time of data to get deleted
20, // max time of data to get deleted
"day=thu", // delete predicate
);
// one row will get deleted, the other is moved to snapshot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// snapshot only has one row since the other one got deleted
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+--------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+--------+-----+------+--------------------------------+",
"| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |",
"+--------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 3);
// ------------------------------------------
// Fill `buffer`
// --- seq_num: 4
@ -493,50 +426,15 @@ mod tests {
// verify data
assert_eq!(
p.data.buffer.as_ref().unwrap().min_sequence_number,
SequenceNumber::new(4)
SequenceNumber::new(1)
);
assert_eq!(
p.data.buffer.as_ref().unwrap().max_sequence_number,
SequenceNumber::new(5)
);
assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.persisting, None);
// ------------------------------------------
// Delete
// --- seq_num: 6
let ts = create_tombstone(
2, // tombstone id
t_id, // table id
s_id, // shard id
6, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"city=Boston", // delete predicate
);
// two rows will get deleted, one from existing snapshot, one from the buffer being moved
// to snpashot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // one snpashot
assert_eq!(p.data.deletes_during_persisting().len(), 0);
assert_eq!(p.data.persisting, None);
// snapshot only has two rows since the other 2 rows with city=Boston have got deleted
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+---------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+---------+-----+------+--------------------------------+",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"+---------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 6);
assert!(p.data.buffer.is_some());
// ------------------------------------------
// Persisting
@ -545,32 +443,12 @@ mod tests {
// verify data
assert!(p.data.buffer.is_none()); // always empty after issuing persit
assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit
assert_eq!(p.data.deletes_during_persisting().len(), 0); // deletes not happen yet
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
// Delete
// --- seq_num: 7
let ts = create_tombstone(
3, // tombstone id
t_id, // table id
s_id, // shard id
7, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"temp=55", // delete predicate
);
// if a query come while persisting, the row with temp=55 will be deleted before
// data is sent back to Querier
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
// no snapshots because the buffer has no data yet and the
// snapshot was empty too
assert_eq!(p.data.snapshots.len(), 0);
assert_eq!(p.data.deletes_during_persisting().len(), 1); // tombstone added since data is
// persisting
assert!(p.data.buffer.is_none());
assert_eq!(p.data.snapshots.len(), 0); // no snpashots becasue buffer has not data yet and the
// snapshot was empty too
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
@ -591,7 +469,6 @@ mod tests {
SequenceNumber::new(8)
); // 1 newly added mutable batch of 3 rows of data
assert_eq!(p.data.snapshots.len(), 0); // still empty
assert_eq!(p.data.deletes_during_persisting().len(), 1);
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// ------------------------------------------
@ -600,7 +477,6 @@ mod tests {
// verify data
assert!(p.data.buffer.is_none()); // empty after snapshot
assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer
assert_eq!(p.data.deletes_during_persisting().len(), 1);
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// snapshot has three rows moved from buffer
let data = (*p.data.snapshots[0].data).clone();
@ -616,41 +492,5 @@ mod tests {
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 8);
// ------------------------------------------
// Delete
// --- seq_num: 9
let ts = create_tombstone(
4, // tombstone id
t_id, // table id
s_id, // shard id
9, // delete's seq_number
10, // min time of data to get deleted
50, // max time of data to get deleted
"temp=60", // delete predicate
);
// the row with temp=60 will be removed from the snapshot
p.buffer_tombstone(&exec, ts).await;
// verify data
assert!(p.data.buffer.is_none()); // always empty after delete
assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied
assert_eq!(p.data.deletes_during_persisting().len(), 2); // one more tombstone added make it 2
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
// snapshot has only 2 rows because the row with tem=60 was removed
let data = (*p.data.snapshots[0].data).clone();
let expected = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |",
"| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |",
"+------------+-----+------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &[data]);
assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8);
assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 9);
exec.join().await;
}
}

View File

@ -2,13 +2,15 @@
use std::sync::Arc;
use data_types::{PartitionId, SequenceNumber, ShardId, TableId, Tombstone};
use data_types::{PartitionId, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use schema::selection::Selection;
use snafu::ResultExt;
use uuid::Uuid;
use write_summary::ShardProgress;
use crate::data::table::TableName;
use super::{PersistingBatch, QueryableBatch, SnapshotBatch};
/// Data of an IOx partition split into batches
@ -38,14 +40,6 @@ pub(crate) struct DataBuffer {
/// Buffer of incoming writes
pub(crate) buffer: Option<BufferBatch>,
/// Buffer of tombstones whose time range may overlap with this partition.
/// All tombstones were already applied to corresponding snapshots. This list
/// only keep the ones that come during persisting. The reason
/// we keep them becasue if a query comes, we need to apply these tombstones
/// on the persiting data before sending it to the Querier
/// When the `persiting` is done and removed, this list will get empty, too
deletes_during_persisting: Vec<Tombstone>,
/// Data in `buffer` will be moved to a `snapshot` when one of these happens:
/// . A background persist is called
/// . A read request from Querier
@ -70,14 +64,6 @@ pub(crate) struct DataBuffer {
}
impl DataBuffer {
/// Add a new tombstones into the [`DataBuffer`].
pub(super) fn add_tombstone(&mut self, tombstone: Tombstone) {
// Only keep this tombstone if some data is being persisted
if self.persisting.is_some() {
self.deletes_during_persisting.push(tombstone);
}
}
/// If a [`BufferBatch`] exists, convert it to a [`SnapshotBatch`] and add
/// it to the list of snapshots.
///
@ -109,9 +95,8 @@ impl DataBuffer {
/// Both buffer and snapshots will be empty after this
pub(super) fn snapshot_to_queryable_batch(
&mut self,
table_name: &Arc<str>,
table_name: &TableName,
partition_id: PartitionId,
tombstone: Option<Tombstone>,
) -> Option<QueryableBatch> {
self.generate_snapshot()
.expect("This mutable batch snapshot error should be impossible.");
@ -119,21 +104,11 @@ impl DataBuffer {
let mut data = vec![];
std::mem::swap(&mut data, &mut self.snapshots);
let mut tombstones = vec![];
if let Some(tombstone) = tombstone {
tombstones.push(tombstone);
}
// only produce batch if there is any data
if data.is_empty() {
None
} else {
Some(QueryableBatch::new(
Arc::clone(table_name),
partition_id,
data,
tombstones,
))
Some(QueryableBatch::new(table_name.clone(), partition_id, data))
}
}
@ -164,15 +139,13 @@ impl DataBuffer {
shard_id: ShardId,
table_id: TableId,
partition_id: PartitionId,
table_name: &Arc<str>,
table_name: &TableName,
) -> Option<Arc<PersistingBatch>> {
if self.persisting.is_some() {
panic!("Unable to snapshot while persisting. This is an unexpected state.")
}
if let Some(queryable_batch) =
self.snapshot_to_queryable_batch(table_name, partition_id, None)
{
if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, partition_id) {
let persisting_batch = Arc::new(PersistingBatch {
shard_id,
table_id,
@ -197,12 +170,7 @@ impl DataBuffer {
};
// persisting data
let mut queryable_batch = (*persisting.data).clone();
// Add new tombstones if any
queryable_batch.add_tombstones(&self.deletes_during_persisting);
Some(queryable_batch)
Some((*persisting.data).clone())
}
/// Return the progress in this DataBuffer
@ -239,12 +207,6 @@ impl DataBuffer {
pub(crate) fn mark_persisted(&mut self) {
self.persisting = None;
self.deletes_during_persisting.clear()
}
#[cfg(test)]
pub(super) fn deletes_during_persisting(&self) -> &[Tombstone] {
self.deletes_during_persisting.as_ref()
}
}

View File

@ -1,13 +1,18 @@
use std::{collections::HashMap, sync::Arc};
use std::{collections::HashMap, sync::Arc, time::Duration};
use async_trait::async_trait;
use backoff::BackoffConfig;
use data_types::{
NamespaceId, Partition, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId,
};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use crate::data::partition::PartitionData;
use crate::data::{
partition::{resolver::DeferredSortKey, PartitionData, SortKeyState},
table::TableName,
};
use super::r#trait::PartitionProvider;
@ -43,6 +48,18 @@ struct Entry {
/// Each cache hit _removes_ the entry from the cache - this eliminates the
/// memory overhead for items that were hit. This is the expected (only valid!)
/// usage pattern.
///
/// # Deferred Sort Key Loading
///
/// This cache does NOT cache the [`SortKey`] for each [`PartitionData`], as the
/// sort key can be large and is likely unique per table, and thus not
/// share-able across instances / prohibitively expensive to cache.
///
/// Instead cached instances are returned with a deferred sort key resolver
/// which attempts to fetch the sort key in the background some time after
/// construction.
///
/// [`SortKey`]: schema::sort::SortKey
#[derive(Debug)]
pub(crate) struct PartitionCache<T> {
// The inner delegate called for a cache miss.
@ -59,13 +76,31 @@ pub(crate) struct PartitionCache<T> {
/// a faster search for cache misses.
#[allow(clippy::type_complexity)]
entries: Mutex<HashMap<PartitionKey, HashMap<ShardId, HashMap<TableId, Entry>>>>,
/// Data needed to construct the [`DeferredSortKey`] for cached entries.
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
/// The maximum amount of time a [`DeferredSortKey`] may wait until
/// pre-fetching the sort key in the background.
max_smear: Duration,
}
impl<T> PartitionCache<T> {
/// Initialise a [`PartitionCache`] containing the specified partitions.
///
/// Any cache miss is passed through to `inner`.
pub(crate) fn new<P>(inner: T, partitions: P) -> Self
///
/// Any cache hit returns a [`PartitionData`] configured with a
/// [`SortKeyState::Deferred`] for deferred key loading in the background.
/// The [`DeferredSortKey`] is initialised with the given `catalog`,
/// `backoff_config`, and `max_smear` maximal load wait duration.
pub(crate) fn new<P>(
inner: T,
partitions: P,
max_smear: Duration,
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
) -> Self
where
P: IntoIterator<Item = Partition>,
{
@ -97,6 +132,9 @@ impl<T> PartitionCache<T> {
Self {
entries: Mutex::new(entries),
inner,
catalog,
backoff_config,
max_smear,
}
}
@ -154,7 +192,7 @@ where
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
// Use the cached PartitionKey instead of the caller's partition_key,
// instead preferring to reuse the already-shared Arc<str> in the cache.
@ -171,6 +209,12 @@ where
namespace_id,
table_id,
table_name,
SortKeyState::Deferred(DeferredSortKey::new(
cached.partition_id,
self.max_smear,
Arc::clone(&__self.catalog),
self.backoff_config.clone(),
)),
cached.max_sequence_number,
);
}
@ -186,6 +230,8 @@ where
#[cfg(test)]
mod tests {
use iox_catalog::mem::MemCatalog;
use crate::data::partition::resolver::MockPartitionProvider;
use super::*;
@ -197,6 +243,22 @@ mod tests {
const TABLE_ID: TableId = TableId::new(3);
const TABLE_NAME: &str = "platanos";
/// Construct a [`PartitionCache`] wrapping `inner`, pre-populated with
/// `partitions`.
fn new_cache<P>(
    inner: MockPartitionProvider,
    partitions: P,
) -> PartitionCache<MockPartitionProvider>
where
    P: IntoIterator<Item = Partition>,
{
    PartitionCache::new(
        inner,
        partitions,
        // A max_smear far larger than any test run, so the deferred sort key
        // pre-fetch never fires during the tests.
        Duration::from_secs(10_000_000),
        Arc::new(MemCatalog::new(Arc::new(metric::Registry::default()))),
        BackoffConfig::default(),
    )
}
#[tokio::test]
async fn test_miss() {
let data = PartitionData::new(
@ -206,11 +268,12 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
);
let inner = MockPartitionProvider::default().with_partition(data);
let cache = PartitionCache::new(inner, []);
let cache = new_cache(inner, []);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -221,7 +284,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -238,11 +301,11 @@ mod tests {
shard_id: SHARD_ID,
table_id: TABLE_ID,
partition_key: stored_partition_key.clone(),
sort_key: Default::default(),
sort_key: vec!["dos".to_string(), "bananas".to_string()],
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let callers_partition_key = PartitionKey::from(PARTITION_KEY);
let got = cache
@ -255,7 +318,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -274,7 +337,7 @@ mod tests {
}
#[tokio::test]
async fn test_miss_partition_jey() {
async fn test_miss_partition_key() {
let other_key = PartitionKey::from("test");
let other_key_id = PartitionId::new(99);
let inner = MockPartitionProvider::default().with_partition(PartitionData::new(
@ -284,6 +347,7 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -296,7 +360,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
other_key.clone(),
@ -307,7 +371,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), other_key_id);
assert_eq!(got.partition_id(), other_key_id);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);
@ -323,6 +387,7 @@ mod tests {
NAMESPACE_ID,
other_table,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -335,7 +400,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -346,7 +411,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), SHARD_ID);
assert_eq!(got.table_id(), other_table);
assert_eq!(got.table_name(), TABLE_NAME);
@ -362,6 +427,7 @@ mod tests {
NAMESPACE_ID,
TABLE_ID,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
));
@ -374,7 +440,7 @@ mod tests {
persisted_sequence_number: Default::default(),
};
let cache = PartitionCache::new(inner, [partition]);
let cache = new_cache(inner, [partition]);
let got = cache
.get_partition(
PARTITION_KEY.into(),
@ -385,7 +451,7 @@ mod tests {
)
.await;
assert_eq!(got.id(), PARTITION_ID);
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.shard_id(), other_shard);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(got.table_name(), TABLE_NAME);

View File

@ -9,7 +9,10 @@ use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use crate::data::partition::PartitionData;
use crate::data::{
partition::{PartitionData, SortKeyState},
table::TableName,
};
use super::r#trait::PartitionProvider;
@ -55,7 +58,7 @@ impl PartitionProvider for CatalogPartitionResolver {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
debug!(
%partition_key,
@ -78,6 +81,7 @@ impl PartitionProvider for CatalogPartitionResolver {
namespace_id,
table_id,
table_name,
SortKeyState::Provided(p.sort_key()),
p.persisted_sequence_number,
)
}
@ -131,7 +135,7 @@ mod tests {
};
let callers_partition_key = PartitionKey::from(PARTITION_KEY);
let table_name = TABLE_NAME.into();
let table_name = TableName::from(TABLE_NAME);
let resolver = CatalogPartitionResolver::new(Arc::clone(&catalog));
let got = resolver
.get_partition(
@ -139,11 +143,12 @@ mod tests {
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
table_name.clone(),
)
.await;
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(*got.table_name(), *table_name);
assert_eq!(got.sort_key().await, None);
assert_eq!(got.max_persisted_sequence_number(), None);
assert!(got.partition_key.ptr_eq(&callers_partition_key));

View File

@ -1,12 +1,12 @@
//! A mock [`PartitionProvider`] to inject [`PartitionData`] for tests.
use std::{collections::HashMap, sync::Arc};
use std::collections::HashMap;
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use parking_lot::Mutex;
use crate::data::partition::PartitionData;
use crate::data::{partition::PartitionData, table::TableName};
use super::r#trait::PartitionProvider;
@ -58,7 +58,7 @@ impl PartitionProvider for MockPartitionProvider {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
let p = self
.partitions

View File

@ -11,6 +11,9 @@ pub use r#trait::*;
mod catalog;
pub use catalog::*;
mod sort_key;
pub(crate) use sort_key::*;
#[cfg(test)]
mod mock;
#[cfg(test)]

View File

@ -0,0 +1,331 @@
//! An optimised resolver of a partition [`SortKey`].
use std::{sync::Arc, time::Duration};
use backoff::{Backoff, BackoffConfig};
use data_types::PartitionId;
use iox_catalog::interface::Catalog;
use parking_lot::Mutex;
use rand::Rng;
use schema::sort::SortKey;
use tokio::task::JoinHandle;
/// The states of a [`DeferredSortKey`] instance.
#[derive(Debug)]
enum State {
    /// The value has not yet been fetched by the background task.
    Unresolved,
    /// The value was fetched by the background task and is ready to be consumed.
    Resolved(Option<SortKey>),
}
/// A resolver of [`SortKey`] from the catalog for a given partition.
///
/// This implementation combines lazy / deferred loading of the [`SortKey`] from
/// the [`Catalog`], and a background timer that pre-fetches the [`SortKey`]
/// after some random duration of time. Combined, these behaviours smear the
/// [`SortKey`] queries across the allowable time range, avoiding a large number
/// of queries from executing when multiple [`SortKey`] are needed in the system
/// at one point in time.
///
/// If the [`DeferredSortKey`] is dropped and the background task is still
/// incomplete (sleeping / actively fetching the [`SortKey`]) it is aborted
/// immediately. The background task exits once it has successfully fetched the
/// [`SortKey`].
///
/// # Stale Cached Values
///
/// This is effectively a cache that is pre-warmed in the background - this
/// necessitates that the caller can tolerate, or determine, stale values.
#[derive(Debug)]
pub(crate) struct DeferredSortKey {
    /// Shared state container, written by the background task (or an eager
    /// `get()` call) and read by [`DeferredSortKey::get()`].
    value: Arc<Mutex<State>>,
    /// The partition whose sort key is resolved.
    partition_id: PartitionId,
    /// Handle to the background pre-fetch task; aborted on drop.
    handle: JoinHandle<()>,
    /// Retry policy applied when querying the catalog.
    backoff_config: BackoffConfig,
    /// The catalog the sort key is read from.
    catalog: Arc<dyn Catalog>,
}
impl DeferredSortKey {
    /// Construct a [`DeferredSortKey`] instance that fetches the [`SortKey`]
    /// for the specified `partition_id`.
    ///
    /// The background task will wait a uniformly random duration of time
    /// between `[0, max_smear)` before attempting to pre-fetch the [`SortKey`]
    /// from `catalog`.
    ///
    /// NOTE(review): `gen_range` panics when given an empty range, so
    /// `max_smear` must be non-zero - confirm all callers uphold this.
    pub(crate) fn new(
        partition_id: PartitionId,
        max_smear: Duration,
        catalog: Arc<dyn Catalog>,
        backoff_config: BackoffConfig,
    ) -> Self {
        // Init the value container the background thread populates.
        let value = Arc::new(Mutex::new(State::Unresolved));

        // Select random duration from a uniform distribution, up to the
        // configured maximum.
        let wait_for = rand::thread_rng().gen_range(Duration::ZERO..max_smear);

        // Spawn the background task, sleeping for the random duration of time
        // before fetching the sort key.
        let handle = tokio::spawn({
            let value = Arc::clone(&value);
            let catalog = Arc::clone(&catalog);
            let backoff_config = backoff_config.clone();
            async move {
                // Sleep for the random duration
                tokio::time::sleep(wait_for).await;

                // Fetch the sort key from the catalog
                let v = fetch(partition_id, &*catalog, &backoff_config).await;

                // And attempt to update the value container, if it hasn't
                // already resolved (an eager get() call may have raced this
                // task and stored a value first - in that case, keep it).
                let mut state = value.lock();
                *state = match *state {
                    State::Unresolved => State::Resolved(v),
                    // Already resolved elsewhere - do not overwrite.
                    State::Resolved(_) => return,
                };
            }
        });

        Self {
            value,
            partition_id,
            handle,
            backoff_config,
            catalog,
        }
    }

    /// Read the [`SortKey`] for the partition.
    ///
    /// If the [`SortKey`] was pre-fetched in the background, it is returned
    /// immediately. If the [`SortKey`] has not yet been resolved, this call
    /// blocks while it is read from the [`Catalog`].
    ///
    /// # Concurrency
    ///
    /// If this method requires resolving the [`SortKey`], N concurrent callers
    /// will cause N queries against the catalog.
    ///
    /// # Await Safety
    ///
    /// Cancelling the future returned by calling [`Self::get()`] before
    /// completion will leave [`Self`] without a background task. The next call
    /// to [`Self::get()`] will incur a catalog query (see concurrency above).
    pub(crate) async fn get(&self) -> Option<SortKey> {
        // Scope the lock guard so it is released before the catalog fetch is
        // awaited below.
        {
            let state = self.value.lock();
            // If there is a resolved value, return it.
            if let State::Resolved(v) = &*state {
                return v.clone();
            }
        }

        // Otherwise resolve the value immediately, aborting the background
        // task.
        self.handle.abort();
        let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await;

        // Cache the freshly-fetched value for subsequent get() calls.
        {
            let mut state = self.value.lock();
            *state = State::Resolved(sort_key.clone());
        }

        sort_key
    }
}
impl Drop for DeferredSortKey {
    fn drop(&mut self) {
        // Attempt to abort the background task, regardless of it having
        // completed or not (aborting an already-finished task is harmless).
        self.handle.abort()
    }
}
/// Fetch the [`SortKey`] from the [`Catalog`] for `partition_id`, retrying
/// endlessly when errors occur.
async fn fetch(
partition_id: PartitionId,
catalog: &dyn Catalog,
backoff_config: &BackoffConfig,
) -> Option<SortKey> {
Backoff::new(backoff_config)
.retry_all_errors("fetch partition sort key", || async {
let s = catalog
.repositories()
.await
.partitions()
.get_by_id(partition_id)
.await?
.expect("resolving sort key for non-existent partition")
.sort_key();
Result::<_, iox_catalog::interface::Error>::Ok(s)
})
.await
.expect("retry forever")
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use data_types::ShardIndex;
    use test_helpers::timeout::FutureTimeout;

    use crate::test_util::populate_catalog;

    use super::*;

    const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
    const TABLE_NAME: &str = "bananas";
    const NAMESPACE_NAME: &str = "platanos";
    const PARTITION_KEY: &str = "platanos";

    /// A duration long enough that the background pre-fetch task (almost
    /// certainly) does not run within the lifetime of a test.
    const LONG_LONG_TIME: Duration = Duration::from_secs(10_000_000);

    /// Populate `catalog` with the shard / namespace / table / partition
    /// fixture, returning the ID of the created partition.
    async fn init_partition(catalog: &Arc<dyn Catalog>) -> PartitionId {
        let (shard_id, _ns_id, table_id) =
            populate_catalog(&**catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

        catalog
            .repositories()
            .await
            .partitions()
            .create_or_get(PARTITION_KEY.into(), shard_id, table_id)
            .await
            .expect("should create")
            .id
    }

    /// Spin until the background pre-fetch task of `d` has completed,
    /// panicking if it takes longer than 5 seconds.
    async fn wait_for_prefetch(d: &DeferredSortKey) {
        async {
            loop {
                if d.handle.is_finished() {
                    return;
                }
                tokio::task::yield_now().await
            }
        }
        .with_timeout_panic(Duration::from_secs(5))
        .await;
    }

    // A test that (most likely) exercises the "read on demand" code path.
    //
    // The background task is configured to run some time between now, and
    // LONG_LONG_TIME (10,000,000 seconds) in the future - it most likely
    // doesn't get to complete before the get() call is issued.
    //
    // If this test flakes, it is POSSIBLE but UNLIKELY that the background task
    // has completed and the get() call reads a pre-fetched value.
    #[tokio::test]
    async fn test_read_demand() {
        let metrics = Arc::new(metric::Registry::default());
        let backoff_config = BackoffConfig::default();
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        let partition_id = init_partition(&catalog).await;

        // Read the just-created sort key (None)
        let fetched = DeferredSortKey::new(
            partition_id,
            LONG_LONG_TIME,
            Arc::clone(&catalog),
            backoff_config.clone(),
        )
        .get()
        .await;
        assert!(fetched.is_none());

        // Set the sort key
        let catalog_state = catalog
            .repositories()
            .await
            .partitions()
            .update_sort_key(partition_id, &["uno", "dos", "bananas"])
            .await
            .expect("should update existing partition key");

        // Read the updated sort key
        let fetched = DeferredSortKey::new(
            partition_id,
            LONG_LONG_TIME,
            Arc::clone(&catalog),
            backoff_config,
        )
        .get()
        .await;
        assert!(fetched.is_some());
        assert_eq!(fetched, catalog_state.sort_key());
    }

    // A test that deterministically exercises the "background pre-fetch" code
    // path by waiting for the (near-immediate) background task to finish
    // before calling get().
    #[tokio::test]
    async fn test_read_pre_fetched() {
        let metrics = Arc::new(metric::Registry::default());
        let backoff_config = BackoffConfig::default();
        let catalog: Arc<dyn Catalog> =
            Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

        let partition_id = init_partition(&catalog).await;

        // Read the just-created sort key (None)
        let fetcher = DeferredSortKey::new(
            partition_id,
            Duration::from_nanos(1),
            Arc::clone(&catalog),
            backoff_config.clone(),
        );
        wait_for_prefetch(&fetcher).await;
        assert!(fetcher.get().await.is_none());

        // Set the sort key
        let catalog_state = catalog
            .repositories()
            .await
            .partitions()
            .update_sort_key(partition_id, &["uno", "dos", "bananas"])
            .await
            .expect("should update existing partition key");

        // Read the updated sort key
        let fetcher = DeferredSortKey::new(
            partition_id,
            Duration::from_nanos(1),
            Arc::clone(&catalog),
            backoff_config,
        );
        wait_for_prefetch(&fetcher).await;

        let fetched = fetcher.get().await;
        assert!(fetched.is_some());
        assert_eq!(fetched, catalog_state.sort_key());
    }
}

View File

@ -3,7 +3,7 @@ use std::{fmt::Debug, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use crate::data::partition::PartitionData;
use crate::data::{partition::PartitionData, table::TableName};
/// An infallible resolver of [`PartitionData`] for the specified shard, table,
/// and partition key, returning an initialised [`PartitionData`] buffer for it.
@ -20,7 +20,7 @@ pub trait PartitionProvider: Send + Sync + Debug {
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData;
}
@ -35,7 +35,7 @@ where
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
) -> PartitionData {
(**self)
.get_partition(partition_key, shard_id, namespace_id, table_id, table_name)
@ -49,7 +49,7 @@ mod tests {
use data_types::PartitionId;
use crate::data::partition::resolver::MockPartitionProvider;
use crate::data::partition::{resolver::MockPartitionProvider, SortKeyState};
use super::*;
@ -59,7 +59,7 @@ mod tests {
let shard_id = ShardId::new(42);
let namespace_id = NamespaceId::new(1234);
let table_id = TableId::new(24);
let table_name = "platanos".into();
let table_name = TableName::from("platanos");
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
@ -67,22 +67,17 @@ mod tests {
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
table_name.clone(),
SortKeyState::Provided(None),
None,
);
let mock = Arc::new(MockPartitionProvider::default().with_partition(data));
let got = mock
.get_partition(
key,
shard_id,
namespace_id,
table_id,
Arc::clone(&table_name),
)
.get_partition(key, shard_id, namespace_id, table_id, table_name.clone())
.await;
assert_eq!(got.id(), partition);
assert_eq!(got.partition_id(), partition);
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(*got.table_name(), *table_name);
}

View File

@ -1,159 +0,0 @@
use std::sync::Arc;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
exec::{Executor, ExecutorType},
QueryChunk, QueryChunkMeta, ScanPlanBuilder,
};
use observability_deps::tracing::debug;
use snafu::{ResultExt, Snafu};
use crate::query::QueryableBatch;
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
#[snafu(display("Error creating plan for querying Ingester data to send to Querier"))]
Frontend {
source: iox_query::frontend::common::Error,
},
#[snafu(display("Error building logical plan for querying Ingester data to send to Querier"))]
LogicalPlan { source: DataFusionError },
#[snafu(display(
"Error building physical plan for querying Ingester data to send to Querier: {}",
source
))]
PhysicalPlan { source: DataFusionError },
#[snafu(display(
"Error executing the query for getting Ingester data to send to Querier: {}",
source
))]
ExecutePlan { source: DataFusionError },
}
/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Query a given Queryable Batch, applying selection and filters as appropriate
/// Return stream of record batches
pub(crate) async fn query(
executor: &Executor,
data: Arc<QueryableBatch>,
) -> Result<SendableRecordBatchStream> {
// Build logical plan for filtering data
// Note that this query will also apply the delete predicates that go with the QueryableBatch
// TODO: Since we have different type of servers (router,
// ingester, compactor, and querier), we may want to add more
// types into the ExecutorType to have better log and resource
// managment
let ctx = executor.new_context(ExecutorType::Query);
// Creates an execution plan for a scan and filter data of a single chunk
let schema = data.schema();
let table_name = data.table_name().to_string();
debug!(%table_name, "Creating single chunk scan plan");
let logical_plan = ScanPlanBuilder::new(schema, ctx.child_ctx("scan_and_filter planning"))
.with_chunks([data as _])
.build()
.context(FrontendSnafu)?
.plan_builder
.build()
.context(LogicalPlanSnafu)?;
debug!(%table_name, plan=%logical_plan.display_indent_schema(),
"created single chunk scan plan");
// Build physical plan
let physical_plan = ctx
.create_physical_plan(&logical_plan)
.await
.context(PhysicalPlanSnafu {})?;
// Execute the plan and return the filtered stream
let output_stream = ctx
.execute_stream(physical_plan)
.await
.context(ExecutePlanSnafu {})?;
Ok(output_stream)
}
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use super::*;
use crate::test_util::{
create_one_record_batch_with_influxtype_no_duplicates, create_tombstone,
make_queryable_batch, make_queryable_batch_with_deletes,
};
#[tokio::test]
async fn test_query() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
// build queryable batch from the input batches
let batch = make_queryable_batch("test_table", 0, 1, batches);
// query without filters
let exc = Executor::new(1);
let stream = query(&exc, batch).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify data: all rows and columns should be returned
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 70 | UT | 1970-01-01T00:00:00.000020Z |",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
exc.join().await;
}
#[tokio::test]
async fn test_query_with_delete() {
test_helpers::maybe_start_logging();
// create input data
let batches = create_one_record_batch_with_influxtype_no_duplicates().await;
let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")];
// build queryable batch from the input batches
let batch = make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones);
let exc = Executor::new(1);
let stream = query(&exc, batch).await.unwrap();
let output_batches = datafusion::physical_plan::common::collect(stream)
.await
.unwrap();
// verify data:
let expected = vec![
"+-----------+------+-----------------------------+",
"| field_int | tag1 | time |",
"+-----------+------+-----------------------------+",
"| 10 | VT | 1970-01-01T00:00:00.000010Z |",
"| 1000 | WA | 1970-01-01T00:00:00.000008Z |",
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &output_batches);
exc.join().await;
}
}

View File

@ -1,22 +1,49 @@
//! Shard level data buffer structures.
use std::{
collections::{btree_map::Entry, BTreeMap},
sync::Arc,
};
use std::{collections::HashMap, sync::Arc};
use data_types::{ShardId, ShardIndex};
use data_types::{NamespaceId, ShardId, ShardIndex};
use dml::DmlOperation;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use metric::U64Counter;
use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt};
use write_summary::ShardProgress;
use super::{namespace::NamespaceData, partition::resolver::PartitionProvider};
use super::{
namespace::{NamespaceData, NamespaceName},
partition::resolver::PartitionProvider,
};
use crate::lifecycle::LifecycleHandle;
/// A double-referenced map where [`NamespaceData`] can be looked up by name, or
/// ID.
#[derive(Debug, Default)]
struct DoubleRef {
    // TODO(4880): this can be removed when IDs are sent over the wire.
    by_name: HashMap<NamespaceName, Arc<NamespaceData>>,
    by_id: HashMap<NamespaceId, Arc<NamespaceData>>,
}

impl DoubleRef {
    /// Add `ns` to the map under both `name` and its namespace ID, returning
    /// the shared handle.
    fn insert(&mut self, name: NamespaceName, ns: NamespaceData) -> Arc<NamespaceData> {
        let ns_id = ns.namespace_id();

        let shared = Arc::new(ns);
        self.by_name.insert(name, Arc::clone(&shared));
        self.by_id.insert(ns_id, Arc::clone(&shared));
        shared
    }

    /// Look up the namespace data for `name`, if any.
    fn by_name(&self, name: &NamespaceName) -> Option<Arc<NamespaceData>> {
        self.by_name.get(name).cloned()
    }

    /// Look up the namespace data for `id`, if any.
    fn by_id(&self, id: NamespaceId) -> Option<Arc<NamespaceData>> {
        self.by_id.get(&id).cloned()
    }
}
/// Data of a Shard
#[derive(Debug)]
pub(crate) struct ShardData {
@ -32,7 +59,7 @@ pub(crate) struct ShardData {
partition_provider: Arc<dyn PartitionProvider>,
// New namespaces can come in at any time so we need to be able to add new ones
namespaces: RwLock<BTreeMap<String, Arc<NamespaceData>>>,
namespaces: RwLock<DoubleRef>,
metrics: Arc<metric::Registry>,
namespace_count: U64Counter,
@ -72,9 +99,8 @@ impl ShardData {
dml_operation: DmlOperation,
catalog: &Arc<dyn Catalog>,
lifecycle_handle: &dyn LifecycleHandle,
executor: &Executor,
) -> Result<bool, super::Error> {
let namespace_data = match self.namespace(dml_operation.namespace()) {
let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) {
Some(d) => d,
None => {
self.insert_namespace(dml_operation.namespace(), &**catalog)
@ -83,14 +109,24 @@ impl ShardData {
};
namespace_data
.buffer_operation(dml_operation, catalog, lifecycle_handle, executor)
.buffer_operation(dml_operation, catalog, lifecycle_handle)
.await
}
/// Gets the namespace data out of the map
pub(crate) fn namespace(&self, namespace: &str) -> Option<Arc<NamespaceData>> {
pub(crate) fn namespace(&self, namespace: &NamespaceName) -> Option<Arc<NamespaceData>> {
let n = self.namespaces.read();
n.get(namespace).cloned()
n.by_name(namespace)
}
/// Gets the namespace data out of the map
pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option<Arc<NamespaceData>> {
// TODO: this should be the default once IDs are pushed over the wire.
//
// At which point the map should be indexed by IDs, instead of namespace
// names.
let n = self.namespaces.read();
n.by_id(namespace_id)
}
/// Retrieves the namespace from the catalog and initializes an empty buffer, or
@ -101,6 +137,8 @@ impl ShardData {
catalog: &dyn Catalog,
) -> Result<Arc<NamespaceData>, super::Error> {
let mut repos = catalog.repositories().await;
let ns_name = NamespaceName::from(namespace);
let namespace = repos
.namespaces()
.get_by_name(namespace)
@ -110,26 +148,35 @@ impl ShardData {
let mut n = self.namespaces.write();
let data = match n.entry(namespace.name) {
Entry::Vacant(v) => {
let v = v.insert(Arc::new(NamespaceData::new(
namespace.id,
self.shard_id,
Arc::clone(&self.partition_provider),
&*self.metrics,
)));
Ok(match n.by_name(&ns_name) {
Some(v) => v,
None => {
self.namespace_count.inc(1);
Arc::clone(v)
}
Entry::Occupied(v) => Arc::clone(v.get()),
};
Ok(data)
// Insert the table and then return a ref to it.
n.insert(
ns_name.clone(),
NamespaceData::new(
namespace.id,
ns_name,
self.shard_id,
Arc::clone(&self.partition_provider),
&*self.metrics,
),
)
}
})
}
/// Return the progress of this shard
pub(super) async fn progress(&self) -> ShardProgress {
let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect();
let namespaces: Vec<_> = self
.namespaces
.read()
.by_id
.values()
.map(Arc::clone)
.collect();
let mut progress = ShardProgress::new();
@ -144,3 +191,89 @@ impl ShardData {
self.shard_index
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use data_types::{PartitionId, PartitionKey, ShardIndex};
use metric::{Attributes, Metric};
use crate::{
data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
lifecycle::mock_handle::MockLifecycleHandle,
test_util::{make_write_op, populate_catalog},
};
use super::*;
const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
const TABLE_NAME: &str = "bananas";
const NAMESPACE_NAME: &str = "platanos";
#[tokio::test]
async fn test_shard_double_ref() {
let metrics = Arc::new(metric::Registry::default());
let catalog: Arc<dyn Catalog> =
Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));
// Populate the catalog with the shard / namespace / table
let (shard_id, ns_id, table_id) =
populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;
// Configure the mock partition provider to return a partition for this
// table ID.
let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
PartitionData::new(
PartitionId::new(0),
PartitionKey::from("banana-split"),
shard_id,
ns_id,
table_id,
TABLE_NAME.into(),
SortKeyState::Provided(None),
None,
),
));
let shard = ShardData::new(
SHARD_INDEX,
shard_id,
partition_provider,
Arc::clone(&metrics),
);
// Assert the namespace does not contain the test data
assert!(shard.namespace(&NAMESPACE_NAME.into()).is_none());
assert!(shard.namespace_by_id(ns_id).is_none());
// Write some test data
shard
.buffer_operation(
DmlOperation::Write(make_write_op(
&PartitionKey::from("banana-split"),
SHARD_INDEX,
NAMESPACE_NAME,
0,
r#"bananas,city=Medford day="sun",temp=55 22"#,
)),
&catalog,
&MockLifecycleHandle::default(),
)
.await
.expect("buffer op should succeed");
// Both forms of referencing the table should succeed
assert!(shard.namespace(&NAMESPACE_NAME.into()).is_some());
assert!(shard.namespace_by_id(ns_id).is_some());
// And the table counter metric should increase
let tables = metrics
.get_instrument::<Metric<U64Counter>>("ingester_namespaces_total")
.expect("failed to read metric")
.get_observer(&Attributes::from([]))
.expect("failed to get observer")
.fetch();
assert_eq!(tables, 1);
}
}

View File

@ -1,41 +1,94 @@
//! Table level data buffer structures.
use std::{collections::BTreeMap, sync::Arc};
use std::{collections::HashMap, sync::Arc};
use data_types::{
DeletePredicate, NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId, Timestamp,
};
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId};
use mutable_batch::MutableBatch;
use snafu::ResultExt;
use observability_deps::tracing::*;
use write_summary::ShardProgress;
use super::partition::{
resolver::PartitionProvider, PartitionData, PartitionStatus, UnpersistedPartitionData,
};
use crate::lifecycle::LifecycleHandle;
use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData};
use crate::{lifecycle::LifecycleHandle, querier_handler::PartitionStatus};
/// A double-referenced map where [`PartitionData`] can be looked up by
/// [`PartitionKey`], or ID.
#[derive(Debug, Default)]
struct DoubleRef {
    // TODO(4880): this can be removed when IDs are sent over the wire.
    by_key: HashMap<PartitionKey, PartitionData>,
    by_id: HashMap<PartitionId, PartitionKey>,
}

impl DoubleRef {
    /// Add `p` to the map, indexed by both its partition key and partition ID.
    ///
    /// # Panics
    ///
    /// Panics if an entry already exists for either the key or the ID - each
    /// partition may be inserted at most once.
    fn insert(&mut self, p: PartitionData) {
        let partition_id = p.partition_id();
        let partition_key = p.partition_key().clone();

        assert!(self.by_key.insert(partition_key.clone(), p).is_none());
        assert!(self.by_id.insert(partition_id, partition_key).is_none());
    }

    /// Look up the [`PartitionData`] for `key`.
    #[cfg(test)]
    fn by_key(&self, key: &PartitionKey) -> Option<&PartitionData> {
        self.by_key.get(key)
    }

    /// Look up a mutable reference to the [`PartitionData`] for `key`.
    fn by_key_mut(&mut self, key: &PartitionKey) -> Option<&mut PartitionData> {
        self.by_key.get_mut(key)
    }

    /// Look up a mutable reference to the [`PartitionData`] with the given
    /// `id`, translating through the ID -> key index.
    fn by_id_mut(&mut self, id: PartitionId) -> Option<&mut PartitionData> {
        let key = self.by_id.get(&id)?.clone();
        self.by_key_mut(&key)
    }
}
/// The string name / identifier of a Table.
///
/// A reference-counted, cheap clone-able string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TableName(Arc<str>);

impl<T> From<T> for TableName
where
    T: AsRef<str>,
{
    fn from(v: T) -> Self {
        // Copy the borrowed string content into a shared, immutable Arc<str>.
        TableName(v.as_ref().into())
    }
}

impl std::fmt::Display for TableName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate straight to the inner string's Display impl.
        std::fmt::Display::fmt(&self.0, f)
    }
}

impl std::ops::Deref for TableName {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
/// Data of a Table in a given Namespace that belongs to a given Shard
#[derive(Debug)]
pub(crate) struct TableData {
table_id: TableId,
table_name: Arc<str>,
table_name: TableName,
/// The catalog ID of the shard & namespace this table is being populated
/// from.
shard_id: ShardId,
namespace_id: NamespaceId,
// the max sequence number for a tombstone associated with this table
tombstone_max_sequence_number: Option<SequenceNumber>,
/// An abstract constructor of [`PartitionData`] instances for a given
/// `(key, shard, table)` triplet.
partition_provider: Arc<dyn PartitionProvider>,
// Map pf partition key to its data
pub(super) partition_data: BTreeMap<PartitionKey, PartitionData>,
// Map of partition key to its data
partition_data: DoubleRef,
}
impl TableData {
@ -51,18 +104,16 @@ impl TableData {
/// for the first time.
pub(super) fn new(
table_id: TableId,
table_name: &str,
table_name: TableName,
shard_id: ShardId,
namespace_id: NamespaceId,
tombstone_max_sequence_number: Option<SequenceNumber>,
partition_provider: Arc<dyn PartitionProvider>,
) -> Self {
Self {
table_id,
table_name: table_name.into(),
table_name,
shard_id,
namespace_id,
tombstone_max_sequence_number,
partition_data: Default::default(),
partition_provider,
}
@ -71,18 +122,13 @@ impl TableData {
/// Return parquet_max_sequence_number
pub(super) fn parquet_max_sequence_number(&self) -> Option<SequenceNumber> {
self.partition_data
.by_key
.values()
.map(|p| p.max_persisted_sequence_number())
.max()
.flatten()
}
/// Return tombstone_max_sequence_number
#[allow(dead_code)] // Used in tests
pub(super) fn tombstone_max_sequence_number(&self) -> Option<SequenceNumber> {
self.tombstone_max_sequence_number
}
// buffers the table write and returns true if the lifecycle manager indicates that
// ingest should be paused.
pub(super) async fn buffer_table_write(
@ -92,7 +138,7 @@ impl TableData {
partition_key: PartitionKey,
lifecycle_handle: &dyn LifecycleHandle,
) -> Result<bool, super::Error> {
let partition_data = match self.partition_data.get_mut(&partition_key) {
let partition_data = match self.partition_data.by_key.get_mut(&partition_key) {
Some(p) => p,
None => {
let p = self
@ -102,86 +148,87 @@ impl TableData {
self.shard_id,
self.namespace_id,
self.table_id,
Arc::clone(&self.table_name),
self.table_name.clone(),
)
.await;
// Add the partition to the map.
assert!(self
.partition_data
.insert(partition_key.clone(), p)
.is_none());
self.partition_data.get_mut(&partition_key).unwrap()
// Add the double-referenced partition to the map.
self.partition_data.insert(p);
self.partition_data.by_key_mut(&partition_key).unwrap()
}
};
// skip the write if it has already been persisted
if let Some(max) = partition_data.max_persisted_sequence_number() {
if max >= sequence_number {
trace!(
shard_id=%self.shard_id,
op_sequence_number=?sequence_number,
"skipping already-persisted write"
);
return Ok(false);
}
}
let size = batch.size();
let rows = batch.rows();
partition_data.buffer_write(sequence_number, batch)?;
// Record the write as having been buffered.
//
// This should happen AFTER the write is applied, because buffering the
// op may fail which would lead to a write being recorded, but not
// applied.
let should_pause = lifecycle_handle.log_write(
partition_data.id(),
partition_data.partition_id(),
self.shard_id,
self.namespace_id,
self.table_id,
sequence_number,
batch.size(),
batch.rows(),
size,
rows,
);
partition_data.buffer_write(sequence_number, batch)?;
Ok(should_pause)
}
pub(super) async fn buffer_delete(
/// Return the [`PartitionData`] for the specified ID.
#[allow(unused)]
pub(crate) fn get_partition(
&mut self,
predicate: &DeletePredicate,
sequence_number: SequenceNumber,
catalog: &dyn Catalog,
executor: &Executor,
) -> Result<(), super::Error> {
let min_time = Timestamp::new(predicate.range.start());
let max_time = Timestamp::new(predicate.range.end());
partition_id: PartitionId,
) -> Option<&mut PartitionData> {
self.partition_data.by_id_mut(partition_id)
}
let mut repos = catalog.repositories().await;
let tombstone = repos
.tombstones()
.create_or_get(
self.table_id,
self.shard_id,
sequence_number,
min_time,
max_time,
&predicate.expr_sql_string(),
)
.await
.context(super::CatalogSnafu)?;
/// Return the [`PartitionData`] for the specified partition key.
#[cfg(test)]
pub(crate) fn get_partition_by_key(
&self,
partition_key: &PartitionKey,
) -> Option<&PartitionData> {
self.partition_data.by_key(partition_key)
}
// remember "persisted" state
self.tombstone_max_sequence_number = Some(sequence_number);
// modify one partition at a time
for data in self.partition_data.values_mut() {
data.buffer_tombstone(executor, tombstone.clone()).await;
}
Ok(())
/// Return the [`PartitionData`] for the specified partition key.
pub(crate) fn get_partition_by_key_mut(
&mut self,
partition_key: &PartitionKey,
) -> Option<&mut PartitionData> {
self.partition_data.by_key_mut(partition_key)
}
pub(crate) fn unpersisted_partition_data(&self) -> Vec<UnpersistedPartitionData> {
self.partition_data
.by_key
.values()
.map(|p| UnpersistedPartitionData {
partition_id: p.id(),
partition_id: p.partition_id(),
non_persisted: p
.get_non_persisting_data()
.expect("get_non_persisting should always work"),
persisting: p.get_persisting_data(),
partition_status: PartitionStatus {
parquet_max_sequence_number: p.max_persisted_sequence_number(),
tombstone_max_sequence_number: self.tombstone_max_sequence_number,
},
})
.collect()
@ -196,14 +243,223 @@ impl TableData {
};
self.partition_data
.by_key
.values()
.fold(progress, |progress, partition_data| {
progress.combine(partition_data.progress())
})
}
#[cfg(test)]
/// Returns the catalog ID of this table (NOT of a partition — the original
/// comment's "for this partition" wording was inaccurate).
pub(super) fn table_id(&self) -> TableId {
    self.table_id
}
/// Returns the name of the table this instance buffers writes for.
pub(crate) fn table_name(&self) -> &TableName {
    &self.table_name
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use data_types::{PartitionId, ShardIndex};
use iox_catalog::interface::Catalog;
use mutable_batch::writer;
use mutable_batch_lp::lines_to_batches;
use schema::{InfluxColumnType, InfluxFieldType};
use crate::{
data::{
partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState},
Error,
},
lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle},
test_util::populate_catalog,
};
use super::*;
const SHARD_INDEX: ShardIndex = ShardIndex::new(24);
const TABLE_NAME: &str = "bananas";
const NAMESPACE_NAME: &str = "platanos";
const PARTITION_KEY: &str = "platanos";
const PARTITION_ID: PartitionId = PartitionId::new(0);
/// Ensure a partition initialised by a buffered write is reachable through
/// BOTH lookup paths (by partition key and by partition ID).
#[tokio::test]
async fn test_partition_double_ref() {
    let metrics = Arc::new(metric::Registry::default());
    let catalog: Arc<dyn Catalog> =
        Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

    // Populate the catalog with the shard / namespace / table
    let (shard_id, ns_id, table_id) =
        populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

    // Configure the mock partition provider to return a partition for this
    // table ID.
    let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
        PartitionData::new(
            PARTITION_ID,
            PARTITION_KEY.into(),
            shard_id,
            ns_id,
            table_id,
            TABLE_NAME.into(),
            SortKeyState::Provided(None),
            None,
        ),
    ));

    let mut table = TableData::new(
        table_id,
        TABLE_NAME.into(),
        shard_id,
        ns_id,
        partition_provider,
    );

    // A single line-protocol row targeting the table under test.
    let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    // Assert the table does not contain the test partition
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none());
    assert!(table.partition_data.by_id_mut(PARTITION_ID).is_none());

    // Write some test data
    let pause = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &MockLifecycleHandle::default(),
        )
        .await
        .expect("buffer op should succeed");
    // The mock lifecycle handle never requests an ingest pause.
    assert!(!pause);

    // Referencing the partition should succeed
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some());
    assert!(table.partition_data.by_id_mut(PARTITION_ID).is_some());
}
/// Ensure that a write which fails to buffer (schema/type conflict) does NOT
/// report bytes to the lifecycle manager — only the successful first write
/// must be accounted for.
#[tokio::test]
async fn test_bad_write_memory_counting() {
    let metrics = Arc::new(metric::Registry::default());
    let catalog: Arc<dyn Catalog> =
        Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics)));

    // Populate the catalog with the shard / namespace / table
    let (shard_id, ns_id, table_id) =
        populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await;

    // Configure the mock partition provider to return a partition for this
    // table ID.
    let partition_provider = Arc::new(MockPartitionProvider::default().with_partition(
        PartitionData::new(
            PARTITION_ID,
            PARTITION_KEY.into(),
            shard_id,
            ns_id,
            table_id,
            TABLE_NAME.into(),
            SortKeyState::Provided(None),
            None,
        ),
    ));

    let mut table = TableData::new(
        table_id,
        TABLE_NAME.into(),
        shard_id,
        ns_id,
        partition_provider,
    );

    // First write: a numeric (float) "value" field.
    let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    // Initialise the mock lifecycle handle and use it to inspect the calls
    // made to the lifecycle manager during buffering.
    let handle = MockLifecycleHandle::default();

    // Assert the table does not contain the test partition
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none());

    // Write some test data
    let pause = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &handle,
        )
        .await
        .expect("buffer op should succeed");
    assert!(!pause);

    // Referencing the partition should succeed
    assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some());

    // And the lifecycle handle was called with the expected values
    assert_eq!(
        handle.get_log_calls(),
        &[MockLifecycleCall {
            partition_id: PARTITION_ID,
            shard_id,
            namespace_id: ns_id,
            table_id,
            sequence_number: SequenceNumber::new(42),
            // NOTE: byte count is implementation-defined; update if the
            // buffer's size accounting changes.
            bytes_written: 1131,
            rows_written: 1,
        }]
    );

    // Attempt to buffer the second op that contains a type conflict - this
    // should return an error, and not make a call to the lifecycle handle
    // (as no data was buffered)
    //
    // Note the type of value was numeric previously, and here it is a string.
    let batch = lines_to_batches(r#"bananas,bat=man value="platanos" 42"#, 0)
        .unwrap()
        .remove(TABLE_NAME)
        .unwrap();

    let err = table
        .buffer_table_write(
            SequenceNumber::new(42),
            batch,
            PARTITION_KEY.into(),
            &handle,
        )
        .await
        .expect_err("type conflict should error");

    // The buffer op should return a column type error
    assert_matches!(
        err,
        Error::BufferWrite {
            source: mutable_batch::Error::WriterError {
                source: writer::Error::TypeMismatch {
                    existing: InfluxColumnType::Field(InfluxFieldType::Float),
                    inserted: InfluxColumnType::Field(InfluxFieldType::String),
                    column: col_name,
                }
            },
        } => { assert_eq!(col_name, "value") }
    );

    // And the lifecycle handle should not be called.
    //
    // It still contains the first call, so the desired length is 1
    // indicating no second call was made.
    assert_eq!(handle.get_log_calls().len(), 1);
}
}

View File

@ -30,17 +30,24 @@ use crate::{
data::{
partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider},
shard::ShardData,
IngesterData, IngesterQueryResponse,
IngesterData,
},
lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager},
poison::PoisonCabinet,
querier_handler::prepare_data_to_querier,
querier_handler::{prepare_data_to_querier, IngesterQueryResponse},
stream_handler::{
handler::SequencedStreamHandler, sink_adaptor::IngestSinkAdaptor,
sink_instrumentation::SinkInstrumentation, PeriodicWatermarkFetcher,
},
};
/// The maximum duration of time between creating a [`PartitionData`] and its
/// [`SortKey`] being fetched from the catalog.
///
/// [`PartitionData`]: crate::data::partition::PartitionData
/// [`SortKey`]: schema::sort::SortKey
const SORT_KEY_PRE_FETCH: Duration = Duration::from_secs(30);
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
@ -160,7 +167,13 @@ impl IngestHandlerImpl {
// Build the partition provider.
let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog));
let partition_provider = PartitionCache::new(partition_provider, recent_partitions);
let partition_provider = PartitionCache::new(
partition_provider,
recent_partitions,
SORT_KEY_PRE_FETCH,
Arc::clone(&catalog),
BackoffConfig::default(),
);
let partition_provider: Arc<dyn PartitionProvider> = Arc::new(partition_provider);
// build the initial ingester data state
@ -432,7 +445,7 @@ mod tests {
use write_buffer::mock::{MockBufferForReading, MockBufferSharedState};
use super::*;
use crate::data::partition::SnapshotBatch;
use crate::data::{partition::SnapshotBatch, table::TableName};
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
@ -499,13 +512,16 @@ mod tests {
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there from both writes.
tokio::time::timeout(Duration::from_secs(2), async {
let ns_name = ingester.namespace.name.into();
let table_name = TableName::from("a");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) {
if let Some(data) = data.namespace(&ingester.namespace.name) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await {
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
if b.data.num_rows() > 0 {
has_measurement = true;
@ -740,13 +756,16 @@ mod tests {
// give the writes some time to go through the buffer. Exit once we've verified there's
// data in there
tokio::time::timeout(Duration::from_secs(1), async move {
let ns_name = namespace.name.into();
let table_name = TableName::from("cpu");
loop {
let mut has_measurement = false;
if let Some(data) = ingester.data.shard(shard.id) {
if let Some(data) = data.namespace(&namespace.name) {
if let Some(data) = data.namespace(&ns_name) {
// verify there's data in the buffer
if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await {
if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await
{
if let Some(b) = b.first() {
custom_batch_verification(b);

View File

@ -12,7 +12,7 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration};
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId};
use iox_time::{Time, TimeProvider};
use metric::{Metric, U64Counter};
use observability_deps::tracing::{error, info, warn};
use observability_deps::tracing::{error, info, trace, warn};
use parking_lot::Mutex;
use tokio_util::sync::CancellationToken;
use tracker::TrackedFutureExt;
@ -97,6 +97,18 @@ impl LifecycleHandle for LifecycleHandleImpl {
stats.last_write = now;
stats.rows_written += rows_written;
trace!(
shard_id=%stats.shard_id,
partition_id=%stats.partition_id,
namespace_id=%stats.namespace_id,
table_id=%stats.table_id,
first_write=%stats.first_write,
last_write=%stats.last_write,
bytes_written=%stats.bytes_written,
first_sequence_number=?stats.first_sequence_number,
"logged write"
);
s.total_bytes += bytes_written;
// Pause if the server has exceeded the configured memory limit.
@ -234,7 +246,7 @@ struct LifecycleStats {
}
/// The stats for a partition
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
struct PartitionLifecycleStats {
/// The shard this partition is under
shard_id: ShardId,
@ -469,6 +481,18 @@ impl LifecycleManager {
let persist_tasks: Vec<_> = to_persist
.into_iter()
.map(|s| {
// BUG: TOCTOU: memory usage released may be incorrect.
//
// Here the amount of memory to be reduced is acquired, but this
// code does not prevent continued writes adding more data to
// the partition in another thread.
//
// This may lead to more actual data being persisted than the
// call below returns to the server pool - this would slowly
// starve the ingester of memory it thinks it has.
//
// See https://github.com/influxdata/influxdb_iox/issues/5777
// Mark this partition as being persisted, and remember the
// memory allocation it had accumulated.
let partition_memory_usage = self
@ -483,7 +507,9 @@ impl LifecycleManager {
let state = Arc::clone(&self.state);
tokio::task::spawn(async move {
persister.persist(s.partition_id).await;
persister
.persist(s.shard_id, s.namespace_id, s.table_id, s.partition_id)
.await;
// Now the data has been uploaded and the memory it was
// using has been freed, released the memory capacity back
// the ingester.
@ -524,6 +550,12 @@ impl LifecycleManager {
.map(|s| s.first_sequence_number)
.min()
.unwrap_or(sequence_number);
trace!(
min_unpersisted_sequence_number=?min,
shard_id=%shard_id,
sequence_number=?sequence_number,
"updated min_unpersisted_sequence_number for persisted shard"
);
persister
.update_min_unpersisted_sequence_number(shard_id, min)
.await;
@ -602,7 +634,13 @@ mod tests {
#[async_trait]
impl Persister for TestPersister {
async fn persist(&self, partition_id: PartitionId) {
async fn persist(
&self,
_shard_id: ShardId,
_namespace_id: NamespaceId,
_table_id: TableId,
partition_id: PartitionId,
) {
let mut p = self.persist_called.lock();
p.insert(partition_id);
}
@ -662,8 +700,16 @@ mod tests {
#[async_trait]
impl Persister for PausablePersister {
async fn persist(&self, partition_id: PartitionId) {
self.inner.persist(partition_id).await;
async fn persist(
&self,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
) {
self.inner
.persist(shard_id, namespace_id, table_id, partition_id)
.await;
if let Some(event) = self.event(partition_id) {
event.before.wait().await;
event.after.wait().await;

View File

@ -1,26 +1,66 @@
//! A mock [`LifecycleHandle`] impl for testing.
use std::sync::Arc;
use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId};
use parking_lot::Mutex;
use super::LifecycleHandle;
/// Special [`LifecycleHandle`] that never persists and always accepts more data.
///
/// This is useful to control persists manually.
#[derive(Debug, Default, Clone, Copy)]
pub struct NoopLifecycleHandle;
/// A set of arguments captured from a call to
/// [`MockLifecycleHandle::log_write()`].
// Derives `PartialEq`/`Eq` so tests can assert directly on captured calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(missing_docs)]
pub struct MockLifecycleCall {
    pub partition_id: PartitionId,
    pub shard_id: ShardId,
    pub namespace_id: NamespaceId,
    pub table_id: TableId,
    pub sequence_number: SequenceNumber,
    pub bytes_written: usize,
    pub rows_written: usize,
}
impl LifecycleHandle for NoopLifecycleHandle {
/// A mock [`LifecycleHandle`] implementation that records calls made to
/// [`Self::log_write()`] and never blocks ingest, always accepting more data.
///
/// # Cloning
///
/// Cloning a [`MockLifecycleHandle`] will clone the inner state - calls to all
/// cloned instances are reported in a call to [`Self::get_log_calls()`].
#[derive(Debug, Default, Clone)]
pub struct MockLifecycleHandle {
    // Shared, mutex-guarded call log; the `Arc` is what makes every clone
    // observe the same recorded calls.
    log_calls: Arc<Mutex<Vec<MockLifecycleCall>>>,
}
impl MockLifecycleHandle {
    /// Returns a copy of the ordered [`Self::log_write()`] calls recorded by
    /// this mock (including calls made through any of its clones).
    pub fn get_log_calls(&self) -> Vec<MockLifecycleCall> {
        let guard = self.log_calls.lock();
        (*guard).clone()
    }
}
impl LifecycleHandle for MockLifecycleHandle {
fn log_write(
&self,
_partition_id: PartitionId,
_shard_id: ShardId,
_namespace_id: NamespaceId,
_table_id: TableId,
_sequence_number: SequenceNumber,
_bytes_written: usize,
_rows_written: usize,
partition_id: PartitionId,
shard_id: ShardId,
namespace_id: NamespaceId,
table_id: TableId,
sequence_number: SequenceNumber,
bytes_written: usize,
rows_written: usize,
) -> bool {
self.log_calls.lock().push(MockLifecycleCall {
partition_id,
shard_id,
namespace_id,
table_id,
sequence_number,
bytes_written,
rows_written,
});
// do NOT pause ingest
false
}

View File

@ -1,10 +1,13 @@
//! Handle all requests from Querier
use std::sync::Arc;
use std::{pin::Pin, sync::Arc};
use arrow::{error::ArrowError, record_batch::RecordBatch};
use arrow_util::optimize::{optimize_record_batch, optimize_schema};
use data_types::{PartitionId, SequenceNumber};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
use futures::StreamExt;
use futures::{Stream, StreamExt};
use generated_types::ingester::IngesterQueryRequest;
use observability_deps::tracing::debug;
use schema::selection::Selection;
@ -12,8 +15,8 @@ use snafu::{ensure, Snafu};
use crate::{
data::{
partition::UnpersistedPartitionData, IngesterData, IngesterQueryPartition,
IngesterQueryResponse,
namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName,
IngesterData,
},
query::QueryableBatch,
};
@ -47,6 +50,159 @@ pub enum Error {
/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Stream of snapshots.
///
/// Every snapshot is a dedicated [`SendableRecordBatchStream`].
pub(crate) type SnapshotStream =
Pin<Box<dyn Stream<Item = Result<SendableRecordBatchStream, ArrowError>> + Send>>;
/// Status of a partition that has unpersisted data.
///
/// Note that this structure is specific to a partition (which itself is bound to a table and
/// shard)!
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_copy_implementations)]
pub struct PartitionStatus {
    /// Max sequence number persisted.
    ///
    /// `None` presumably means nothing has been persisted for this partition
    /// yet — TODO(review): confirm against the producer of this value.
    pub parquet_max_sequence_number: Option<SequenceNumber>,
}
/// Response data for a single partition.
///
/// One element of the partition stream carried by `IngesterQueryResponse`;
/// flattened onto the wire by `IngesterQueryResponse::flatten()`.
pub(crate) struct IngesterQueryPartition {
    /// Stream of snapshots.
    snapshots: SnapshotStream,

    /// Partition ID.
    id: PartitionId,

    /// Partition persistence status.
    status: PartitionStatus,
}
impl std::fmt::Debug for IngesterQueryPartition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The snapshot stream cannot implement `Debug`, so a fixed
        // placeholder is rendered in its place; remaining fields are shown
        // as-is.
        let mut builder = f.debug_struct("IngesterQueryPartition");
        builder.field("snapshots", &"<SNAPSHOT STREAM>");
        builder.field("id", &self.id);
        builder.field("status", &self.status);
        builder.finish()
    }
}
impl IngesterQueryPartition {
    /// Construct a partition response from a stream of `snapshots`, the
    /// catalog `id` of the partition, and its persistence `status`.
    pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self {
        Self {
            snapshots,
            id,
            status,
        }
    }
}
/// Stream of partitions in this response.
pub(crate) type IngesterQueryPartitionStream =
Pin<Box<dyn Stream<Item = Result<IngesterQueryPartition, ArrowError>> + Send>>;
/// Response streams for querier<>ingester requests.
///
/// The data structure is constructed to allow lazy/streaming data generation. For easier
/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method.
pub struct IngesterQueryResponse {
    /// Stream of partitions.
    partitions: IngesterQueryPartitionStream,
}
impl std::fmt::Debug for IngesterQueryResponse {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The partition stream cannot implement `Debug`; render a fixed
        // placeholder instead.
        let mut builder = f.debug_struct("IngesterQueryResponse");
        builder.field("partitions", &"<PARTITION STREAM>");
        builder.finish()
    }
}
impl IngesterQueryResponse {
    /// Make a response from the given stream of partitions.
    pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self {
        Self { partitions }
    }

    /// Flattens the data according to the wire protocol.
    ///
    /// Each partition becomes a `StartPartition` marker followed by its
    /// snapshots; each snapshot becomes a `StartSnapshot` marker (carrying
    /// the optimised schema) followed by its record batches. Errors at any
    /// nesting level are passed through in stream order.
    pub fn flatten(self) -> FlatIngesterQueryResponseStream {
        self.partitions
            .flat_map(|partition_res| match partition_res {
                Ok(partition) => {
                    // Announce the partition before any of its snapshot data.
                    let head = futures::stream::once(async move {
                        Ok(FlatIngesterQueryResponse::StartPartition {
                            partition_id: partition.id,
                            status: partition.status,
                        })
                    });
                    let tail = partition
                        .snapshots
                        .flat_map(|snapshot_res| match snapshot_res {
                            Ok(snapshot) => {
                                // Optimise the schema once per snapshot and
                                // share it with every batch of that snapshot.
                                let schema = Arc::new(optimize_schema(&snapshot.schema()));
                                let schema_captured = Arc::clone(&schema);
                                let head = futures::stream::once(async {
                                    Ok(FlatIngesterQueryResponse::StartSnapshot {
                                        schema: schema_captured,
                                    })
                                });
                                let tail = snapshot.map(move |batch_res| match batch_res {
                                    Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch {
                                        batch: optimize_record_batch(&batch, Arc::clone(&schema))?,
                                    }),
                                    Err(e) => Err(e),
                                });
                                head.chain(tail).boxed()
                            }
                            // A failed snapshot yields its error in-place.
                            Err(e) => futures::stream::once(async { Err(e) }).boxed(),
                        });
                    head.chain(tail).boxed()
                }
                // A failed partition yields its error in-place.
                Err(e) => futures::stream::once(async { Err(e) }).boxed(),
            })
            .boxed()
    }
}
/// Flattened version of [`IngesterQueryResponse`].
pub(crate) type FlatIngesterQueryResponseStream =
Pin<Box<dyn Stream<Item = Result<FlatIngesterQueryResponse, ArrowError>> + Send>>;
/// Element within the flat wire protocol.
///
/// Produced by [`IngesterQueryResponse::flatten()`]. Consumers must track the
/// most recent `StartPartition`/`StartSnapshot` marker to associate each
/// subsequent message with its partition and schema.
#[derive(Debug, PartialEq)]
pub enum FlatIngesterQueryResponse {
    /// Start a new partition.
    StartPartition {
        /// Partition ID.
        partition_id: PartitionId,

        /// Partition persistence status.
        status: PartitionStatus,
    },

    /// Start a new snapshot.
    ///
    /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition)
    /// message.
    StartSnapshot {
        /// Snapshot schema.
        schema: Arc<arrow::datatypes::Schema>,
    },

    /// Add a record batch to the snapshot that was announced by the last
    /// [`StartSnapshot`](Self::StartSnapshot) message.
    RecordBatch {
        /// Record batch.
        batch: RecordBatch,
    },
}
/// Return data to send as a response back to the Querier per its request
pub async fn prepare_data_to_querier(
ingest_data: &Arc<IngesterData>,
@ -57,7 +213,8 @@ pub async fn prepare_data_to_querier(
let mut found_namespace = false;
for (shard_id, shard_data) in ingest_data.shards() {
debug!(shard_id=%shard_id.get());
let namespace_data = match shard_data.namespace(&request.namespace) {
let namespace_name = NamespaceName::from(&request.namespace);
let namespace_data = match shard_data.namespace(&namespace_name) {
Some(namespace_data) => {
debug!(namespace=%request.namespace, "found namespace");
found_namespace = true;
@ -68,7 +225,8 @@ pub async fn prepare_data_to_querier(
}
};
let table_data = match namespace_data.table_data(&request.table) {
let table_name = TableName::from(&request.table);
let table_data = match namespace_data.table_data(&table_name) {
Some(table_data) => {
debug!(table_name=%request.table, "found table");
table_data
@ -153,7 +311,6 @@ fn prepare_data_to_querier_for_partition(
request.table.clone().into(),
unpersisted_partition_data.partition_id,
vec![],
vec![],
)
})
.with_data(unpersisted_partition_data.non_persisted);
@ -188,22 +345,106 @@ fn prepare_data_to_querier_for_partition(
#[cfg(test)]
mod tests {
use arrow::{array::new_null_array, record_batch::RecordBatch};
use std::task::{Context, Poll};
use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch};
use arrow_util::assert_batches_sorted_eq;
use assert_matches::assert_matches;
use datafusion::logical_plan::{col, lit};
use datafusion::{
logical_plan::{col, lit},
physical_plan::RecordBatchStream,
};
use futures::TryStreamExt;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use predicate::Predicate;
use schema::merge::SchemaMerger;
use super::*;
use crate::{
data::FlatIngesterQueryResponse,
test_util::{
make_ingester_data, make_ingester_data_with_tombstones, DataLocation, TEST_NAMESPACE,
TEST_TABLE,
},
};
use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE};
/// Exercise `IngesterQueryResponse::flatten()` against nested streams that
/// interleave successful partitions/snapshots/batches with errors at every
/// level, and verify the flattened message order.
#[tokio::test]
async fn test_ingester_query_response_flatten() {
    let batch_1_1 = lp_to_batch("table x=1 0");
    let batch_1_2 = lp_to_batch("table x=2 1");
    let batch_2 = lp_to_batch("table y=1 10");
    let batch_3 = lp_to_batch("table z=1 10");

    let schema_1 = batch_1_1.schema();
    let schema_2 = batch_2.schema();
    let schema_3 = batch_3.schema();

    // Partition 2 carries: a snapshot with an embedded batch error, a
    // snapshot-level error, a one-batch snapshot, and an empty snapshot.
    // A partition-level error follows, then an empty partition 1.
    let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([
        Ok(IngesterQueryPartition::new(
            Box::pin(futures::stream::iter([
                Ok(Box::pin(TestRecordBatchStream::new(
                    vec![
                        Ok(batch_1_1.clone()),
                        Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
                        Ok(batch_1_2.clone()),
                    ],
                    Arc::clone(&schema_1),
                )) as _),
                Err(ArrowError::InvalidArgumentError("invalid arg".into())),
                Ok(Box::pin(TestRecordBatchStream::new(
                    vec![Ok(batch_2.clone())],
                    Arc::clone(&schema_2),
                )) as _),
                Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _),
            ])),
            PartitionId::new(2),
            PartitionStatus {
                parquet_max_sequence_number: None,
            },
        )),
        Err(ArrowError::IoError("some io error".into())),
        Ok(IngesterQueryPartition::new(
            Box::pin(futures::stream::iter([])),
            PartitionId::new(1),
            PartitionStatus {
                parquet_max_sequence_number: None,
            },
        )),
    ])));

    let actual: Vec<_> = response.flatten().collect().await;
    let expected = vec![
        Ok(FlatIngesterQueryResponse::StartPartition {
            partition_id: PartitionId::new(2),
            status: PartitionStatus {
                parquet_max_sequence_number: None,
            },
        }),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }),
        Err(ArrowError::NotYetImplemented("not yet implemeneted".into())),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }),
        Err(ArrowError::InvalidArgumentError("invalid arg".into())),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }),
        Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }),
        Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }),
        Err(ArrowError::IoError("some io error".into())),
        Ok(FlatIngesterQueryResponse::StartPartition {
            partition_id: PartitionId::new(1),
            status: PartitionStatus {
                parquet_max_sequence_number: None,
            },
        }),
    ];

    // Compare element-wise: Ok values must match exactly; errors only need
    // to line up positionally because `ArrowError` is not comparable.
    assert_eq!(actual.len(), expected.len());
    for (actual, expected) in actual.into_iter().zip(expected) {
        match (actual, expected) {
            (Ok(actual), Ok(expected)) => {
                assert_eq!(actual, expected);
            }
            (Err(_), Err(_)) => {
                // cannot compare `ArrowError`, but it's unlikely that someone changed the error
            }
            (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"),
            (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"),
        }
    }
}
#[tokio::test]
async fn test_prepare_data_to_querier() {
@ -360,180 +601,44 @@ mod tests {
}
}
#[tokio::test]
async fn test_prepare_data_to_querier_with_tombstones() {
test_helpers::maybe_start_logging();
/// A test double implementing `RecordBatchStream` that yields a fixed,
/// pre-scripted sequence of results.
pub struct TestRecordBatchStream {
    // Schema reported by `schema()`, supplied independently of the batches.
    schema: SchemaRef,
    // Remaining items to yield, in order; entries may be errors.
    batches: Vec<Result<RecordBatch, ArrowError>>,
}
// make 7 scenarios for ingester data with tombstones
let mut scenarios = vec![];
for loc in &[
DataLocation::BUFFER,
DataLocation::BUFFER_SNAPSHOT,
DataLocation::BUFFER_PERSISTING,
DataLocation::BUFFER_SNAPSHOT_PERSISTING,
DataLocation::SNAPSHOT,
DataLocation::SNAPSHOT_PERSISTING,
DataLocation::PERSISTING,
] {
let scenario = Arc::new(make_ingester_data_with_tombstones(*loc).await);
scenarios.push((loc, scenario));
impl TestRecordBatchStream {
    /// Construct a stream that yields `batches` in order and reports `schema`
    /// as the stream schema.
    pub fn new(batches: Vec<Result<RecordBatch, ArrowError>>, schema: SchemaRef) -> Self {
        Self { schema, batches }
    }
}
// read data from all scenarios without any filters
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec![],
None,
));
let expected_not_persisting = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Andover | mon | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Medford | wed | | 1970-01-01T00:00:00.000000026Z |",
"| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |",
"+------------+-----+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+-----+------+--------------------------------+",
"| city | day | temp | time |",
"+------------+-----+------+--------------------------------+",
"| Andover | mon | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | mon | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Medford | wed | | 1970-01-01T00:00:00.000000026Z |",
"| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |",
"+------------+-----+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
impl RecordBatchStream for TestRecordBatchStream {
    /// Returns (a clone of) the schema supplied at construction time.
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }
}
impl futures::Stream for TestRecordBatchStream {
type Item = Result<RecordBatch, ArrowError>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.batches.is_empty() {
Poll::Ready(None)
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
Poll::Ready(Some(self.batches.remove(0)))
}
}
// read data from all scenarios and filter out column day
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec!["city".to_string(), "temp".to_string(), "time".to_string()],
None,
));
let expected_not_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
fn size_hint(&self) -> (usize, Option<usize>) {
(self.batches.len(), Some(self.batches.len()))
}
}
// read data from all scenarios, filter out column day, city Medford, time outside range [0, 42)
let expr = col("city").not_eq(lit("Medford"));
let pred = Predicate::default().with_expr(expr).with_range(0, 42);
let request = Arc::new(IngesterQueryRequest::new(
TEST_NAMESPACE.to_string(),
TEST_TABLE.to_string(),
vec!["city".to_string(), "temp".to_string(), "time".to_string()],
Some(pred),
));
// predicates and de-dup are NOT applied!, otherwise this would look like this:
// let expected = vec![
// "+------------+------+--------------------------------+",
// "| city | temp | time |",
// "+------------+------+--------------------------------+",
// "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
// "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
// "| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
// "+------------+------+--------------------------------+",
// ];
let expected_not_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
// For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data
// transitioned into the "persisting" state. In this case, the ingester will apply the tombstones.
let expected_persisting = vec![
"+------------+------+--------------------------------+",
"| city | temp | time |",
"+------------+------+--------------------------------+",
"| Andover | | 1970-01-01T00:00:00.000000046Z |",
"| Andover | 56 | 1970-01-01T00:00:00.000000030Z |",
"| Boston | | 1970-01-01T00:00:00.000000038Z |",
"| Boston | 60 | 1970-01-01T00:00:00.000000036Z |",
"| Medford | | 1970-01-01T00:00:00.000000026Z |",
"| Medford | 55 | 1970-01-01T00:00:00.000000022Z |",
"| Reading | 58 | 1970-01-01T00:00:00.000000040Z |",
"| Wilmington | | 1970-01-01T00:00:00.000000035Z |",
"+------------+------+--------------------------------+",
];
for (loc, scenario) in &scenarios {
println!("Location: {loc:?}");
let expected = if loc.intersects(DataLocation::PERSISTING) {
&expected_persisting
} else {
&expected_not_persisting
};
let stream = prepare_data_to_querier(scenario, &request).await.unwrap();
let result = ingester_response_to_record_batches(stream).await;
assert_batches_sorted_eq!(expected, &result);
}
/// Convert a line-protocol string into an Arrow [`RecordBatch`] containing
/// all of its columns.
fn lp_to_batch(lp: &str) -> RecordBatch {
    let (_measurement, mutable) = lp_to_mutable_batch(lp);
    mutable.to_arrow(Selection::All).unwrap()
}
/// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es.

View File

@ -6,26 +6,26 @@ use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary,
TimestampMinMax, Tombstone,
TimestampMinMax,
};
use datafusion::physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
SendableRecordBatchStream,
use datafusion::{
error::DataFusionError,
physical_plan::{
common::SizedRecordBatchStream,
metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics},
SendableRecordBatchStream,
},
};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use predicate::{
delete_predicate::{tombstones_to_delete_predicates, tombstones_to_delete_predicates_iter},
Predicate,
};
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use crate::data::partition::SnapshotBatch;
use crate::data::{partition::SnapshotBatch, table::TableName};
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
@ -53,11 +53,8 @@ pub(crate) struct QueryableBatch {
/// data
pub(crate) data: Vec<Arc<SnapshotBatch>>,
/// Delete predicates of the tombstones
pub(crate) delete_predicates: Vec<Arc<DeletePredicate>>,
/// This is needed to return a reference for a trait function
pub(crate) table_name: Arc<str>,
pub(crate) table_name: TableName,
/// Partition ID
pub(crate) partition_id: PartitionId,
@ -66,15 +63,12 @@ pub(crate) struct QueryableBatch {
impl QueryableBatch {
/// Initilaize a QueryableBatch
pub(crate) fn new(
table_name: Arc<str>,
table_name: TableName,
partition_id: PartitionId,
data: Vec<Arc<SnapshotBatch>>,
deletes: Vec<Tombstone>,
) -> Self {
let delete_predicates = tombstones_to_delete_predicates(&deletes);
Self {
data,
delete_predicates,
table_name,
partition_id,
}
@ -86,12 +80,6 @@ impl QueryableBatch {
self
}
/// Add more tombstones
pub(crate) fn add_tombstones(&mut self, deletes: &[Tombstone]) {
let delete_predicates = tombstones_to_delete_predicates_iter(deletes);
self.delete_predicates.extend(delete_predicates);
}
/// return min and max of all the snapshots
pub(crate) fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) {
let min = self
@ -110,11 +98,6 @@ impl QueryableBatch {
(min, max)
}
/// return true if it has no data
pub(crate) fn is_empty(&self) -> bool {
self.data.is_empty()
}
}
impl QueryChunkMeta for QueryableBatch {
@ -144,16 +127,16 @@ impl QueryChunkMeta for QueryableBatch {
None // Ingester data is not sorted
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
self.delete_predicates.as_ref()
}
fn timestamp_min_max(&self) -> Option<TimestampMinMax> {
// Note: we need to consider which option we want to go with
// . Return None here and avoid taking time to compute time's min max of RecordBacthes (current choice)
// . Compute time's min max here and avoid compacting non-overlapped QueryableBatches in the Ingester
None
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
&[]
}
}
impl QueryChunk for QueryableBatch {
@ -185,7 +168,7 @@ impl QueryChunk for QueryableBatch {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -199,7 +182,7 @@ impl QueryChunk for QueryableBatch {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
@ -210,12 +193,16 @@ impl QueryChunk for QueryableBatch {
mut ctx: IOxSessionContext,
_predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
ctx.set_metadata("storage", "ingester");
ctx.set_metadata("projection", format!("{}", selection));
trace!(?selection, "selection");
let schema = self.schema().select(selection).context(SchemaSnafu)?;
let schema = self
.schema()
.select(selection)
.context(SchemaSnafu)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Get all record batches from their snapshots
let batches = self
@ -234,7 +221,8 @@ impl QueryChunk for QueryableBatch {
.map(Arc::new);
Some(batch)
})
.collect::<Result<Vec<_>, _>>()?;
.collect::<Result<Vec<_>, _>>()
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// Return stream of data
let dummy_metrics = ExecutionPlanMetricsSet::new();
@ -257,165 +245,3 @@ impl QueryChunk for QueryableBatch {
self
}
}
#[cfg(test)]
mod tests {
use arrow::{
array::{
ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray,
TimestampNanosecondArray, UInt64Array,
},
datatypes::{DataType, Int32Type, TimeUnit},
};
use data_types::{DeleteExpr, Op, Scalar, TimestampRange};
use super::*;
use crate::test_util::create_tombstone;
#[tokio::test]
async fn test_merge_batch_schema() {
// Merge schema of the batches
// The fields in the schema are sorted by column name
let batches = create_batches();
let merged_schema = (*merge_record_batch_schemas(&batches)).clone();
// Expected Arrow schema
let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![
arrow::datatypes::Field::new(
"dict",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
true,
),
arrow::datatypes::Field::new("int64", DataType::Int64, true),
arrow::datatypes::Field::new("string", DataType::Utf8, true),
arrow::datatypes::Field::new("bool", DataType::Boolean, true),
arrow::datatypes::Field::new(
"time",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false,
),
arrow::datatypes::Field::new("uint64", DataType::UInt64, false),
arrow::datatypes::Field::new("float64", DataType::Float64, true),
]));
let expected_schema = Schema::try_from(arrow_schema)
.unwrap()
.sort_fields_by_name();
assert_eq!(
expected_schema, merged_schema,
"\nExpected:\n{:#?}\nActual:\n{:#?}",
expected_schema, merged_schema
);
}
#[tokio::test]
async fn test_tombstones_to_delete_predicates() {
// create tombstones
let tombstones = vec![
create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"),
create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"),
];
// This new queryable batch will convert tombstone to delete predicates
let query_batch =
QueryableBatch::new("test_table".into(), PartitionId::new(0), vec![], tombstones);
let predicates = query_batch.delete_predicates();
let expected = vec![
Arc::new(DeletePredicate {
range: TimestampRange::new(100, 200),
exprs: vec![DeleteExpr {
column: String::from("temp"),
op: Op::Eq,
scalar: Scalar::I64(10),
}],
}),
Arc::new(DeletePredicate {
range: TimestampRange::new(100, 350),
exprs: vec![
DeleteExpr {
column: String::from("temp"),
op: Op::Ne,
scalar: Scalar::I64(10),
},
DeleteExpr {
column: String::from("city"),
op: Op::Eq,
scalar: Scalar::String(String::from(r#"Boston"#)),
},
],
}),
];
assert_eq!(expected, predicates);
}
// ----------------------------------------------------------------------------------------------
// Data for testing
// Create pure RecordBatches without knowledge of Influx datatype
fn create_batches() -> Vec<Arc<RecordBatch>> {
// Batch 1: <dict, i64, str, bool, time> & 3 rows
let dict_array: ArrayRef = Arc::new(
vec![Some("a"), None, Some("b")]
.into_iter()
.collect::<DictionaryArray<Int32Type>>(),
);
let int64_array: ArrayRef =
Arc::new([Some(-1), None, Some(2)].iter().collect::<Int64Array>());
let string_array: ArrayRef = Arc::new(
vec![Some("foo"), Some("and"), Some("bar")]
.into_iter()
.collect::<StringArray>(),
);
let bool_array: ArrayRef = Arc::new(
[Some(true), None, Some(false)]
.iter()
.collect::<BooleanArray>(),
);
let ts_array: ArrayRef = Arc::new(
[Some(150), Some(200), Some(1526823730000000000)]
.iter()
.collect::<TimestampNanosecondArray>(),
);
let batch1 = RecordBatch::try_from_iter_with_nullable(vec![
("dict", dict_array, true),
("int64", int64_array, true),
("string", string_array, true),
("bool", bool_array, true),
("time", ts_array, false), // not null
])
.unwrap();
// Batch 2: <dict, u64, f64, str, bool, time> & 2 rows
let dict_array: ArrayRef = Arc::new(
vec![None, Some("d")]
.into_iter()
.collect::<DictionaryArray<Int32Type>>(),
);
let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::<UInt64Array>()); // not null
let float64_array: ArrayRef =
Arc::new([Some(1.0), Some(2.0)].iter().collect::<Float64Array>());
let string_array: ArrayRef = Arc::new(
vec![Some("foo"), Some("bar")]
.into_iter()
.collect::<StringArray>(),
);
let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::<BooleanArray>());
let ts_array: ArrayRef = Arc::new(
[Some(100), Some(1626823730000000000)] // not null
.iter()
.collect::<TimestampNanosecondArray>(),
);
let batch2 = RecordBatch::try_from_iter_with_nullable(vec![
("dict", dict_array, true),
("uint64", uint64_array, false), // not null
("float64", float64_array, true),
("string", string_array, true),
("bool", bool_array, true),
("time", ts_array, false), // not null
])
.unwrap();
vec![Arc::new(batch1), Arc::new(batch2)]
}
}

View File

@ -30,8 +30,8 @@ use trace::ctx::SpanContext;
use write_summary::WriteSummary;
use crate::{
data::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream},
handler::IngestHandler,
querier_handler::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream},
};
/// This type is responsible for managing all gRPC services exposed by
@ -410,9 +410,6 @@ impl Stream for GetStream {
parquet_max_sequence_number: status
.parquet_max_sequence_number
.map(|x| x.get()),
tombstone_max_sequence_number: status
.tombstone_max_sequence_number
.map(|x| x.get()),
}),
};
prost::Message::encode(&app_metadata, &mut bytes)
@ -467,8 +464,9 @@ mod tests {
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::selection::Selection;
use crate::querier_handler::PartitionStatus;
use super::*;
use crate::data::partition::PartitionStatus;
#[tokio::test]
async fn test_get_stream_empty() {
@ -489,7 +487,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
Ok(FlatIngesterQueryResponse::StartSnapshot { schema }),
@ -502,7 +499,6 @@ mod tests {
partition_id: 1,
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
}),
@ -527,7 +523,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
Err(ArrowError::IoError("foo".into())),
@ -535,7 +530,6 @@ mod tests {
partition_id: PartitionId::new(1),
status: PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
},
}),
],
@ -546,7 +540,6 @@ mod tests {
partition_id: 1,
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
}),

View File

@ -396,6 +396,12 @@ something clever.",
if let Some(delta) = duration_since_production {
// Update the TTBR metric before potentially sleeping.
self.time_to_be_readable.set(delta);
trace!(
kafka_topic=%self.topic_name,
shard_index=%self.shard_index,
delta=%delta.as_millis(),
"reporting TTBR for shard (ms)"
);
}
if should_pause {
@ -939,7 +945,7 @@ mod tests {
Ok(DmlOperation::Write(make_write("good_op", 2)))
]],
sink_rets = [
Err(crate::data::Error::TableNotPresent),
Err(crate::data::Error::NamespaceNotFound{namespace: "bananas".to_string() }),
Ok(true),
],
want_ttbr = 2,

View File

@ -17,7 +17,7 @@
//! [`LifecycleManager`]: crate::lifecycle::LifecycleManager
//! [`LifecycleHandle::can_resume_ingest()`]: crate::lifecycle::LifecycleHandle::can_resume_ingest()
pub mod handler;
pub(crate) mod handler;
mod periodic_watermark_fetcher;
mod sink;
@ -25,8 +25,8 @@ mod sink;
pub mod mock_sink;
#[cfg(test)]
pub mod mock_watermark_fetcher;
pub mod sink_adaptor;
pub mod sink_instrumentation;
pub(crate) mod sink_adaptor;
pub(crate) mod sink_instrumentation;
pub use periodic_watermark_fetcher::*;
pub use sink::*;
pub(crate) use periodic_watermark_fetcher::*;
pub(crate) use sink::*;

View File

@ -24,7 +24,7 @@ use super::sink_instrumentation::WatermarkFetcher;
/// Emits an error metric named `write_buffer_watermark_fetch_errors` that
/// increments once per fetch error.
#[derive(Debug)]
pub struct PeriodicWatermarkFetcher {
pub(crate) struct PeriodicWatermarkFetcher {
last_watermark: Arc<AtomicI64>,
poll_handle: JoinHandle<()>,
}

View File

@ -5,7 +5,7 @@ use dml::DmlOperation;
/// A [`DmlSink`] handles [`DmlOperation`] instances read from a shard.
#[async_trait]
pub trait DmlSink: Debug + Send + Sync {
pub(crate) trait DmlSink: Debug + Send + Sync {
/// Apply `op` read from a shard, returning `Ok(true)` if ingest should
/// be paused.
async fn apply(&self, op: DmlOperation) -> Result<bool, crate::data::Error>;

View File

@ -414,11 +414,13 @@ mod tests {
let got = test(
op,
&metrics,
Err(crate::data::Error::TableNotPresent),
Err(crate::data::Error::NamespaceNotFound {
namespace: "bananas".to_string(),
}),
Some(12345),
)
.await;
assert_matches!(got, Err(crate::data::Error::TableNotPresent));
assert_matches!(got, Err(crate::data::Error::NamespaceNotFound { .. }));
// Validate the various write buffer metrics
assert_matches!(

View File

@ -9,17 +9,16 @@ use arrow::record_batch::RecordBatch;
use arrow_util::assert_batches_eq;
use bitflags::bitflags;
use data_types::{
CompactionLevel, NamespaceId, NonEmptyString, PartitionId, PartitionKey, Sequence,
SequenceNumber, ShardId, ShardIndex, TableId, Timestamp, Tombstone, TombstoneId,
CompactionLevel, NamespaceId, PartitionId, PartitionKey, Sequence, SequenceNumber, ShardId,
ShardIndex, TableId,
};
use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite};
use dml::{DmlMeta, DmlOperation, DmlWrite};
use iox_catalog::{interface::Catalog, mem::MemCatalog};
use iox_query::test::{raw_data, TestChunk};
use iox_time::{SystemProvider, Time};
use mutable_batch_lp::lines_to_batches;
use object_store::memory::InMemory;
use parquet_file::metadata::IoxMetadata;
use predicate::delete_predicate::parse_delete_predicate;
use schema::sort::SortKey;
use uuid::Uuid;
@ -28,31 +27,10 @@ use crate::{
partition::{resolver::CatalogPartitionResolver, PersistingBatch, SnapshotBatch},
IngesterData,
},
lifecycle::{LifecycleConfig, LifecycleHandle, LifecycleManager},
lifecycle::{LifecycleConfig, LifecycleManager},
query::QueryableBatch,
};
/// Create tombstone for testing
pub(crate) fn create_tombstone(
id: i64,
table_id: i64,
shard_id: i64,
seq_num: i64,
min_time: i64,
max_time: i64,
predicate: &str,
) -> Tombstone {
Tombstone {
id: TombstoneId::new(id),
table_id: TableId::new(table_id),
shard_id: ShardId::new(shard_id),
sequence_number: SequenceNumber::new(seq_num),
min_time: Timestamp::new(min_time),
max_time: Timestamp::new(max_time),
serialized_predicate: predicate.to_string(),
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn make_meta(
object_store_id: Uuid,
@ -93,15 +71,8 @@ pub(crate) fn make_persisting_batch(
partition_id: i64,
object_store_id: Uuid,
batches: Vec<Arc<RecordBatch>>,
tombstones: Vec<Tombstone>,
) -> Arc<PersistingBatch> {
let queryable_batch = make_queryable_batch_with_deletes(
table_name,
partition_id,
seq_num_start,
batches,
tombstones,
);
let queryable_batch = make_queryable_batch(table_name, partition_id, seq_num_start, batches);
Arc::new(PersistingBatch {
shard_id: ShardId::new(shard_id),
table_id: TableId::new(table_id),
@ -116,16 +87,6 @@ pub(crate) fn make_queryable_batch(
partition_id: i64,
seq_num_start: i64,
batches: Vec<Arc<RecordBatch>>,
) -> Arc<QueryableBatch> {
make_queryable_batch_with_deletes(table_name, partition_id, seq_num_start, batches, vec![])
}
pub(crate) fn make_queryable_batch_with_deletes(
table_name: &str,
partition_id: i64,
seq_num_start: i64,
batches: Vec<Arc<RecordBatch>>,
tombstones: Vec<Tombstone>,
) -> Arc<QueryableBatch> {
// make snapshots for the batches
let mut snapshots = vec![];
@ -140,7 +101,6 @@ pub(crate) fn make_queryable_batch_with_deletes(
table_name.into(),
PartitionId::new(partition_id),
snapshots,
tombstones,
))
}
@ -655,65 +615,24 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation)
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.namespace(&TEST_NAMESPACE.into())
.unwrap()
.snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1))
.await;
} else if loc.contains(DataLocation::SNAPSHOT) {
// move partition 1 data to snapshot
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.namespace(&TEST_NAMESPACE.into())
.unwrap()
.snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1))
.await;
}
ingester
}
pub(crate) async fn make_ingester_data_with_tombstones(loc: DataLocation) -> IngesterData {
// Whatever data because they won't be used in the tests
let metrics: Arc<metric::Registry> = Default::default();
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
let object_store = Arc::new(InMemory::new());
let exec = Arc::new(iox_query::exec::Executor::new(1));
let lifecycle = LifecycleManager::new(
LifecycleConfig::new(
200_000_000,
100_000_000,
100_000_000,
Duration::from_secs(100_000_000),
Duration::from_secs(100_000_000),
100_000_000,
),
Arc::clone(&metrics),
Arc::new(SystemProvider::default()),
);
// Make data for one shard and two tables
let shard_index = ShardIndex::new(0);
let (shard_id, _, _) =
populate_catalog(&*catalog, shard_index, TEST_NAMESPACE, TEST_TABLE).await;
let ingester = IngesterData::new(
object_store,
Arc::clone(&catalog),
[(shard_id, shard_index)],
exec,
Arc::new(CatalogPartitionResolver::new(catalog)),
backoff::BackoffConfig::default(),
metrics,
);
// Make partitions per requested
make_one_partition_with_tombstones(&ingester, &lifecycle.handle(), loc, shard_index, shard_id)
.await;
ingester
}
/// Make data for one or two partitions per requested
pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> Vec<DmlOperation> {
// In-memory data includes these rows but split between 4 groups go into
@ -783,133 +702,6 @@ pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) ->
ops
}
/// Make data for one partition with tombstones
async fn make_one_partition_with_tombstones(
ingester: &IngesterData,
lifecycle_handle: &dyn LifecycleHandle,
loc: DataLocation,
shard_index: ShardIndex,
shard_id: ShardId,
) {
// In-memory data includes these rows but split between 4 groups go into
// different batches of parittion 1 or partittion 2 as requeted
// let expected = vec![
// "+------------+-----+------+--------------------------------+",
// "| city | day | temp | time |",
// "+------------+-----+------+--------------------------------+",
// "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", // in group 1 - seq_num: 2
// "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", // in group 2 - seq_num: 3
// "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", // in group 1 - seq_num: 1 --> will get deleted
// "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", // in group 3 - seq_num: 5 --> will get deleted
// "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 8 (after the tombstone's seq num)
// "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4
// "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 9
// "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6
// "+------------+-----+------+--------------------------------+",
// ];
let (ops, seq_num) =
make_first_partition_data(&PartitionKey::from(TEST_PARTITION_1), shard_index);
// Apply all ops
for op in ops {
ingester
.buffer_operation(shard_id, op, lifecycle_handle)
.await
.unwrap();
}
if loc.contains(DataLocation::PERSISTING) {
// Move partition 1 data to persisting
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.unwrap()
.snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.await;
} else if loc.contains(DataLocation::SNAPSHOT) {
// move partition 1 data to snapshot
let _ignored = ingester
.shard(shard_id)
.unwrap()
.namespace(TEST_NAMESPACE)
.unwrap()
.snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1))
.await;
}
// Add tombstones
// Depending on where the existing data is, they (buffer & snapshot) will be either moved to a new snapshot after
// applying the tombstone or (persisting) stay where they are and the tombstones is kept to get applied later
// ------------------------------------------
// Delete
let mut seq_num = seq_num.get();
seq_num += 1;
let delete = parse_delete_predicate(
"1970-01-01T00:00:00.000000010Z",
"1970-01-01T00:00:00.000000050Z",
"city=Boston",
)
.unwrap();
ingester
.buffer_operation(
shard_id,
DmlOperation::Delete(DmlDelete::new(
TEST_NAMESPACE.to_string(),
delete,
NonEmptyString::new(TEST_TABLE),
DmlMeta::sequenced(
Sequence {
shard_index,
sequence_number: SequenceNumber::new(seq_num),
},
Time::MIN,
None,
42,
),
)),
lifecycle_handle,
)
.await
.unwrap();
// Group 4: in buffer of p1 after the tombstone
ingester
.buffer_operation(
shard_id,
DmlOperation::Write(make_write_op(
&PartitionKey::from(TEST_PARTITION_1),
shard_index,
TEST_NAMESPACE,
seq_num,
r#"test_table,city=Medford day="sun",temp=55 22"#,
)),
lifecycle_handle,
)
.await
.unwrap();
seq_num += 1;
ingester
.buffer_operation(
shard_id,
DmlOperation::Write(make_write_op(
&PartitionKey::from(TEST_PARTITION_1),
shard_index,
TEST_NAMESPACE,
seq_num,
r#"test_table,city=Reading day="mon",temp=58 40"#,
)),
lifecycle_handle,
)
.await
.unwrap();
}
pub(crate) fn make_write_op(
partition_key: &PartitionKey,
shard_index: ShardIndex,

View File

@ -463,7 +463,10 @@ pub trait PartitionRepo: Send + Sync {
partition_id: PartitionId,
) -> Result<Option<PartitionInfo>>;
/// Update the sort key for the partition
/// Update the sort key for the partition.
///
/// NOTE: it is expected that ONLY the ingesters update sort keys for
/// existing partitions.
async fn update_sort_key(
&mut self,
partition_id: PartitionId,

View File

@ -1878,7 +1878,7 @@ LIMIT $4;
sqlx::query_as::<_, PartitionParam>(
r#"
SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id,
parquet_file.table_id,
parquet_file.table_id,
count(case when to_delete is null then 1 end) total_count,
max(case when compaction_level= $4 then parquet_file.created_at end)
FROM parquet_file

View File

@ -11,7 +11,7 @@ chrono = { version = "0.4", default-features = false }
chrono-english = "0.1.4"
clap = { version = "4", features = ["derive", "env", "cargo"] }
futures = "0.3"
handlebars = "4.3.4"
handlebars = "4.3.5"
humantime = "2.1.0"
influxdb2_client = { path = "../influxdb2_client" }
itertools = "0.10.5"
@ -22,7 +22,7 @@ rand = { version = "0.8.3", features = ["small_rng"] }
regex = "1.6"
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
toml = "0.5.9"

View File

@ -762,7 +762,7 @@ mod tests {
.unwrap();
// Input has one row that has no value (NULL value) for tag_b, which is its own series
let input = stream_from_batch(batch);
let input = stream_from_batch(batch.schema(), batch);
let table_name = "foo";
let tag_columns = ["tag_a", "tag_b"];
@ -873,7 +873,8 @@ mod tests {
.collect();
// stream from those batches
stream_from_batches(batches)
assert!(!batches.is_empty());
stream_from_batches(batches[0].schema(), batches)
})
.collect()
}

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@ use async_trait::async_trait;
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use exec::{stringset::StringSet, IOxSessionContext};
use hashbrown::HashMap;
use observability_deps::tracing::{debug, trace};
@ -141,9 +141,6 @@ impl Drop for QueryCompletedToken {
/// This avoids storing potentially large strings
pub type QueryText = Box<dyn std::fmt::Display + Send + Sync>;
/// Error type for [`QueryDatabase`] operations.
pub type QueryDatabaseError = Box<dyn std::error::Error + Send + Sync + 'static>;
/// A `Database` is the main trait implemented by the IOx subsystems
/// that store actual data.
///
@ -154,12 +151,15 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
/// Returns a set of chunks within the partition with data that may match
/// the provided predicate. If possible, chunks which have no rows that can
/// possibly match the predicate may be omitted.
/// If projection is None, returned chunks will include all columns of its original data. Otherwise,
/// returned chunks will includs PK columns (tags and time) and columns specified in the projection.
async fn chunks(
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError>;
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError>;
/// Record that particular type of query was run / planned
fn record_query(
@ -175,9 +175,6 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync {
fn as_meta(&self) -> &dyn QueryDatabaseMeta;
}
/// Error type for [`QueryChunk`] operations.
pub type QueryChunkError = Box<dyn std::error::Error + Send + Sync + 'static>;
/// Collection of data that shares the same partition key
pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
/// returns the Id of this chunk. Ids are unique within a
@ -200,7 +197,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
fn apply_predicate_to_metadata(
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, QueryChunkError> {
) -> Result<PredicateMatch, DataFusionError> {
Ok(self
.summary()
.map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow()))
@ -216,7 +213,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError>;
) -> Result<Option<StringSet>, DataFusionError>;
/// Return a set of Strings containing the distinct values in the
/// specified columns. If the predicate can be evaluated entirely
@ -228,7 +225,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError>;
) -> Result<Option<StringSet>, DataFusionError>;
/// Provides access to raw `QueryChunk` data as an
/// asynchronous stream of `RecordBatch`es filtered by a *required*
@ -248,7 +245,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static {
ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError>;
) -> Result<SendableRecordBatchStream, DataFusionError>;
/// Returns chunk type. Useful in tests and debug logs.
fn chunk_type(&self) -> &str;

View File

@ -262,7 +262,7 @@ mod tests {
let batch = make_batch();
let output_schema = batch.schema();
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -291,7 +291,7 @@ mod tests {
Field::new("c", DataType::Utf8, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -321,7 +321,7 @@ mod tests {
Field::new("d", DataType::Float32, true),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let adapter_stream =
SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap();
@ -349,7 +349,7 @@ mod tests {
Field::new("c", DataType::Utf8, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics());
assert_contains!(
@ -368,7 +368,7 @@ mod tests {
Field::new("b", DataType::Int32, false),
Field::new("a", DataType::Int32, false),
]));
let input_stream = stream_from_batch(batch);
let input_stream = stream_from_batch(batch.schema(), batch);
let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics());
assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which had type 'Float32'");

View File

@ -8,8 +8,8 @@ use crate::{
stringset::{StringSet, StringSetRef},
ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext,
},
Predicate, PredicateMatch, QueryChunk, QueryChunkError, QueryChunkMeta, QueryCompletedToken,
QueryDatabase, QueryDatabaseError, QueryText,
Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase,
QueryText,
};
use arrow::{
array::{
@ -24,7 +24,7 @@ use data_types::{
ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues,
Statistics, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use datafusion_util::stream_from_batches;
use futures::StreamExt;
use hashbrown::HashSet;
@ -108,18 +108,54 @@ impl QueryDatabase for TestDatabase {
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
_ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError> {
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
// save last predicate
*self.chunks_predicate.lock() = predicate.clone();
let partitions = self.partitions.lock();
Ok(partitions
let partitions = self.partitions.lock().clone();
let chunks = partitions
.values()
.flat_map(|x| x.values())
.filter(|x| x.table_name == table_name)
.map(|x| Arc::clone(x) as _)
.collect())
.map(|x| Arc::clone(x) as Arc<dyn QueryChunk>)
.collect::<Vec<_>>();
// Return chunks with fewer columns if a projection is specified
let mut new_chunks = Vec::with_capacity(chunks.len());
for c in chunks {
let schema = c.schema();
let cols = schema.select_given_and_pk_columns(projection);
let cols = cols.iter().map(|c| c.as_str()).collect::<Vec<_>>();
let selection = Selection::Some(&cols);
let read_result =
c.read_filter(IOxSessionContext::with_testing(), predicate, selection);
if read_result.is_err() {
return Err(read_result.err().unwrap());
}
let mut stream = read_result.unwrap();
let mut new_chunk = TestChunk::new(c.table_name());
while let Some(b) = stream.next().await {
let b = b.expect("Error in stream");
new_chunk.table_data.push(Arc::new(b));
}
let new_chunk = if !new_chunk.table_data.is_empty() {
let new_schema = Schema::try_from(new_chunk.table_data[0].schema()).unwrap();
let new_chunk = new_chunk.add_schema_to_table(new_schema, true, None);
Arc::new(new_chunk) as _
} else {
// No data, return the original empty chunk with the original schema
c
};
new_chunks.push(new_chunk);
}
Ok(new_chunks)
}
fn record_query(
@ -327,9 +363,9 @@ impl TestChunk {
}
/// Checks the saved error, and returns it if any, otherwise returns OK
fn check_error(&self) -> Result<(), QueryChunkError> {
fn check_error(&self) -> Result<(), DataFusionError> {
if let Some(message) = self.saved_error.as_ref() {
Err(message.clone().into())
Err(DataFusionError::External(message.clone().into()))
} else {
Ok(())
}
@ -509,12 +545,8 @@ impl TestChunk {
mut self,
new_column_schema: Schema,
add_column_summary: bool,
stats: Option<Statistics>,
input_stats: Option<Statistics>,
) -> Self {
// assume the new schema has exactly a single table
assert_eq!(new_column_schema.len(), 1);
let (col_type, new_field) = new_column_schema.field(0);
let mut merger = SchemaMerger::new();
merger = merger.merge(&new_column_schema).unwrap();
merger = merger
@ -522,34 +554,38 @@ impl TestChunk {
.expect("merging was successful");
self.schema = merger.build();
if add_column_summary {
let influxdb_type = col_type.map(|t| match t {
InfluxColumnType::Tag => InfluxDbType::Tag,
InfluxColumnType::Field(_) => InfluxDbType::Field,
InfluxColumnType::Timestamp => InfluxDbType::Timestamp,
});
for i in 0..new_column_schema.len() {
let (col_type, new_field) = new_column_schema.field(i);
if add_column_summary {
let influxdb_type = col_type.map(|t| match t {
InfluxColumnType::Tag => InfluxDbType::Tag,
InfluxColumnType::Field(_) => InfluxDbType::Field,
InfluxColumnType::Timestamp => InfluxDbType::Timestamp,
});
let stats = stats.unwrap_or_else(|| match new_field.data_type() {
DataType::Boolean => Statistics::Bool(StatValues::default()),
DataType::Int64 => Statistics::I64(StatValues::default()),
DataType::UInt64 => Statistics::U64(StatValues::default()),
DataType::Utf8 => Statistics::String(StatValues::default()),
DataType::Dictionary(_, value_type) => {
assert!(matches!(**value_type, DataType::Utf8));
Statistics::String(StatValues::default())
}
DataType::Float64 => Statistics::F64(StatValues::default()),
DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()),
_ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()),
});
let stats = input_stats.clone();
let stats = stats.unwrap_or_else(|| match new_field.data_type() {
DataType::Boolean => Statistics::Bool(StatValues::default()),
DataType::Int64 => Statistics::I64(StatValues::default()),
DataType::UInt64 => Statistics::U64(StatValues::default()),
DataType::Utf8 => Statistics::String(StatValues::default()),
DataType::Dictionary(_, value_type) => {
assert!(matches!(**value_type, DataType::Utf8));
Statistics::String(StatValues::default())
}
DataType::Float64 => Statistics::F64(StatValues::default()),
DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()),
_ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()),
});
let column_summary = ColumnSummary {
name: new_field.name().clone(),
influxdb_type,
stats,
};
let column_summary = ColumnSummary {
name: new_field.name().clone(),
influxdb_type,
stats,
};
self.table_summary.columns.push(column_summary);
self.table_summary.columns.push(column_summary);
}
}
self
@ -921,13 +957,17 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
self.check_error()?;
// save the predicate
self.predicates.lock().push(predicate.clone());
let batches = match self.schema.df_projection(selection)? {
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.table_data.clone(),
Some(projection) => self
.table_data
@ -938,7 +978,8 @@ impl QueryChunk for TestChunk {
})
.collect::<std::result::Result<Vec<_>, ArrowError>>()?,
};
Ok(stream_from_batches(batches))
Ok(stream_from_batches(self.schema().as_arrow(), batches))
}
fn chunk_type(&self) -> &str {
@ -948,7 +989,7 @@ impl QueryChunk for TestChunk {
fn apply_predicate_to_metadata(
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, QueryChunkError> {
) -> Result<PredicateMatch, DataFusionError> {
self.check_error()?;
// save the predicate
@ -967,7 +1008,7 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// Model not being able to get column values from metadata
Ok(None)
}
@ -977,7 +1018,7 @@ impl QueryChunk for TestChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
self.check_error()?;
// save the predicate

View File

@ -14,7 +14,7 @@ iox_catalog = { path = "../iox_catalog" }
iox_time = { path = "../iox_time" }
metric = { path = "../metric" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
once_cell = { version = "1.15.0", features = ["parking_lot"] }
parquet_file = { path = "../parquet_file" }

View File

@ -40,7 +40,7 @@ log = "0.4"
parking_lot = "0.12"
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.83"
serde_json = "1.0.86"
serde_urlencoded = "0.7.0"
snafu = "0.7"
tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }

View File

@ -15,7 +15,7 @@ iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
iox_query = { path = "../iox_query" }
object_store = "0.5.0"
object_store = "0.5.1"
iox_time = { path = "../iox_time" }
trace = { path = "../trace" }

View File

@ -11,7 +11,7 @@ ingester = { path = "../ingester" }
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
iox_query = { path = "../iox_query" }
trace = { path = "../trace" }
write_buffer = { path = "../write_buffer" }

View File

@ -11,7 +11,7 @@ generated_types = { path = "../generated_types" }
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
querier = { path = "../querier" }
iox_query = { path = "../iox_query" }
router = { path = "../router" }

View File

@ -11,7 +11,7 @@ iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
mutable_batch = { path = "../mutable_batch" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
router = { path = "../router" }
sharder = { path = "../sharder" }

View File

@ -10,7 +10,7 @@ bytes = "1.2"
futures = "0.3"
iox_time = { version = "0.1.0", path = "../iox_time" }
metric = { version = "0.1.0", path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
pin-project = "1.0.12"
tokio = { version = "1.21", features = ["io-util"] }
workspace-hack = { path = "../workspace-hack" }

View File

@ -14,7 +14,7 @@ datafusion_util = { path = "../datafusion_util" }
futures = "0.3"
generated_types = { path = "../generated_types" }
iox_time = { path = "../iox_time" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet = {version = "23.0.0", features = ["experimental"]}

View File

@ -10,7 +10,7 @@ datafusion = { path = "../datafusion" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
futures = {version = "0.3"}
num_cpus = "1.13.1"
object_store = { version = "0.5.0" }
object_store = { version = "0.5.1" }
parquet_file = { path = "../parquet_file" }
schema = { path = "../schema" }
tokio = "1.0"

View File

@ -13,9 +13,9 @@ itertools = "0.10"
observability_deps = { path = "../observability_deps" }
query_functions = { path = "../query_functions"}
schema = { path = "../schema" }
serde_json = "1.0.83"
serde_json = "1.0.86"
snafu = "0.7"
sqlparser = "0.24.0"
sqlparser = "0.25.0"
workspace-hack = { path = "../workspace-hack"}
[dev-dependencies]

View File

@ -12,7 +12,6 @@
pub mod delete_expr;
pub mod delete_predicate;
pub mod rewrite;
pub mod rpc_predicate;
use arrow::{

View File

@ -1,19 +1,23 @@
mod column_rewrite;
mod field_rewrite;
mod measurement_rewrite;
mod rewrite;
mod value_rewrite;
use crate::{rewrite, Predicate};
use crate::Predicate;
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::lit;
use datafusion::logical_plan::{
Column, Expr, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo,
Column, Expr, ExprRewritable, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo,
};
use observability_deps::tracing::{debug, trace};
use schema::Schema;
use std::collections::BTreeSet;
use std::sync::Arc;
use self::column_rewrite::MissingColumnRewriter;
use self::field_rewrite::FieldProjectionRewriter;
use self::measurement_rewrite::rewrite_measurement_references;
use self::value_rewrite::rewrite_field_value_references;
@ -187,6 +191,7 @@ fn normalize_predicate(
let mut predicate = predicate.clone();
let mut field_projections = FieldProjectionRewriter::new(Arc::clone(&schema));
let mut missing_columums = MissingColumnRewriter::new(Arc::clone(&schema));
let mut field_value_exprs = vec![];
@ -194,24 +199,38 @@ fn normalize_predicate(
.exprs
.into_iter()
.map(|e| {
rewrite_measurement_references(table_name, e)
debug!(?e, "rewriting expr");
let e = rewrite_measurement_references(table_name, e)
.map(|e| log_rewrite(e, "rewrite_measurement_references"))
// Rewrite any references to `_value = some_value` to literal true values.
// Keeps track of these expressions, which can then be used to
// augment field projections with conditions using `CASE` statements.
.and_then(|e| rewrite_field_value_references(&mut field_value_exprs, e))
.map(|e| log_rewrite(e, "rewrite_field_value_references"))
// Rewrite any references to `_field` with a literal
// and keep track of referenced field names to add to
// the field column projection set.
.and_then(|e| field_projections.rewrite_field_exprs(e))
.map(|e| log_rewrite(e, "field_projections"))
// remove references to columns that don't exist in this schema
.and_then(|e| e.rewrite(&mut missing_columums))
.map(|e| log_rewrite(e, "missing_columums"))
// apply IOx specific rewrites (that unlock other simplifications)
.and_then(rewrite::rewrite)
// Call the core DataFusion simplification logic
.map(|e| log_rewrite(e, "rewrite"))
// Call DataFusion simplification logic
.and_then(|e| {
let adapter = SimplifyAdapter::new(schema.as_ref());
// simplify twice to ensure "full" cleanup
e.simplify(&adapter)?.simplify(&adapter)
})
.map(|e| log_rewrite(e, "simplify_expr"))
.and_then(rewrite::simplify_predicate)
.map(|e| log_rewrite(e, "simplify_expr"));
debug!(?e, "rewritten expr");
e
})
// Filter out literal true so is_empty works correctly
.filter(|f| match f {
@ -227,6 +246,11 @@ fn normalize_predicate(
field_projections.add_to_predicate(predicate)
}
fn log_rewrite(expr: Expr, description: &str) -> Expr {
trace!(?expr, %description, "After rewrite");
expr
}
struct SimplifyAdapter<'a> {
schema: &'a Schema,
execution_props: ExecutionProps,
@ -290,9 +314,27 @@ mod tests {
use super::*;
use arrow::datatypes::DataType;
use datafusion::logical_plan::{col, lit};
use datafusion::{
logical_plan::{col, lit},
scalar::ScalarValue,
};
use test_helpers::assert_contains;
#[test]
fn test_normalize_predicate_coerced() {
let schema = schema();
let predicate = normalize_predicate(
"table",
Arc::clone(&schema),
&Predicate::new().with_expr(col("t1").eq(lit("f1"))),
)
.unwrap();
let expected = Predicate::new().with_expr(col("t1").eq(lit("f1")));
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_rewrite() {
let predicate = normalize_predicate(
@ -336,6 +378,20 @@ mod tests {
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_non_tag() {
// A predicate on a column not present in the schema should simplify
// to a NULL boolean literal (see expected value below)
let predicate = normalize_predicate(
"table",
schema(),
&Predicate::new().with_expr(col("not_a_tag").eq(lit("blarg"))),
)
.unwrap();
let expected = Predicate::new().with_expr(lit(ScalarValue::Boolean(None)));
assert_eq!(predicate, expected);
}
#[test]
fn test_normalize_predicate_field_rewrite_multi_field_unsupported() {
let err = normalize_predicate(

View File

@ -0,0 +1,99 @@
use std::sync::Arc;
use datafusion::{
error::Result as DataFusionResult, logical_plan::ExprRewriter, prelude::*, scalar::ScalarValue,
};
use schema::Schema;
/// Logic for rewriting expressions from influxrpc that reference non
/// existent columns to NULL.
///
/// Applied via [`ExprRewriter::mutate`]: any `Expr::Column` whose name is not
/// found in `schema` is replaced with a NULL (string) literal.
#[derive(Debug)]
pub(crate) struct MissingColumnRewriter {
    /// The input schema; column references are resolved against it
    schema: Arc<Schema>,
}
impl MissingColumnRewriter {
    /// Create a new [`MissingColumnRewriter`] targeting the given schema
    pub(crate) fn new(schema: Arc<Schema>) -> Self {
        Self { schema }
    }

    /// Returns `true` if `col` names a column present in the target schema.
    ///
    /// Wrapped in `DataFusionResult` so callers can chain with `?`; the lookup
    /// itself never fails, but qualified references are (for now) rejected by
    /// an assertion.
    fn column_exists(&self, col: &Column) -> DataFusionResult<bool> {
        // todo a real error here (rpc_predicates shouldn't have table/relation qualifiers)
        assert!(col.relation.is_none());

        // Map the lookup result directly to a bool rather than the
        // non-idiomatic `if found { Ok(true) } else { Ok(false) }`.
        Ok(self.schema.find_index_of(&col.name).is_some())
    }
}
/// Returns a NULL literal expression used to stand in for missing columns.
///
/// NOTE(review): this is a NULL *string* (`Utf8(None)`); no type inference is
/// performed for the replaced column — see the comment in the tests below.
fn lit_null() -> Expr {
    lit(ScalarValue::Utf8(None))
}
impl ExprRewriter for MissingColumnRewriter {
    /// Replace any reference to a column absent from the schema with a NULL
    /// literal; every other expression passes through unchanged.
    fn mutate(&mut self, expr: Expr) -> DataFusionResult<Expr> {
        if let Expr::Column(col) = &expr {
            if !self.column_exists(col)? {
                return Ok(lit_null());
            }
        }
        Ok(expr)
    }
}
#[cfg(test)]
mod tests {
    use datafusion::{arrow::datatypes::DataType, logical_plan::ExprRewritable};
    use schema::SchemaBuilder;

    use super::*;

    #[test]
    fn all_columns_defined_no_rewrite() {
        // t1 = "foo"
        let expr = col("t1").eq(lit("foo"));
        assert_eq!(rewrite(expr.clone()), expr);

        // f1 > 1.0
        let expr = col("f1").gt(lit(1.0));
        assert_eq!(rewrite(expr.clone()), expr);
    }

    #[test]
    fn all_columns_not_defined() {
        // non_defined = "foo" --> NULL = "foo"
        let expr = col("non_defined").eq(lit("foo"));
        let expected = lit_null().eq(lit("foo"));
        assert_eq!(rewrite(expr), expected);

        // non_defined = 1.4 --> NULL = 1.4
        let expr = col("non_defined").eq(lit(1.4));
        // No type is inferred so this is a literal null string (even though it
        // arguably should be a literal float)
        let expected = lit_null().eq(lit(1.4));
        assert_eq!(rewrite(expr), expected);
    }

    #[test]
    fn some_columns_not_defined() {
        // t1 = "foo" AND non_defined = "bar" --> t1 = "foo" and NULL = "bar"
        let expr = col("t1")
            .eq(lit("foo"))
            .and(col("non_defined").eq(lit("bar")));
        let expected = col("t1").eq(lit("foo")).and(lit_null().eq(lit("bar")));
        assert_eq!(rewrite(expr), expected);
    }

    /// Test helper: applies a [`MissingColumnRewriter`] over a fixed schema
    /// (tag `t1`, i64 field `f1`) and returns the rewritten expression.
    fn rewrite(expr: Expr) -> Expr {
        let schema = SchemaBuilder::new()
            .tag("t1")
            .field("f1", DataType::Int64)
            .build()
            .unwrap();

        let mut rewriter = MissingColumnRewriter::new(Arc::new(schema));
        expr.rewrite(&mut rewriter).unwrap()
    }
}

View File

@ -55,8 +55,8 @@ impl FieldProjectionRewriter {
}
}
// Rewrites the predicate. See the description on
// [`FieldProjectionRewriter`] for more details.
/// Rewrites the predicate. See the description on
/// [`FieldProjectionRewriter`] for more details.
pub(crate) fn rewrite_field_exprs(&mut self, expr: Expr) -> DataFusionResult<Expr> {
// for predicates like `A AND B AND C`
// rewrite `A`, `B` and `C` separately and put them back together

View File

@ -18,7 +18,7 @@ generated_types = { path = "../generated_types" }
influxdb_iox_client = { path = "../influxdb_iox_client" }
iox_catalog = { path = "../iox_catalog" }
metric = { path = "../metric" }
object_store = "0.5.0"
object_store = "0.5.1"
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }

View File

@ -470,9 +470,9 @@ mod tests {
.into_iter()
.map(lp_to_record_batch)
.map(Arc::new)
.collect();
.collect::<Vec<_>>();
let stream = stream_from_batches(batches);
let stream = stream_from_batches(batches[0].schema(), batches);
let metric_registry = metric::Registry::new();

View File

@ -7,13 +7,16 @@ use arrow::{
use data_types::{
ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary, TimestampMinMax,
};
use datafusion::physical_plan::{
stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream,
use datafusion::{
error::DataFusionError,
physical_plan::{
stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream,
},
};
use futures::{Stream, TryStreamExt};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::debug;
use predicate::Predicate;
@ -114,7 +117,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("projection", format!("{}", columns));
ctx.set_metadata("predicate", format!("{}", &predicate));
@ -161,10 +164,10 @@ impl QueryChunk for QuerierChunk {
None
}
Err(other) => {
return Err(Box::new(Error::RBChunk {
return Err(DataFusionError::External(Box::new(Error::RBChunk {
source: other,
chunk_id: self.id(),
}))
})))
}
};
@ -178,7 +181,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("column_name", column_name.to_string());
ctx.set_metadata("predicate", format!("{}", &predicate));
@ -205,11 +208,13 @@ impl QueryChunk for QuerierChunk {
};
ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate));
let mut values = rb_chunk.column_values(
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)?;
let mut values = rb_chunk
.column_values(
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
// The InfluxRPC frontend only supports getting column values
// for one column at a time (this is a restriction on the Influx
@ -221,7 +226,8 @@ impl QueryChunk for QuerierChunk {
.context(ColumnNameNotFoundSnafu {
chunk_id: self.id(),
column_name,
})?;
})
.map_err(|e| DataFusionError::External(Box::new(e)))?;
ctx.set_metadata("output_values", values.len() as i64);
Ok(Some(values))
@ -234,7 +240,7 @@ impl QueryChunk for QuerierChunk {
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
) -> Result<SendableRecordBatchStream, DataFusionError> {
let span_recorder = SpanRecorder::new(
ctx.span()
.map(|span| span.child("QuerierChunk::read_filter")),

View File

@ -11,6 +11,7 @@ use data_types::{
ChunkId, ChunkOrder, IngesterMapping, PartitionId, SequenceNumber, ShardId, ShardIndex,
TableSummary, TimestampMinMax,
};
use datafusion::error::DataFusionError;
use datafusion_util::MemoryStream;
use futures::{stream::FuturesUnordered, TryStreamExt};
use generated_types::{
@ -24,7 +25,7 @@ use influxdb_iox_client::flight::{
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::compute_timenanosecond_min_max,
QueryChunk, QueryChunkError, QueryChunkMeta,
QueryChunk, QueryChunkMeta,
};
use iox_time::{Time, TimeProvider};
use metric::{DurationHistogram, Metric};
@ -612,9 +613,7 @@ impl IngesterStreamDecoder {
partition_id,
shard_id,
status.parquet_max_sequence_number.map(SequenceNumber::new),
status
.tombstone_max_sequence_number
.map(SequenceNumber::new),
None,
partition_sort_key,
);
self.current_partition = Some(partition);
@ -1097,7 +1096,7 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
}
@ -1107,7 +1106,7 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
}
@ -1117,11 +1116,15 @@ impl QueryChunk for IngesterChunk {
_ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<datafusion::physical_plan::SendableRecordBatchStream, QueryChunkError> {
) -> Result<datafusion::physical_plan::SendableRecordBatchStream, DataFusionError> {
trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data");
// Apply selection to in-memory batch
let batches = match self.schema.df_projection(selection)? {
let batches = match self
.schema
.df_projection(selection)
.map_err(|e| DataFusionError::External(Box::new(e)))?
{
None => self.batches.clone(),
Some(projection) => self
.batches
@ -1333,7 +1336,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
))],
@ -1389,7 +1391,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1399,7 +1400,6 @@ mod tests {
partition_id: 2,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1409,7 +1409,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: None,
tombstone_max_sequence_number: None,
}),
},
)),
@ -1489,7 +1488,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(11),
tombstone_max_sequence_number: Some(12),
}),
},
)),
@ -1519,7 +1517,6 @@ mod tests {
partition_id: 2,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(21),
tombstone_max_sequence_number: Some(22),
}),
},
)),
@ -1544,7 +1541,6 @@ mod tests {
partition_id: 3,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(31),
tombstone_max_sequence_number: Some(32),
}),
},
)),
@ -1574,10 +1570,7 @@ mod tests {
p1.parquet_max_sequence_number,
Some(SequenceNumber::new(11))
);
assert_eq!(
p1.tombstone_max_sequence_number,
Some(SequenceNumber::new(12))
);
assert_eq!(p1.tombstone_max_sequence_number, None);
assert_eq!(p1.chunks.len(), 2);
assert_eq!(p1.chunks[0].schema().as_arrow(), schema_1_1);
assert_eq!(p1.chunks[0].batches.len(), 2);
@ -1594,10 +1587,7 @@ mod tests {
p2.parquet_max_sequence_number,
Some(SequenceNumber::new(21))
);
assert_eq!(
p2.tombstone_max_sequence_number,
Some(SequenceNumber::new(22))
);
assert_eq!(p2.tombstone_max_sequence_number, None);
assert_eq!(p2.chunks.len(), 1);
assert_eq!(p2.chunks[0].schema().as_arrow(), schema_2_1);
assert_eq!(p2.chunks[0].batches.len(), 1);
@ -1610,10 +1600,7 @@ mod tests {
p3.parquet_max_sequence_number,
Some(SequenceNumber::new(31))
);
assert_eq!(
p3.tombstone_max_sequence_number,
Some(SequenceNumber::new(32))
);
assert_eq!(p3.tombstone_max_sequence_number, None);
assert_eq!(p3.chunks.len(), 1);
assert_eq!(p3.chunks[0].schema().as_arrow(), schema_3_1);
assert_eq!(p3.chunks[0].batches.len(), 1);
@ -1733,7 +1720,6 @@ mod tests {
partition_id: 1,
status: Some(PartitionStatus {
parquet_max_sequence_number: Some(11),
tombstone_max_sequence_number: Some(12),
}),
},
)),
@ -1773,10 +1759,7 @@ mod tests {
p1.parquet_max_sequence_number,
Some(SequenceNumber::new(11))
);
assert_eq!(
p1.tombstone_max_sequence_number,
Some(SequenceNumber::new(12))
);
assert_eq!(p1.tombstone_max_sequence_number, None);
assert_eq!(p1.chunks.len(), 1);
}

View File

@ -11,10 +11,11 @@ use data_types::NamespaceId;
use datafusion::{
catalog::{catalog::CatalogProvider, schema::SchemaProvider},
datasource::TableProvider,
error::DataFusionError,
};
use iox_query::{
exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext},
QueryChunk, QueryCompletedToken, QueryDatabase, QueryDatabaseError, QueryText, DEFAULT_SCHEMA,
QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA,
};
use observability_deps::tracing::{debug, trace};
use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate};
@ -40,8 +41,9 @@ impl QueryDatabase for QuerierNamespace {
&self,
table_name: &str,
predicate: &Predicate,
projection: &Option<Vec<usize>>,
ctx: IOxSessionContext,
) -> Result<Vec<Arc<dyn QueryChunk>>, QueryDatabaseError> {
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
debug!(%table_name, %predicate, "Finding chunks for table");
// get table metadata
let table = match self.tables.get(table_name).map(Arc::clone) {
@ -57,7 +59,7 @@ impl QueryDatabase for QuerierNamespace {
.chunks(
predicate,
ctx.span().map(|span| span.child("querier table chunks")),
&None, // todo: pushdown projection to chunks
projection,
)
.await?;
@ -627,7 +629,7 @@ mod tests {
.unwrap_err();
assert_eq!(
err.to_string(),
format!("Cannot build plan: External error: Chunk pruning failed: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."),
format!("Cannot build plan: Resources exhausted: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."),
);
}

View File

@ -8,6 +8,7 @@ use crate::{
IngesterConnection,
};
use data_types::{ColumnId, PartitionId, ShardIndex, TableId, TimestampMinMax};
use datafusion::error::DataFusionError;
use futures::{join, StreamExt};
use iox_query::pruning::prune_summaries;
use iox_query::{exec::Executor, provider, provider::ChunkPruner, QueryChunk};
@ -65,6 +66,17 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
impl From<Error> for DataFusionError {
fn from(err: Error) -> Self {
match err {
Error::ChunkPruning {
source: err @ provider::Error::TooMuchData { .. },
} => Self::ResourcesExhausted(err.to_string()),
_ => Self::External(Box::new(err) as _),
}
}
}
/// Args to create a [`QuerierTable`].
pub struct QuerierTableArgs {
pub sharder: Arc<JumpHash<Arc<ShardIndex>>>,

View File

@ -66,8 +66,7 @@ impl TableProvider for QuerierTable {
ctx.child_span("querier table chunks"),
projection,
)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
.await?;
for chunk in chunks {
builder = builder.add_chunk(chunk);

View File

@ -23,6 +23,7 @@ use crate::{
use self::interface::{IngesterPartitionInfo, ParquetFileInfo, TombstoneInfo};
#[derive(Snafu, Debug)]
#[allow(missing_copy_implementations)]
pub enum ReconcileError {
#[snafu(display("Compactor processed file that the querier would need to split apart which is not yet implemented"))]
CompactorConflict,

View File

@ -1,25 +0,0 @@
-- Test Setup: OneDeleteSimpleExprOneChunkDeleteAll
-- SQL: SELECT * from cpu;
++
++
-- SQL: SELECT time from cpu;
++
++
-- SQL: SELECT count(*), count(bar), count(time) from cpu;
+-----------------+----------------+-----------------+
| COUNT(UInt8(1)) | COUNT(cpu.bar) | COUNT(cpu.time) |
+-----------------+----------------+-----------------+
| 0 | 0 | 0 |
+-----------------+----------------+-----------------+
-- SQL: SELECT min(bar), max(bar), min(time), max(time) from cpu;
+--------------+--------------+---------------+---------------+
| MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+--------------+--------------+---------------+---------------+
| | | | |
+--------------+--------------+---------------+---------------+
-- SQL: SELECT max(bar) from cpu;
+--------------+
| MAX(cpu.bar) |
+--------------+
| |
+--------------+

View File

@ -1,17 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteSimpleExprOneChunkDeleteAll
-- select *
SELECT * from cpu;
-- select one specific column
SELECT time from cpu;
-- select aggregate of every column including star
SELECT count(*), count(bar), count(time) from cpu;
-- select aggregate of every column
SELECT min(bar), max(bar), min(time), max(time) from cpu;
-- select aggregate of one column
SELECT max(bar) from cpu;

View File

@ -1,207 +0,0 @@
-- Test Setup: OneDeleteMultiExprsOneChunk
-- SQL: SELECT * from cpu order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu order by time, bar;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000020Z | 2 |
| 1970-01-01T00:00:00.000000040Z | 1 |
+--------------------------------+-----+
-- SQL: SELECT bar from cpu order by bar;
+-----+
| bar |
+-----+
| 1 |
| 2 |
+-----+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 2 | 2 | 2 | 1 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000040Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 2 |
+-----------------+
-- SQL: SELECT count(foo) from cpu;
+----------------+
| COUNT(cpu.foo) |
+----------------+
| 2 |
+----------------+
-- SQL: SELECT count(bar) from cpu;
+----------------+
| COUNT(cpu.bar) |
+----------------+
| 2 |
+----------------+
-- SQL: SELECT count(*) from cpu;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 2 |
+-----------------+
-- SQL: SELECT min(bar) from cpu;
+--------------+
| MIN(cpu.bar) |
+--------------+
| 1 |
+--------------+
-- SQL: SELECT foo from cpu;
-- Results After Sorting
+-----+
| foo |
+-----+
| me |
| you |
+-----+
-- SQL: SELECT min(foo) as min_foo from cpu order by min_foo;
+---------+
| min_foo |
+---------+
| me |
+---------+
-- SQL: SELECT max(foo) as max_foo from cpu order by max_foo;
+---------+
| max_foo |
+---------+
| you |
+---------+
-- SQL: SELECT min(foo) as min_foo from cpu group by time order by min_foo;
+---------+
| min_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo;
+---------+
| max_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo;
+--------------------------------+---------+
| time | max_foo |
+--------------------------------+---------+
| 1970-01-01T00:00:00.000000020Z | you |
| 1970-01-01T00:00:00.000000040Z | me |
+--------------------------------+---------+
-- SQL: SELECT min(foo) as min_foo from cpu group by bar order by min_foo;
+---------+
| min_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo;
+-----+---------+
| bar | max_foo |
+-----+---------+
| 1 | me |
| 2 | you |
+-----+---------+
-- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo;
+---------+
| max_foo |
+---------+
| me |
| you |
+---------+
-- SQL: SELECT min(time) as min_time from cpu order by min_time;
+--------------------------------+
| min_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT min(time) as min_time from cpu group by bar order by min_time;
+--------------------------------+
| min_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time;
+-----+--------------------------------+
| bar | min_time |
+-----+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000040Z |
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu group by foo order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time;
+-----+--------------------------------+
| foo | max_time |
+-----+--------------------------------+
| me | 1970-01-01T00:00:00.000000040Z |
| you | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT time from cpu;
-- Results After Sorting
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT max(bar) from cpu order by 1;
+--------------+
| MAX(cpu.bar) |
+--------------+
| 2 |
+--------------+
-- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT foo from cpu where bar >= 1.0 order by foo;
+-----+
| foo |
+-----+
| me |
| you |
+-----+
-- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000040Z | 1 |
| 1970-01-01T00:00:00.000000020Z | 2 |
+--------------------------------+-----+
-- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 2 | you | 1970-01-01T00:00:00.000000020Z |
+-----+-----+--------------------------------+
-- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma
+----+--------------------------------+
| mi | ma |
+----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+----+--------------------------------+

View File

@ -1,61 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteMultiExprsOneChunk
-- select *
SELECT * from cpu order by bar, foo, time;
SELECT time, bar from cpu order by time, bar;
SELECT bar from cpu order by bar;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
SELECT count(time) from cpu;
SELECT count(foo) from cpu;
SELECT count(bar) from cpu;
SELECT count(*) from cpu;
SELECT min(bar) from cpu;
-- IOX_COMPARE: sorted
SELECT foo from cpu;
SELECT min(foo) as min_foo from cpu order by min_foo;
SELECT max(foo) as max_foo from cpu order by max_foo;
SELECT min(foo) as min_foo from cpu group by time order by min_foo;
SELECT max(foo) as max_foo from cpu group by time order by max_foo;
SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo;
SELECT min(foo) as min_foo from cpu group by bar order by min_foo;
SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo;
SELECT max(foo) as max_foo from cpu group by time order by max_foo;
SELECT min(time) as min_time from cpu order by min_time;
SELECT max(time) as max_time from cpu order by max_time;
SELECT min(time) as min_time from cpu group by bar order by min_time;
SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time;
SELECT max(time) as max_time from cpu group by foo order by max_time;
SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time;
-- IOX_COMPARE: sorted
SELECT time from cpu;
SELECT max(bar) from cpu order by 1;
--------------------------------------------------------
-- With selection predicate
SELECT * from cpu where bar >= 1.0 order by bar, foo, time;
SELECT foo from cpu where bar >= 1.0 order by foo;
SELECT time, bar from cpu where bar >= 1.0 order by bar, time;
SELECT * from cpu where foo = 'you' order by bar, foo, time;
SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma

View File

@ -1,91 +0,0 @@
-- Test Setup: OneDeleteSimpleExprOneChunk
-- SQL: SELECT * from cpu;
+-----+--------------------------------+
| bar | time |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000020Z | 2 |
+--------------------------------+-----+
-- SQL: SELECT min(bar), max(bar) from cpu;
+--------------+--------------+
| MIN(cpu.bar) | MAX(cpu.bar) |
+--------------+--------------+
| 2 | 2 |
+--------------+--------------+
-- SQL: SELECT time from cpu;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT max(time) from cpu;
+--------------------------------+
| MAX(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT min(time) from cpu group by bar;
+--------------------------------+
| MIN(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT bar, min(time) from cpu group by bar;
+-----+--------------------------------+
| bar | MIN(cpu.time) |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT count(time), max(time) from cpu;
+-----------------+--------------------------------+
| COUNT(cpu.time) | MAX(cpu.time) |
+-----------------+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000020Z |
+-----------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 1 |
+-----------------+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT * from cpu where bar = 2.0;
+-----+--------------------------------+
| bar | time |
+-----+--------------------------------+
| 2 | 1970-01-01T00:00:00.000000020Z |
+-----+--------------------------------+
-- SQL: SELECT * from cpu where bar != 2.0;
++
++
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0;
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z |
+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+
-- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0;
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) |
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
| 0 | 0 | 0 | | | | |
+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+
-- SQL: SELECT time from cpu where bar=2;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000020Z |
+--------------------------------+
-- SQL: SELECT bar from cpu where bar!= 2;
++
++

View File

@ -1,37 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: OneDeleteSimpleExprOneChunk
-- select *
SELECT * from cpu;
SELECT time, bar from cpu;
SELECT min(bar), max(bar) from cpu;
SELECT time from cpu;
SELECT max(time) from cpu;
SELECT min(time) from cpu group by bar;
SELECT bar, min(time) from cpu group by bar;
SELECT count(time), max(time) from cpu;
SELECT count(time) from cpu;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu;
----------------------------------------------------------------
-- Now add selection predicate
SELECT * from cpu where bar = 2.0;
SELECT * from cpu where bar != 2.0;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0;
SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0;
SELECT time from cpu where bar=2;
SELECT bar from cpu where bar!= 2;

View File

@ -1,85 +0,0 @@
-- Test Setup: ThreeDeleteThreeChunks
-- SQL: SELECT * from cpu order by foo, bar, time;
+-----+-----+--------------------------------+
| bar | foo | time |
+-----+-----+--------------------------------+
| 1 | me | 1970-01-01T00:00:00.000000040Z |
| 1 | me | 1970-01-01T00:00:00.000000042Z |
| 1 | me | 1970-01-01T00:00:00.000000062Z |
| 4 | me | 1970-01-01T00:00:00.000000050Z |
| 5 | me | 1970-01-01T00:00:00.000000060Z |
| 7 | me | 1970-01-01T00:00:00.000000080Z |
| 3 | you | 1970-01-01T00:00:00.000000070Z |
+-----+-----+--------------------------------+
-- SQL: SELECT time, bar from cpu order by bar, time;
+--------------------------------+-----+
| time | bar |
+--------------------------------+-----+
| 1970-01-01T00:00:00.000000040Z | 1 |
| 1970-01-01T00:00:00.000000042Z | 1 |
| 1970-01-01T00:00:00.000000062Z | 1 |
| 1970-01-01T00:00:00.000000070Z | 3 |
| 1970-01-01T00:00:00.000000050Z | 4 |
| 1970-01-01T00:00:00.000000060Z | 5 |
| 1970-01-01T00:00:00.000000080Z | 7 |
+--------------------------------+-----+
-- SQL: SELECT bar from cpu order by bar;
+-----+
| bar |
+-----+
| 1 |
| 1 |
| 1 |
| 3 |
| 4 |
| 5 |
| 7 |
+-----+
-- SQL: SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat;
+---+---+---+----+--------------------------------+--------------------------------+
| t | c | b | mi | mt | mat |
+---+---+---+----+--------------------------------+--------------------------------+
| 7 | 7 | 7 | 1 | 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z |
+---+---+---+----+--------------------------------+--------------------------------+
-- SQL: SELECT count(time) from cpu;
+-----------------+
| COUNT(cpu.time) |
+-----------------+
| 7 |
+-----------------+
-- SQL: SELECT count(foo) from cpu;
+----------------+
| COUNT(cpu.foo) |
+----------------+
| 7 |
+----------------+
-- SQL: SELECT count(bar) from cpu;
+----------------+
| COUNT(cpu.bar) |
+----------------+
| 7 |
+----------------+
-- SQL: SELECT count(*) from cpu;
+-----------------+
| COUNT(UInt8(1)) |
+-----------------+
| 7 |
+-----------------+
-- SQL: SELECT min(bar) from cpu;
+--------------+
| MIN(cpu.bar) |
+--------------+
| 1 |
+--------------+
-- SQL: SELECT foo from cpu order by foo;
+-----+
| foo |
+-----+
| me |
| me |
| me |
| me |
| me |
| me |
| you |
+-----+

View File

@ -1,23 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: ThreeDeleteThreeChunks
-- select *
SELECT * from cpu order by foo, bar, time;
SELECT time, bar from cpu order by bar, time;
SELECT bar from cpu order by bar;
SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat;
SELECT count(time) from cpu;
SELECT count(foo) from cpu;
SELECT count(bar) from cpu;
SELECT count(*) from cpu;
SELECT min(bar) from cpu;
SELECT foo from cpu order by foo;

View File

@ -1,77 +0,0 @@
-- Test Setup: ThreeDeleteThreeChunks
-- SQL: SELECT min(foo) from cpu;
+--------------+
| MIN(cpu.foo) |
+--------------+
| me |
+--------------+
-- SQL: SELECT max(foo) from cpu;
+--------------+
| MAX(cpu.foo) |
+--------------+
| you |
+--------------+
-- SQL: SELECT min(time) from cpu;
+--------------------------------+
| MIN(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
+--------------------------------+
-- SQL: SELECT max(time) from cpu;
+--------------------------------+
| MAX(cpu.time) |
+--------------------------------+
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT foo, min(time) from cpu group by foo;
-- Results After Sorting
+-----+--------------------------------+
| foo | MIN(cpu.time) |
+-----+--------------------------------+
| me | 1970-01-01T00:00:00.000000040Z |
| you | 1970-01-01T00:00:00.000000070Z |
+-----+--------------------------------+
-- SQL: SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time;
+-----+--------------------------------+
| bar | max_time |
+-----+--------------------------------+
| 1 | 1970-01-01T00:00:00.000000062Z |
| 3 | 1970-01-01T00:00:00.000000070Z |
| 4 | 1970-01-01T00:00:00.000000050Z |
| 5 | 1970-01-01T00:00:00.000000060Z |
| 7 | 1970-01-01T00:00:00.000000080Z |
+-----+--------------------------------+
-- SQL: SELECT max(time) as max_time from cpu group by bar order by max_time;
+--------------------------------+
| max_time |
+--------------------------------+
| 1970-01-01T00:00:00.000000050Z |
| 1970-01-01T00:00:00.000000060Z |
| 1970-01-01T00:00:00.000000062Z |
| 1970-01-01T00:00:00.000000070Z |
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT time from cpu order by time;
+--------------------------------+
| time |
+--------------------------------+
| 1970-01-01T00:00:00.000000040Z |
| 1970-01-01T00:00:00.000000042Z |
| 1970-01-01T00:00:00.000000050Z |
| 1970-01-01T00:00:00.000000060Z |
| 1970-01-01T00:00:00.000000062Z |
| 1970-01-01T00:00:00.000000070Z |
| 1970-01-01T00:00:00.000000080Z |
+--------------------------------+
-- SQL: SELECT max(bar) from cpu;
+--------------+
| MAX(cpu.bar) |
+--------------+
| 7 |
+--------------+
-- SQL: SELECT min(time), max(time) from cpu;
+--------------------------------+--------------------------------+
| MIN(cpu.time) | MAX(cpu.time) |
+--------------------------------+--------------------------------+
| 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z |
+--------------------------------+--------------------------------+

View File

@ -1,19 +0,0 @@
-- Demonstrate soft deleted rows will not be returned by queries
-- IOX_SETUP: ThreeDeleteThreeChunks
SELECT min(foo) from cpu;
SELECT max(foo) from cpu;
SELECT min(time) from cpu;
SELECT max(time) from cpu;
-- IOX_COMPARE: sorted
SELECT foo, min(time) from cpu group by foo;
SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time;
SELECT max(time) as max_time from cpu group by bar order by max_time;
SELECT time from cpu order by time;
SELECT max(bar) from cpu;
SELECT min(time), max(time) from cpu;

Some files were not shown because too many files have changed in this diff Show More