diff --git a/Cargo.lock b/Cargo.lock index aefdbb86b7..92a4115b4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1050,7 +1050,7 @@ dependencies = [ "influxdb_line_protocol", "iox_time", "observability_deps", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "percent-encoding", "schema", "serde", @@ -1094,7 +1094,7 @@ dependencies = [ "log", "num_cpus", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parking_lot 0.12.1", "parquet", "paste", @@ -1116,7 +1116,7 @@ source = "git+https://github.com/apache/arrow-datafusion.git?rev=c7f3a70a79ee840 dependencies = [ "arrow", "object_store", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "parquet", "sqlparser 0.23.0", ] @@ -1163,7 +1163,7 @@ dependencies = [ "hashbrown", "lazy_static", "md-5", - "ordered-float 3.1.0", + "ordered-float 3.2.0", "paste", "rand", "regex", @@ -1741,9 +1741,9 @@ dependencies = [ [[package]] name = "handlebars" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56b224eaa4987c03c30b251de7ef0c15a6a59f34222905850dbc3026dfb24d5f" +checksum = "433e4ab33f1213cdc25b5fa45c76881240cfe79284cf2b395e8b9e312a30a2fd" dependencies = [ "log", "pest", @@ -2061,7 +2061,9 @@ dependencies = [ "data_types", "datafusion 0.1.0", "dotenvy", + "flate2", "futures", + "futures-util", "generated_types", "hashbrown", "http", @@ -2126,12 +2128,13 @@ dependencies = [ "client_util", "futures-util", "generated_types", - "mockito", + "influxdb_line_protocol", "prost 0.11.0", "rand", "reqwest", "thiserror", "tokio", + "tokio-stream", "tonic", ] @@ -2182,7 +2185,7 @@ version = "0.1.0" dependencies = [ "generated_types", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "workspace-hack", ] @@ -2222,6 +2225,7 @@ dependencies = [ "pin-project", "predicate", "prost 0.11.0", + "rand", "schema", "snafu", "test_helpers", @@ -2681,9 +2685,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.134" +version = "0.2.135" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" [[package]] name = "libloading" @@ -3130,9 +3134,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2168fee79ee3e7695905bc3a48777d807f82d956f821186fa7a2601c1295a73e" +checksum = "56ce10a205d9f610ae3532943039c34c145930065ce0c4284134c897fe6073b1" dependencies = [ "async-trait", "base64", @@ -3142,7 +3146,7 @@ dependencies = [ "itertools", "parking_lot 0.12.1", "percent-encoding", - "quick-xml 0.24.1", + "quick-xml 0.25.0", "rand", "reqwest", "ring", @@ -3207,9 +3211,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a" +checksum = "129d36517b53c461acc6e1580aeb919c8ae6708a4b1eae61c4463a615d4f0411" dependencies = [ "num-traits", ] @@ -3581,7 +3585,7 @@ dependencies = [ "schema", "serde_json", "snafu", - "sqlparser 0.24.0", + "sqlparser 0.25.0", "test_helpers", "workspace-hack", ] @@ -3670,9 +3674,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ "unicode-ident", ] @@ -3942,9 +3946,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.24.1" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37dddbbe9df96afafcb8027fcf263971b726530e12f0787f620a7ba5b4846081" 
+checksum = "58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9" dependencies = [ "memchr", "serde", @@ -4412,9 +4416,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" dependencies = [ "itoa 1.0.3", "ryu", @@ -4669,15 +4673,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2" +checksum = "dd726aec4ebad65756394ff89a9b9598793d4e30121cd71690244c1e497b3aee" dependencies = [ "doc-comment", "snafu-derive", @@ -4685,9 +4689,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5" +checksum = "712529e9b0b014eabaa345b38e06032767e3dc393e8b017e853b1d7247094e74" dependencies = [ "heck", "proc-macro2", @@ -4748,9 +4752,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac9c312566fdfc45a38ecf1924013c82af2a7d5315e46f67b1cc987f12be260" +checksum = "0781f2b6bd03e5adf065c8e772b49eaea9f640d06a1b9130330fe8bd2563f4fd" dependencies = [ "log", ] @@ -4953,9 +4957,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.101" +version = "1.0.102" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" dependencies = [ "proc-macro2", "quote", @@ -5228,9 +5232,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -5434,9 +5438,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", "log", @@ -5447,9 +5451,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -5458,9 +5462,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index 761e13140b..0104b2eee1 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -11,10 +11,10 @@ humantime = "2.1.0" iox_catalog = { path = 
"../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tempfile = "3.1.0" trace = { path = "../trace" } diff --git a/compactor/Cargo.toml b/compactor/Cargo.toml index 8a366ab903..7cb6a78574 100644 --- a/compactor/Cargo.toml +++ b/compactor/Cargo.toml @@ -14,7 +14,7 @@ datafusion = { path = "../datafusion" } futures = "0.3" iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } predicate = { path = "../predicate" } diff --git a/compactor/src/cold.rs b/compactor/src/cold.rs index 1eb3aad4ee..f4a59dcf65 100644 --- a/compactor/src/cold.rs +++ b/compactor/src/cold.rs @@ -45,7 +45,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::Initial, compact_in_parallel, - false, // no split + true, // split candidates.clone().into(), ) .await; @@ -57,7 +57,7 @@ pub async fn compact(compactor: Arc, do_full_compact: bool) -> usize compaction_type, CompactionLevel::FileNonOverlapped, compact_in_parallel, - false, // don't split + true, // split candidates.into(), ) .await; @@ -812,24 +812,42 @@ mod tests { compact(compactor, true).await; - // Should have 1 non-soft-deleted file: + // Should have 2 non-soft-deleted file: // - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - the 2 level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 1, "{files:?}"); + assert_eq!(files.len(), 2, "{files:?}"); 
let files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) .collect(); // The initial files are: L0 1-4, L1 5-6. The first step of cold compaction took files 1-5 - // and compacted them into a l-1 file 7. The second step of cold compaction - // took 6 and 7 and combined them all into file 8. - assert_eq!(files_and_levels, vec![(8, CompactionLevel::Final)]); + // and compacted them into two l-1 files 7, 8. The second step of cold compaction + // took 6, 7, and 8 and combined them all into two files 9 and 10. + assert_eq!( + files_and_levels, + vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)] + ); // ------------------------------------------------ // Verify the parquet file content + // first file: + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", + "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // second file let file = files.pop().unwrap(); let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( @@ -847,9 +865,7 @@ mod tests { "| 20 | | VT | 20 | 1970-01-01T00:00:00.000026Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", - "| 421 | | OH | 21 | 1970-01-01T00:00:00.000091Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", - "| 81601 | | PA | 15 | 1970-01-01T00:00:00.000090Z |", "+-----------+------+------+------+--------------------------------+", ], &batches @@ -1027,14 +1043,14 @@ mod tests { compact(compactor, true).await; - // Should have 3 non-soft-deleted files: + // Should have 4 non-soft-deleted files: // // - pf4, the 
level 1 file untouched because it didn't fit in the memory budget // - pf6, the level 2 file untouched because it doesn't overlap anything - // - the level 2 file created after combining all 3 level 1 files created by the first step + // - two level-2 files created after combining all 3 level 1 files created by the first step // of compaction to compact remaining level 0 files let mut files = catalog.list_by_table_not_to_delete(table.table.id).await; - assert_eq!(files.len(), 3, "{files:?}"); + assert_eq!(files.len(), 4, "{files:?}"); let files_and_levels: Vec<_> = files .iter() .map(|f| (f.id.get(), f.compaction_level)) @@ -1042,20 +1058,35 @@ mod tests { // File 4 was L1 but didn't fit in the memory budget, so was untouched. // File 6 was already L2 and did not overlap with anything, so was untouched. - // Cold compaction took files 1, 2, 3, 5 and compacted them into file 7. + // Cold compaction took files 1, 2, 3, 5 and compacted them into 2 files 7 and 8. assert_eq!( files_and_levels, vec![ (4, CompactionLevel::FileNonOverlapped), (6, CompactionLevel::Final), (7, CompactionLevel::Final), + (8, CompactionLevel::Final), ] ); // ------------------------------------------------ // Verify the parquet file content - let file1 = files.pop().unwrap(); - let batches = table.read_parquet_file(file1).await; + // newly created L-2 with largest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+------+-----------------------------+", + "| field_int | tag1 | tag2 | tag3 | time |", + "+-----------+------+------+------+-----------------------------+", + "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", + "+-----------+------+------+------+-----------------------------+", + ], + &batches + ); + // newly created L-2 with smallest timestamp + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ 
"+-----------+------+------+------+--------------------------------+", @@ -1068,15 +1099,14 @@ mod tests { "| 1500 | WA | | | 1970-01-01T00:00:00.000008Z |", "| 1601 | | PA | 15 | 1970-01-01T00:00:00.000000009Z |", "| 21 | | OH | 21 | 1970-01-01T00:00:00.000000025Z |", - "| 270 | UT | | | 1970-01-01T00:00:00.000025Z |", "| 70 | UT | | | 1970-01-01T00:00:00.000020Z |", "+-----------+------+------+------+--------------------------------+", ], &batches ); - - let file0 = files.pop().unwrap(); - let batches = table.read_parquet_file(file0).await; + // available L2 that does not overlap + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; assert_batches_sorted_eq!( &[ "+-----------+------+------+-----------------------------+", @@ -1088,6 +1118,20 @@ mod tests { ], &batches ); + // available L1 that did not fit in the memory budget + let file = files.pop().unwrap(); + let batches = table.read_parquet_file(file).await; + assert_batches_sorted_eq!( + &[ + "+-----------+------+------+-----------------------------+", + "| field_int | tag2 | tag3 | time |", + "+-----------+------+------+-----------------------------+", + "| 1600 | WA | 10 | 1970-01-01T00:00:00.000028Z |", + "| 20 | VT | 20 | 1970-01-01T00:00:00.000026Z |", + "+-----------+------+------+-----------------------------+", + ], + &batches + ); } struct TestDb { diff --git a/compactor/src/query.rs b/compactor/src/query.rs index ea6e219d4e..20a8d068cc 100644 --- a/compactor/src/query.rs +++ b/compactor/src/query.rs @@ -4,10 +4,10 @@ use data_types::{ ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber, TableSummary, Timestamp, TimestampMinMax, Tombstone, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use 
observability_deps::tracing::trace; use parquet_file::chunk::ParquetChunk; @@ -194,7 +194,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -208,7 +208,7 @@ impl QueryChunk for QueryableParquetChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -230,7 +230,7 @@ impl QueryChunk for QueryableParquetChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { ctx.set_metadata("storage", "compactor"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); @@ -238,7 +238,7 @@ impl QueryChunk for QueryableParquetChunk { self.data .read_filter(predicate, selection) .context(ReadParquetSnafu) - .map_err(|e| Box::new(e) as _) + .map_err(|e| DataFusionError::External(Box::new(e))) } /// Returns chunk type diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs index 75fd250dd0..38a9c8cd05 100644 --- a/datafusion_util/src/lib.rs +++ b/datafusion_util/src/lib.rs @@ -15,7 +15,7 @@ use datafusion::execution::context::TaskContext; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::common::SizedRecordBatchStream; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion::{ arrow::{ @@ -236,12 +236,19 @@ where } /// Create a SendableRecordBatchStream a RecordBatch -pub fn stream_from_batch(batch: RecordBatch) -> SendableRecordBatchStream { - stream_from_batches(vec![Arc::new(batch)]) +pub fn stream_from_batch(schema: Arc, batch: RecordBatch) -> SendableRecordBatchStream { + 
stream_from_batches(schema, vec![Arc::new(batch)]) } /// Create a SendableRecordBatchStream from Vec of RecordBatches with the same schema -pub fn stream_from_batches(batches: Vec>) -> SendableRecordBatchStream { +pub fn stream_from_batches( + schema: Arc, + batches: Vec>, +) -> SendableRecordBatchStream { + if batches.is_empty() { + return Box::pin(EmptyRecordBatchStream::new(schema)); + } + let dummy_metrics = ExecutionPlanMetricsSet::new(); let mem_metrics = MemTrackingMetrics::new(&dummy_metrics, 0); let stream = SizedRecordBatchStream::new(batches[0].schema(), batches, mem_metrics); diff --git a/docs/underground_guide.md b/docs/underground_guide.md index 201dd5e44b..c087bcce88 100644 --- a/docs/underground_guide.md +++ b/docs/underground_guide.md @@ -15,17 +15,25 @@ developers. Build IOx for release with pprof: ```shell +cd influxdb_iox cargo build --release --features=pprof ``` -## Step 2: Start redpanda and postgres +You can also install the `influxdb_iox` command locally via -Now, start up redpanda and postgres locally in docker containers: +```shell +cd influxdb_iox +cargo install --path influxdb_iox +``` + +## Step 2: Start kafka and postgres + +Now, start up kafka and postgres locally in docker containers: ```shell # get rskafka from https://github.com/influxdata/rskafka cd rskafka -# Run redpanda on localhost:9010 -docker-compose -f docker-compose-redpanda.yml up & +# Run kafka on localhost:9010 +docker-compose -f docker-compose-kafka.yml up & # now run postgres docker run -p 5432:5432 -e POSTGRES_HOST_AUTH_METHOD=trust postgres & ``` @@ -136,8 +144,8 @@ INFLUXDB_IOX_GRPC_BIND_ADDR=localhost:8084 \ INFLUXDB_IOX_WRITE_BUFFER_TYPE=kafka \ INFLUXDB_IOX_WRITE_BUFFER_ADDR=localhost:9010 \ xINFLUXDB_IOX_WRITE_BUFFER_AUTO_CREATE_TOPICS=10 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_START=0 \ -INFLUXDB_IOX_WRITE_BUFFER_PARTITION_RANGE_END=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_START=0 \ +INFLUXDB_IOX_SHARD_INDEX_RANGE_END=0 \ 
INFLUXDB_IOX_PAUSE_INGEST_SIZE_BYTES=5000000000 \ INFLUXDB_IOX_PERSIST_MEMORY_THRESHOLD_BYTES=4000000000 \ INFLUXDB_IOX_CATALOG_DSN=postgres://postgres@localhost:5432/postgres \ @@ -151,6 +159,11 @@ LOG_FILTER=info \ # Step 5: Ingest data +You can load data using the influxdb_iox client: +```shell +influxdb_iox --host=http://localhost:8080 -v write test_db test_fixtures/lineproto/*.lp +``` + Now you can post data to `http://localhost:8080` with your favorite load generating tool My favorite is https://github.com/alamb/low_card @@ -171,3 +184,17 @@ posting fairly large requests (necessitating the # Step 6: Profile See [`profiling.md`](./profiling.md). + + +# Step 7: Clean up local state + +If you find yourself needing to clean up postgres / kafka state use these commands: +```shell +docker ps -a -q | xargs docker stop +docker rm rskafka_proxy_1 +docker rm rskafka_kafka-0_1 +docker rm rskafka_kafka-1_1 +docker rm rskafka_kafka-2_1 +docker rm rskafka_zookeeper_1 +docker volume rm rskafka_kafka_0_data rskafka_kafka_1_data rskafka_kafka_2_data rskafka_zookeeper_data +``` diff --git a/garbage_collector/Cargo.toml b/garbage_collector/Cargo.toml index a3e1362cb8..84bf828604 100644 --- a/garbage_collector/Cargo.toml +++ b/garbage_collector/Cargo.toml @@ -11,7 +11,7 @@ data_types = { path = "../data_types" } futures = "0.3" humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } observability_deps = { path = "../observability_deps" } snafu = "0.7" tokio = { version = "1", features = ["macros", "rt", "sync"] } diff --git a/generated_types/protos/influxdata/iox/ingester/v1/query.proto b/generated_types/protos/influxdata/iox/ingester/v1/query.proto index ff7cc66209..fc0ca483f2 100644 --- a/generated_types/protos/influxdata/iox/ingester/v1/query.proto +++ b/generated_types/protos/influxdata/iox/ingester/v1/query.proto @@ -82,8 +82,9 @@ message PartitionStatus { // Max sequence number persisted 
optional int64 parquet_max_sequence_number = 1; - // Max sequence number for a tombstone associated - optional int64 tombstone_max_sequence_number = 2; + // Deprecated tombstone support in ingester (#5825). + reserved "tombstone_max_sequence_number"; + reserved 2; } // Serialization of `predicate::predicate::Predicate` that contains DataFusion `Expr`s diff --git a/import/Cargo.toml b/import/Cargo.toml index c773711a23..20d0a3cdc3 100644 --- a/import/Cargo.toml +++ b/import/Cargo.toml @@ -13,11 +13,11 @@ futures = "0.3" generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } -object_store = { version = "0.5.0", features = ["aws"] } +object_store = { version = "0.5.1", features = ["aws"] } observability_deps = { path = "../observability_deps" } schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.82" +serde_json = "1.0.86" thiserror = "1.0.37" tokio = { version = "1.21" } tonic = { version = "0.8" } diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml index 060445779b..b3858aac87 100644 --- a/influxdb2_client/Cargo.toml +++ b/influxdb2_client/Cargo.toml @@ -9,7 +9,7 @@ bytes = "1.2" futures = { version = "0.3", default-features = false } reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" url = "2.3.1" uuid = { version = "1", features = ["v4"] } diff --git a/influxdb_influxql_parser/src/common.rs b/influxdb_influxql_parser/src/common.rs index 51266177d6..a6b245c397 100644 --- a/influxdb_influxql_parser/src/common.rs +++ b/influxdb_influxql_parser/src/common.rs @@ -2,6 +2,7 @@ use crate::expression::conditional::{conditional_expression, ConditionalExpressi use crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use 
crate::literal::unsigned_integer; +use crate::string::{regex, Regex}; use core::fmt; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; @@ -11,73 +12,82 @@ use nom::multi::separated_list1; use nom::sequence::{pair, preceded, terminated}; use std::fmt::{Display, Formatter}; -/// Represents a fully-qualified measurement name. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct MeasurementNameExpression { +/// Represents a measurement name as either an identifier or a regular expression. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MeasurementName { + /// A measurement name expressed as an [`Identifier`]. + Name(Identifier), + + /// A measurement name expressed as a [`Regex`]. + Regex(Regex), +} + +impl Parser for MeasurementName { + /// Parse a measurement name, which may be an identifier or a regular expression. + fn parse(i: &str) -> ParseResult<&str, Self> { + alt(( + map(identifier, MeasurementName::Name), + map(regex, MeasurementName::Regex), + ))(i) + } +} + +impl Display for MeasurementName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Name(ident) => fmt::Display::fmt(ident, f), + Self::Regex(regex) => fmt::Display::fmt(regex, f), + } + } +} + +/// Represents a fully-qualified, 3-part measurement name. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QualifiedMeasurementName { pub database: Option, pub retention_policy: Option, - pub name: Identifier, + pub name: MeasurementName, } -impl MeasurementNameExpression { - /// Constructs a new `MeasurementNameExpression` with the specified `name`. - pub fn new(name: Identifier) -> Self { - Self { - database: None, - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. 
- pub fn new_db(name: Identifier, database: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: None, - name, - } - } - - /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. - pub fn new_db_rp(name: Identifier, database: Identifier, retention_policy: Identifier) -> Self { - Self { - database: Some(database), - retention_policy: Some(retention_policy), - name, - } - } -} - -impl fmt::Display for MeasurementNameExpression { +impl Display for QualifiedMeasurementName { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self { database: None, retention_policy: None, name, - } => write!(f, "{}", name)?, + } => write!(f, "{}", name), Self { database: Some(db), retention_policy: None, name, - } => write!(f, "{}..{}", db, name)?, + } => write!(f, "{}..{}", db, name), Self { database: None, retention_policy: Some(rp), name, - } => write!(f, "{}.{}", rp, name)?, + } => write!(f, "{}.{}", rp, name), Self { database: Some(db), retention_policy: Some(rp), name, - } => write!(f, "{}.{}.{}", db, rp, name)?, - }; - Ok(()) + } => write!(f, "{}.{}.{}", db, rp, name), + } } } -/// Match a 3-part measurement name expression. -pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementNameExpression> { +/// Match a fully-qualified, 3-part measurement name. +/// +/// ```text +/// qualified_measurement_name ::= measurement_name | +/// ( policy_name "." measurement_name ) | +/// ( db_name "." policy_name? "." measurement_name ) +/// +/// db_name ::= identifier +/// policy_name ::= identifier +/// measurement_name ::= identifier | regex_lit +/// ``` +pub fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> { let (remaining_input, (opt_db_rp, name)) = pair( opt(alt(( // database "." retention_policy "." @@ -93,7 +103,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName // retention_policy "." 
map(terminated(identifier, tag(".")), |rp| (None, Some(rp))), ))), - identifier, + MeasurementName::parse, )(i)?; // Extract possible `database` and / or `retention_policy` @@ -104,7 +114,7 @@ pub fn measurement_name_expression(i: &str) -> ParseResult<&str, MeasurementName Ok(( remaining_input, - MeasurementNameExpression { + QualifiedMeasurementName { database, retention_policy, name, @@ -290,35 +300,107 @@ mod tests { use crate::assert_expect_error; use nom::character::complete::alphanumeric1; - #[test] - fn test_measurement_name_expression() { - let (_, got) = measurement_name_expression("diskio").unwrap(); - assert_eq!( - got, - MeasurementNameExpression { + impl From<&str> for MeasurementName { + /// Convert a `str` to [`MeasurementName::Name`]. + fn from(s: &str) -> Self { + Self::Name(Identifier(s.into())) + } + } + + impl QualifiedMeasurementName { + /// Constructs a new `MeasurementNameExpression` with the specified `name`. + pub fn new(name: MeasurementName) -> Self { + Self { database: None, retention_policy: None, - name: "diskio".into(), + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. + pub fn new_db(name: MeasurementName, database: Identifier) -> Self { + Self { + database: Some(database), + retention_policy: None, + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. 
+ pub fn new_db_rp( + name: MeasurementName, + database: Identifier, + retention_policy: Identifier, + ) -> Self { + Self { + database: Some(database), + retention_policy: Some(retention_policy), + name, + } + } + } + + #[test] + fn test_qualified_measurement_name() { + use MeasurementName::*; + + let (_, got) = qualified_measurement_name("diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf.autogen.diskio").unwrap(); + let (_, got) = qualified_measurement_name("/diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: Some("autogen".into()), - name: "diskio".into(), + name: Name("diskio".into()), } ); - let (_, got) = measurement_name_expression("telegraf..diskio").unwrap(); + let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap(); assert_eq!( got, - MeasurementNameExpression { + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { database: Some("telegraf".into()), retention_policy: None, - name: "diskio".into(), + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Regex("diskio".into()), } ); } diff --git a/influxdb_influxql_parser/src/delete.rs b/influxdb_influxql_parser/src/delete.rs 
index 3613e027ea..6d8a8c7cad 100644 --- a/influxdb_influxql_parser/src/delete.rs +++ b/influxdb_influxql_parser/src/delete.rs @@ -73,9 +73,14 @@ mod test { // Validate via the Display trait, as we don't need to validate the contents of the // FROM and / or WHERE clauses, given they are tested in their on modules. + // Measurement name expressed as an identifier let (_, got) = delete_statement("DELETE FROM foo").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo"); + // Measurement name expressed as a regular expression + let (_, got) = delete_statement("DELETE FROM /foo/").unwrap(); + assert_eq!(format!("{}", got), "DELETE FROM /foo/"); + let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap(); assert_eq!(format!("{}", got), "DELETE FROM foo WHERE time > 10"); diff --git a/influxdb_influxql_parser/src/explain.rs b/influxdb_influxql_parser/src/explain.rs new file mode 100644 index 0000000000..c9576aa3e8 --- /dev/null +++ b/influxdb_influxql_parser/src/explain.rs @@ -0,0 +1,140 @@ +#![allow(dead_code)] // Temporary + +use crate::internal::{expect, ParseResult}; +use crate::select::{select_statement, SelectStatement}; +use nom::branch::alt; +use nom::bytes::complete::tag_no_case; +use nom::character::complete::multispace1; +use nom::combinator::{map, opt, value}; +use nom::sequence::{preceded, tuple}; +use std::fmt::{Display, Formatter}; + +/// Represents various options for an `EXPLAIN` statement. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ExplainOption { + /// `EXPLAIN VERBOSE statement` + Verbose, + /// `EXPLAIN ANALYZE statement` + Analyze, + /// `EXPLAIN ANALYZE VERBOSE statement` + AnalyzeVerbose, +} + +impl Display for ExplainOption { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Verbose => f.write_str("VERBOSE"), + Self::Analyze => f.write_str("ANALYZE"), + Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"), + } + } +} + +/// Represents an `EXPLAIN` statement. 
+/// +/// ```text +/// explain ::= "EXPLAIN" explain_options? select_statement +/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? ) +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct ExplainStatement { + options: Option, + select: Box, +} + +impl Display for ExplainStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str("EXPLAIN ")?; + if let Some(options) = &self.options { + write!(f, "{} ", options)?; + } + Display::fmt(&self.select, f) + } +} + +/// Parse an `EXPLAIN` statement. +pub fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> { + map( + tuple(( + tag_no_case("EXPLAIN"), + opt(preceded( + multispace1, + alt(( + map( + preceded( + tag_no_case("ANALYZE"), + opt(preceded(multispace1, tag_no_case("VERBOSE"))), + ), + |v| match v { + // If the optional combinator is Some, then it matched VERBOSE + Some(_) => ExplainOption::AnalyzeVerbose, + _ => ExplainOption::Analyze, + }, + ), + value(ExplainOption::Verbose, tag_no_case("VERBOSE")), + )), + )), + multispace1, + expect( + "invalid EXPLAIN statement, expected SELECT statement", + select_statement, + ), + )), + |(_, options, _, select)| ExplainStatement { + options, + select: Box::new(select), + }, + )(i) +} + +#[cfg(test)] +mod test { + use crate::assert_expect_error; + use crate::explain::{explain_statement, ExplainOption}; + use assert_matches::assert_matches; + + #[test] + fn test_explain_statement() { + let (remain, got) = explain_statement("EXPLAIN SELECT val from temp").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(format!("{}", got), "EXPLAIN SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(format!("{}", got), "EXPLAIN VERBOSE SELECT val FROM temp"); + + let (remain, got) = 
explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(format!("{}", got), "EXPLAIN ANALYZE SELECT val FROM temp"); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!( + format!("{}", got), + "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp" + ); + + // Fallible cases + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SHOW DATABASES"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp"), + "invalid EXPLAIN statement, expected SELECT statement" + ); + + // surfaces statement-specific errors + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"), + "invalid FROM clause, expected identifier, regular expression or subquery" + ); + } +} diff --git a/influxdb_influxql_parser/src/internal.rs b/influxdb_influxql_parser/src/internal.rs index f9a2b2dcdc..a18c6f5a10 100644 --- a/influxdb_influxql_parser/src/internal.rs +++ b/influxdb_influxql_parser/src/internal.rs @@ -22,12 +22,10 @@ impl Display for Error { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Syntax { input: _, message } => { - write!(f, "Syntax error: {}", message)?; + write!(f, "Syntax error: {}", message) } - Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind)?, + Self::Nom(_, kind) => write!(f, "nom error: {:?}", kind), } - - Ok(()) } } diff --git a/influxdb_influxql_parser/src/lib.rs b/influxdb_influxql_parser/src/lib.rs index 32842c0615..231e3fe0e9 100644 --- a/influxdb_influxql_parser/src/lib.rs +++ b/influxdb_influxql_parser/src/lib.rs @@ -29,6 +29,7 @@ mod test_util; mod common; mod delete; mod drop; +mod explain; mod expression; mod 
identifier; mod internal; diff --git a/influxdb_influxql_parser/src/select.rs b/influxdb_influxql_parser/src/select.rs index 111c0c869c..7b9764c182 100644 --- a/influxdb_influxql_parser/src/select.rs +++ b/influxdb_influxql_parser/src/select.rs @@ -1,6 +1,6 @@ use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, order_by_clause, where_clause, - MeasurementNameExpression, OneOrMore, OrderByClause, Parser, + limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause, + OneOrMore, OrderByClause, Parser, QualifiedMeasurementName, }; use crate::expression::arithmetic::Expr::Wildcard; use crate::expression::arithmetic::{ @@ -164,8 +164,7 @@ pub fn select_statement(i: &str) -> ParseResult<&str, SelectStatement> { /// Represents a single measurement selection found in a `FROM` clause. #[derive(Clone, Debug, PartialEq)] pub enum MeasurementSelection { - Name(MeasurementNameExpression), - Regex(Regex), + Name(QualifiedMeasurementName), Subquery(Box), } @@ -173,7 +172,6 @@ impl Display for MeasurementSelection { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self::Name(ref name) => fmt::Display::fmt(name, f), - Self::Regex(ref re) => fmt::Display::fmt(re, f), Self::Subquery(ref subquery) => write!(f, "({})", subquery), } } @@ -182,8 +180,7 @@ impl Display for MeasurementSelection { impl Parser for MeasurementSelection { fn parse(i: &str) -> ParseResult<&str, Self> { alt(( - map(measurement_name_expression, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), + map(qualified_measurement_name, MeasurementSelection::Name), map( delimited( preceded(multispace0, char('(')), @@ -812,7 +809,7 @@ mod test { assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("/regex/").unwrap(); - assert_matches!(got, MeasurementSelection::Regex(_)); + assert_matches!(got, MeasurementSelection::Name(_)); let (_, got) = MeasurementSelection::parse("(SELECT foo FROM 
bar)").unwrap(); assert_matches!(got, MeasurementSelection::Subquery(_)); diff --git a/influxdb_influxql_parser/src/show_measurements.rs b/influxdb_influxql_parser/src/show_measurements.rs index 582d562df8..d5277fad9b 100644 --- a/influxdb_influxql_parser/src/show_measurements.rs +++ b/influxdb_influxql_parser/src/show_measurements.rs @@ -2,24 +2,21 @@ //! //! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-schema/#show-measurements +use crate::common::{ + limit_clause, offset_clause, qualified_measurement_name, where_clause, QualifiedMeasurementName, +}; +use crate::expression::conditional::ConditionalExpression; +use crate::identifier::{identifier, Identifier}; use crate::internal::{expect, ParseResult}; use nom::branch::alt; use nom::bytes::complete::{tag, tag_no_case}; -use nom::character::complete::{char, multispace0, multispace1}; +use nom::character::complete::{multispace0, multispace1}; use nom::combinator::{map, opt, value}; use nom::sequence::tuple; use nom::sequence::{pair, preceded, terminated}; use std::fmt; use std::fmt::Formatter; -use crate::common::{ - limit_clause, measurement_name_expression, offset_clause, where_clause, - MeasurementNameExpression, -}; -use crate::expression::conditional::ConditionalExpression; -use crate::identifier::{identifier, Identifier}; -use crate::string::{regex, Regex}; - /// OnExpression represents an InfluxQL database or retention policy name /// or a wildcard. 
#[derive(Clone, Debug, Eq, Hash, PartialEq)] @@ -110,18 +107,16 @@ impl fmt::Display for ShowMeasurementsStatement { #[derive(Clone, Debug, Eq, PartialEq)] pub enum MeasurementExpression { - Equals(MeasurementNameExpression), - Regex(Regex), + Equals(QualifiedMeasurementName), + Regex(QualifiedMeasurementName), } impl fmt::Display for MeasurementExpression { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Self::Equals(ref name) => write!(f, "= {}", name)?, - Self::Regex(ref re) => write!(f, "=~ {}", re)?, - }; - - Ok(()) + Self::Equals(ref name) => write!(f, "= {}", name), + Self::Regex(ref re) => write!(f, "=~ {}", re), + } } } @@ -140,23 +135,15 @@ fn with_measurement_clause(i: &str) -> ParseResult<&str, MeasurementExpression> "expected = or =~", alt(( map( - tuple(( - tag("=~"), - multispace0, - expect("expected regular expression literal", regex), - )), - |(_, _, regex)| MeasurementExpression::Regex(regex), + preceded(pair(tag("=~"), multispace0), qualified_measurement_name), + MeasurementExpression::Regex, ), map( - tuple(( - char('='), - multispace0, - expect( - "expected measurement name or wildcard", - measurement_name_expression, - ), - )), - |(_, _, name)| MeasurementExpression::Equals(name), + preceded( + pair(tag("="), multispace0), + expect("expected measurement name", qualified_measurement_name), + ), + MeasurementExpression::Equals, ), )), ), @@ -200,6 +187,7 @@ pub fn show_measurements(i: &str) -> ParseResult<&str, ShowMeasurementsStatement mod test { use super::*; use crate::assert_expect_error; + use crate::common::MeasurementName; use crate::expression::arithmetic::Expr; use assert_matches::assert_matches; @@ -232,7 +220,7 @@ mod test { ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), measurement_expression: Some(MeasurementExpression::Equals( - MeasurementNameExpression { + QualifiedMeasurementName { database: None, retention_policy: None, name: "bar".into(), @@ -255,7 +243,9 @@ mod test 
{ got, ShowMeasurementsStatement { on_expression: Some(OnExpression::Database("foo".into())), - measurement_expression: Some(MeasurementExpression::Regex(Regex("bar".into()))), + measurement_expression: Some(MeasurementExpression::Regex( + QualifiedMeasurementName::new(MeasurementName::Regex("bar".into())) + )), condition: Some(Expr::Literal(true.into()).into()), limit: None, offset: None @@ -343,33 +333,50 @@ mod test { #[test] fn test_with_measurement_clause() { + use crate::common::MeasurementName::*; + let (_, got) = with_measurement_clause("WITH measurement = foo").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: None, - retention_policy: None, - name: "foo".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new(Name("foo".into()))) ); let (_, got) = with_measurement_clause("WITH measurement =~ /foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Expressions are still valid when whitespace is omitted let (_, got) = with_measurement_clause("WITH measurement=foo..bar").unwrap(); assert_eq!( got, - MeasurementExpression::Equals(MeasurementNameExpression { - database: Some("foo".into()), - retention_policy: None, - name: "bar".into() - }) + MeasurementExpression::Equals(QualifiedMeasurementName::new_db( + Name("bar".into()), + "foo".into() + )) ); let (_, got) = with_measurement_clause("WITH measurement=~/foo/").unwrap(); - assert_eq!(got, MeasurementExpression::Regex(Regex("foo".into()))); + assert_eq!( + got, + MeasurementExpression::Regex(QualifiedMeasurementName::new(Regex("foo".into()))) + ); + + // Quirks of InfluxQL per https://github.com/influxdata/influxdb_iox/issues/5662 + + let (_, got) = with_measurement_clause("WITH measurement =~ foo").unwrap(); + assert_eq!( + got, + 
MeasurementExpression::Regex(QualifiedMeasurementName::new(Name("foo".into()))) + ); + + let (_, got) = with_measurement_clause("WITH measurement = /foo/").unwrap(); + assert_eq!( + got, + MeasurementExpression::Equals(QualifiedMeasurementName::new(Regex("foo".into()))) + ); // Fallible cases @@ -379,28 +386,16 @@ mod test { "invalid WITH clause, expected MEASUREMENT" ); - // Must have a regex for equal regex operator - assert_expect_error!( - with_measurement_clause("WITH measurement =~ foo"), - "expected regular expression literal" - ); - // Unsupported regex not equal operator assert_expect_error!( with_measurement_clause("WITH measurement !~ foo"), "expected = or =~" ); - // Must have an identifier for equal operator - assert_expect_error!( - with_measurement_clause("WITH measurement = /foo/"), - "expected measurement name or wildcard" - ); - // Must have an identifier assert_expect_error!( with_measurement_clause("WITH measurement = 1"), - "expected measurement name or wildcard" + "expected measurement name" ); } } diff --git a/influxdb_influxql_parser/src/simple_from_clause.rs b/influxdb_influxql_parser/src/simple_from_clause.rs index f3d7ab0481..07528a9fc2 100644 --- a/influxdb_influxql_parser/src/simple_from_clause.rs +++ b/influxdb_influxql_parser/src/simple_from_clause.rs @@ -1,41 +1,12 @@ -use crate::common::{measurement_name_expression, MeasurementNameExpression, OneOrMore, Parser}; +use crate::common::{ + qualified_measurement_name, MeasurementName, OneOrMore, Parser, QualifiedMeasurementName, +}; use crate::identifier::{identifier, Identifier}; use crate::internal::ParseResult; -use crate::string::{regex, Regex}; -use nom::branch::alt; use nom::bytes::complete::tag_no_case; use nom::character::complete::multispace1; -use nom::combinator::map; use nom::sequence::{pair, preceded}; use std::fmt; -use std::fmt::Formatter; - -/// Represents a single measurement selection found in a `FROM` measurement clause. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum MeasurementSelection { - Name(T), - Regex(Regex), -} - -impl Parser for MeasurementSelection { - fn parse(i: &str) -> ParseResult<&str, Self> { - alt(( - map(T::parse, MeasurementSelection::Name), - map(regex, MeasurementSelection::Regex), - ))(i) - } -} - -impl fmt::Display for MeasurementSelection { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match self { - Self::Name(ref name) => fmt::Display::fmt(name, f)?, - Self::Regex(ref re) => fmt::Display::fmt(re, f)?, - }; - - Ok(()) - } -} /// Represents a `FROM` clause of a `DELETE` or `SHOW` statement. /// @@ -43,7 +14,7 @@ impl fmt::Display for MeasurementSelection { /// for measurements names. /// /// A `FROM` clause for a number of `SHOW` statements can accept a 3-part measurement name or -pub type FromMeasurementClause = OneOrMore>; +pub type FromMeasurementClause = OneOrMore; fn from_clause(i: &str) -> ParseResult<&str, FromMeasurementClause> { preceded( @@ -54,9 +25,9 @@ fn from_clause(i: &str) -> ParseResult<&str, FromMeasu )(i) } -impl Parser for MeasurementNameExpression { +impl Parser for QualifiedMeasurementName { fn parse(i: &str) -> ParseResult<&str, Self> { - measurement_name_expression(i) + qualified_measurement_name(i) } } @@ -68,10 +39,9 @@ impl Parser for MeasurementNameExpression { /// It is defined by the following EBNF notation: /// /// ```text -/// from_clause ::= "FROM" measurement_selection ("," measurement_selection)* -/// measurement_selection ::= measurement +/// from_clause ::= "FROM" qualified_measurement_name ("," qualified_measurement_name)* /// -/// measurement ::= measurement_name | +/// qualified_measurement_name ::= measurement_name | /// ( policy_name "." measurement_name ) | /// ( db_name "." policy_name? "." 
measurement_name ) /// @@ -92,7 +62,7 @@ impl Parser for MeasurementNameExpression { /// ```text /// FROM foo, /bar/, some_database..foo, some_retention_policy.foobar /// ``` -pub type ShowFromClause = FromMeasurementClause; +pub type ShowFromClause = FromMeasurementClause; /// Parse a `FROM` clause for various `SHOW` statements. pub fn show_from_clause(i: &str) -> ParseResult<&str, ShowFromClause> { @@ -106,7 +76,7 @@ impl Parser for Identifier { } /// Represents a `FROM` clause for a `DELETE` statement. -pub type DeleteFromClause = FromMeasurementClause; +pub type DeleteFromClause = FromMeasurementClause; /// Parse a `FROM` clause for a `DELETE` statement. pub fn delete_from_clause(i: &str) -> ParseResult<&str, DeleteFromClause> { @@ -119,49 +89,52 @@ mod test { #[test] fn test_show_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = show_from_clause("FROM c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new("c".into()))]) + ShowFromClause::new(vec![QualifiedMeasurementName::new(Name("c".into()))]) ); let (_, from) = show_from_clause("FROM a..c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db( + Name("c".into()), "a".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM a.b.c").unwrap(); assert_eq!( from, - ShowFromClause::new(vec![Name(MeasurementNameExpression::new_db_rp( - "c".into(), + ShowFromClause::new(vec![QualifiedMeasurementName::new_db_rp( + Name("c".into()), "a".into(), "b".into() - ))]) + )]) ); let (_, from) = show_from_clause("FROM /reg/").unwrap(); - assert_eq!(from, ShowFromClause::new(vec![Regex("reg".into())])); + assert_eq!( + from, + ShowFromClause::new(vec![QualifiedMeasurementName::new(Regex("reg".into()))]) + ); let (_, from) = show_from_clause("FROM c, /reg/").unwrap(); assert_eq!( from, 
ShowFromClause::new(vec![ - Name(MeasurementNameExpression::new("c".into())), - Regex("reg".into()) + QualifiedMeasurementName::new(Name("c".into())), + QualifiedMeasurementName::new(Regex("reg".into())) ]) ); } #[test] fn test_delete_from_clause() { - use crate::simple_from_clause::MeasurementSelection::*; + use crate::common::MeasurementName::*; let (_, from) = delete_from_clause("FROM c").unwrap(); assert_eq!(from, DeleteFromClause::new(vec![Name("c".into())])); diff --git a/influxdb_influxql_parser/src/statement.rs b/influxdb_influxql_parser/src/statement.rs index 0455051e81..3275685c54 100644 --- a/influxdb_influxql_parser/src/statement.rs +++ b/influxdb_influxql_parser/src/statement.rs @@ -1,5 +1,6 @@ use crate::delete::{delete_statement, DeleteStatement}; use crate::drop::{drop_statement, DropMeasurementStatement}; +use crate::explain::{explain_statement, ExplainStatement}; use crate::internal::ParseResult; use crate::select::{select_statement, SelectStatement}; use crate::show::{show_statement, ShowDatabasesStatement}; @@ -19,6 +20,8 @@ pub enum Statement { Delete(Box), /// Represents a `DROP MEASUREMENT` statement. DropMeasurement(Box), + /// Represents an `EXPLAIN` statement. + Explain(Box), /// Represents a `SELECT` statement. Select(Box), /// Represents a `SHOW DATABASES` statement. 
@@ -40,6 +43,7 @@ impl Display for Statement { match self { Self::Delete(s) => Display::fmt(s, f), Self::DropMeasurement(s) => Display::fmt(s, f), + Self::Explain(s) => Display::fmt(s, f), Self::Select(s) => Display::fmt(s, f), Self::ShowDatabases(s) => Display::fmt(s, f), Self::ShowMeasurements(s) => Display::fmt(s, f), @@ -56,6 +60,7 @@ pub fn statement(i: &str) -> ParseResult<&str, Statement> { alt(( map(delete_statement, |s| Statement::Delete(Box::new(s))), map(drop_statement, |s| Statement::DropMeasurement(Box::new(s))), + map(explain_statement, |s| Statement::Explain(Box::new(s))), map(select_statement, |s| Statement::Select(Box::new(s))), show_statement, ))(i) @@ -77,6 +82,10 @@ mod test { let (got, _) = statement("DROP MEASUREMENT foo").unwrap(); assert_eq!(got, ""); + // explain_statement combinator + let (got, _) = statement("EXPLAIN SELECT * FROM cpu").unwrap(); + assert_eq!(got, ""); + let (got, _) = statement("SELECT * FROM foo WHERE time > now() - 5m AND host = 'bar' GROUP BY TIME(5m) FILL(previous) ORDER BY time DESC").unwrap(); assert_eq!(got, ""); diff --git a/influxdb_iox/Cargo.toml b/influxdb_iox/Cargo.toml index 504bc00ea1..1e00c64fa0 100644 --- a/influxdb_iox/Cargo.toml +++ b/influxdb_iox/Cargo.toml @@ -25,7 +25,7 @@ ioxd_querier = { path = "../ioxd_querier"} ioxd_router = { path = "../ioxd_router"} ioxd_test = { path = "../ioxd_test"} metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" object_store_metrics = { path = "../object_store_metrics" } observability_deps = { path = "../observability_deps" } panic_logging = { path = "../panic_logging" } @@ -47,6 +47,8 @@ clap = { version = "4", features = ["derive", "env"] } console-subscriber = { version = "0.1.8", optional = true, features = ["parking_lot"] } dotenvy = "0.15.5" futures = "0.3" +futures-util = { version = "0.3" } +flate2 = "1.0" hashbrown = "0.12" http = "0.2.8" humantime = "2.1.0" @@ -55,7 +57,7 @@ libc = { version = "0.2" } num_cpus = "1.13.0" once_cell = 
{ version = "1.15.0", features = ["parking_lot"] } rustyline = { version = "10.0", default-features = false } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" thiserror = "1.0.37" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } diff --git a/influxdb_iox/src/commands/sql/repl.rs b/influxdb_iox/src/commands/sql/repl.rs index cf4cb2c8dd..129367b906 100644 --- a/influxdb_iox/src/commands/sql/repl.rs +++ b/influxdb_iox/src/commands/sql/repl.rs @@ -53,7 +53,7 @@ pub enum Error { pub type Result = std::result::Result; enum QueryEngine { - /// Run queries against the named database on the remote server + /// Run queries against the namespace on the remote server Remote(String), /// Run queries against a local `Observer` instance @@ -177,7 +177,7 @@ pub struct Repl { /// Client for running sql flight_client: influxdb_iox_client::flight::Client, - /// database name against which SQL commands are run + /// namespace name against which SQL commands are run query_engine: Option, /// Formatter to use to format query results @@ -239,8 +239,8 @@ impl Repl { .map_err(|e| println!("{}", e)) .ok(); } - ReplCommand::UseDatabase { db_name } => { - self.use_database(db_name); + ReplCommand::UseNamespace { db_name } => { + self.use_namespace(db_name); } ReplCommand::SqlCommand { sql } => { self.run_sql(sql).await.map_err(|e| println!("{}", e)).ok(); @@ -302,18 +302,18 @@ impl Repl { self.print_results(&[record_batch]) } - // Run a command against the currently selected remote database + // Run a command against the currently selected remote namespace async fn run_sql(&mut self, sql: String) -> Result<()> { let start = Instant::now(); let batches = match &mut self.query_engine { None => { - println!("Error: no database selected."); - println!("Hint: Run USE DATABASE to select database"); + println!("Error: no namespace selected."); + println!("Hint: Run USE NAMESPACE to select namespace"); return Ok(()); } Some(QueryEngine::Remote(db_name)) => { - info!(%db_name, 
%sql, "Running sql on remote database"); + info!(%db_name, %sql, "Running sql on remote namespace"); scrape_query(&mut self.flight_client, db_name, &sql).await? } @@ -349,9 +349,9 @@ impl Repl { } } - fn use_database(&mut self, db_name: String) { - info!(%db_name, "setting current database"); - println!("You are now in remote mode, querying database {}", db_name); + fn use_namespace(&mut self, db_name: String) { + info!(%db_name, "setting current namespace"); + println!("You are now in remote mode, querying namespace {}", db_name); self.set_query_engine(QueryEngine::Remote(db_name)); } diff --git a/influxdb_iox/src/commands/sql/repl_command.rs b/influxdb_iox/src/commands/sql/repl_command.rs index 37fa4fb843..56f310ed7f 100644 --- a/influxdb_iox/src/commands/sql/repl_command.rs +++ b/influxdb_iox/src/commands/sql/repl_command.rs @@ -7,7 +7,7 @@ pub enum ReplCommand { ShowNamespaces, Observer, SetFormat { format: String }, - UseDatabase { db_name: String }, + UseNamespace { db_name: String }, SqlCommand { sql: String }, Exit, } @@ -64,18 +64,18 @@ impl TryFrom<&str> for ReplCommand { ["observer"] => Ok(Self::Observer), ["exit"] => Ok(Self::Exit), ["quit"] => Ok(Self::Exit), - ["use", "database"] => { - Err("name not specified. Usage: USE DATABASE ".to_string()) - } // USE DATABASE - ["use", "database", _name] => { - // USE DATABASE - Ok(Self::UseDatabase { + ["use", "namespace"] => { + Err("name not specified. 
Usage: USE NAMESPACE ".to_string()) + } // USE NAMESPACE + ["use", "namespace", _name] => { + // USE namespace + Ok(Self::UseNamespace { db_name: raw_commands[2].to_string(), }) } ["use", _command] => { // USE - Ok(Self::UseDatabase { + Ok(Self::UseNamespace { db_name: raw_commands[1].to_string(), }) } @@ -98,9 +98,9 @@ impl ReplCommand { Available commands (not case sensitive): HELP (this one) -SHOW NAMESPACES: List databases available on the server +SHOW NAMESPACES: List namespaces available on the server -USE [DATABASE|NAMESPACE] : Set the current remote database to name +USE NAMESPACE : Set the current remote namespace to name SET FORMAT : Set the output format to Pretty, csv or json @@ -108,9 +108,9 @@ OBSERVER: Locally query unified queryable views of remote system tables [EXIT | QUIT]: Quit this session and exit the program -# Examples: use remote database foo -SHOW DATABASES; -USE DATABASE foo; +# Examples: use remote namespace foo +SHOW NAMESPACES; +USE foo; # Basic IOx SQL Primer @@ -199,35 +199,35 @@ mod tests { } #[test] - fn use_database() { - let expected = Ok(ReplCommand::UseDatabase { + fn use_namespace() { + let expected = Ok(ReplCommand::UseNamespace { db_name: "Foo".to_string(), }); assert_eq!("use Foo".try_into(), expected); - assert_eq!("use Database Foo;".try_into(), expected); - assert_eq!("use Database Foo ;".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); - assert_eq!(" use Database Foo; ".try_into(), expected); + assert_eq!("use Namespace Foo;".try_into(), expected); + assert_eq!("use Namespace Foo ;".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); + assert_eq!(" use Namespace Foo; ".try_into(), expected); - // ensure that database name is case sensitive - let expected = Ok(ReplCommand::UseDatabase { + // ensure that namespace name is case sensitive + let expected = Ok(ReplCommand::UseNamespace { db_name: "FOO".to_string(), }); assert_eq!("use FOO".try_into(), expected); - 
assert_eq!("use DATABASE FOO;".try_into(), expected); - assert_eq!("USE DATABASE FOO;".try_into(), expected); + assert_eq!("use NAMESPACE FOO;".try_into(), expected); + assert_eq!("USE NAMESPACE FOO;".try_into(), expected); let expected: Result = - Err("name not specified. Usage: USE DATABASE ".to_string()); - assert_eq!("use Database;".try_into(), expected); - assert_eq!("use DATABASE".try_into(), expected); - assert_eq!("use database".try_into(), expected); + Err("name not specified. Usage: USE NAMESPACE ".to_string()); + assert_eq!("use Namespace;".try_into(), expected); + assert_eq!("use NAMESPACE".try_into(), expected); + assert_eq!("use namespace".try_into(), expected); - let expected = sql_cmd("use database foo bar"); - assert_eq!("use database foo bar".try_into(), expected); + let expected = sql_cmd("use namespace foo bar"); + assert_eq!("use namespace foo bar".try_into(), expected); - let expected = sql_cmd("use database foo BAR"); - assert_eq!("use database foo BAR".try_into(), expected); + let expected = sql_cmd("use namespace foo BAR"); + assert_eq!("use namespace foo BAR".try_into(), expected); } #[test] diff --git a/influxdb_iox/src/commands/write.rs b/influxdb_iox/src/commands/write.rs index e5aff6bd88..857a81b320 100644 --- a/influxdb_iox/src/commands/write.rs +++ b/influxdb_iox/src/commands/write.rs @@ -1,6 +1,14 @@ +use futures::StreamExt; use influxdb_iox_client::{connection::Connection, write}; -use snafu::{ResultExt, Snafu}; -use std::{fs::File, io::Read, path::PathBuf}; +use observability_deps::tracing::info; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::{ + fs::File, + io::{BufReader, Read}, + num::NonZeroUsize, + path::PathBuf, + time::Instant, +}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -11,10 +19,30 @@ pub enum Error { source: std::io::Error, }, + #[snafu(display("Error reading files: {:#?}", sources))] + ReadingFiles { sources: Vec }, + #[snafu(display("Client error: {source}"))] ClientError { 
source: influxdb_iox_client::error::Error, }, + + #[snafu(display("Error converting parquet: {}", source))] + Conversion { + source: parquet_to_line_protocol::Error, + }, + + #[snafu(display("Line protocol was not valid utf8: {}", source))] + InvalidUtf8 { source: std::string::FromUtf8Error }, + + #[snafu(display("Error decoding gzip {:?}: {}", file_name, source))] + Gz { + file_name: PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Max concurrent uploads must be greater than zero"))] + MaxConcurrentUploadsVerfication, } pub type Result = std::result::Result; @@ -22,36 +50,176 @@ pub type Result = std::result::Result; /// Write data into the specified database #[derive(Debug, clap::Parser)] pub struct Config { + /// If specified, restricts the maximum amount of line protocol + /// sent per request to this many bytes. Defaults to 1MB + #[clap(action, long, short = 'b', default_value = "1048576")] + max_request_payload_size_bytes: usize, + + /// Uploads up to this many http requests at a time. Defaults to 10 + #[clap(action, long, short = 'c', default_value = "10")] + max_concurrent_uploads: usize, + /// The namespace into which to write #[clap(action)] namespace: String, - /// File with data to load. Currently supported formats are .lp
Currently supported formats are .lp (line protocol), + /// .parquet (IOx created parquet files), and .gz (gzipped line protocol) #[clap(action)] - file_name: PathBuf, + file_names: Vec, } pub async fn command(connection: Connection, config: Config) -> Result<()> { + let start = Instant::now(); + let Config { namespace, - file_name, + file_names, + max_request_payload_size_bytes, + max_concurrent_uploads, } = config; - let file_name = &file_name; - let mut file = File::open(file_name).context(ReadingFileSnafu { file_name })?; + let max_concurrent_uploads = + NonZeroUsize::new(max_concurrent_uploads).context(MaxConcurrentUploadsVerficationSnafu)?; - let mut lp_data = String::new(); - file.read_to_string(&mut lp_data) - .context(ReadingFileSnafu { file_name })?; + info!( + num_files = file_names.len(), + max_request_payload_size_bytes, max_concurrent_uploads, "Beginning upload" + ); - let mut client = write::Client::new(connection); + // first pass is to check that all the files exist and can be + // opened and if not fail fast. + let file_open_errors: Vec<_> = file_names + .iter() + .filter_map(|file_name| { + File::open(file_name) + .context(ReadingFileSnafu { file_name }) + .err() + }) + .collect(); + + ensure!( + file_open_errors.is_empty(), + ReadingFilesSnafu { + sources: file_open_errors + } + ); + + // if everything looked good, go through and read the files out + // them potentially in parallel. 
+ let lp_stream = futures_util::stream::iter(file_names) + .map(|file_name| tokio::task::spawn(slurp_file(file_name))) + // Since the contents of each file are buffered into a string, + // limit the number that are open at once to the maximum + // possible uploads + .buffered(max_concurrent_uploads.into()) + // warn and skip any errors + .filter_map(|res| async move { + match res { + Ok(Ok(lp_data)) => Some(lp_data), + Ok(Err(e)) => { + eprintln!("WARNING: ignoring error : {}", e); + None + } + Err(e) => { + eprintln!("WARNING: ignoring task fail: {}", e); + None + } + } + }); + + let mut client = write::Client::new(connection) + .with_max_concurrent_uploads(max_concurrent_uploads) + .with_max_request_payload_size_bytes(Some(max_request_payload_size_bytes)); let total_bytes = client - .write_lp(namespace, lp_data) + .write_lp_stream(namespace, lp_stream) .await .context(ClientSnafu)?; - println!("{} Bytes OK", total_bytes); + let elapsed = Instant::now() - start; + let mb = (total_bytes as f64) / (1024.0 * 1024.0); + let mb_per_sec = (mb / (elapsed.as_millis() as f64)) * (1000.0); + println!("{total_bytes} Bytes OK in {elapsed:?}. 
{mb_per_sec:.2} MB/sec"); Ok(()) } + +/// Reads the contents of `file_name into a string +/// +/// .parquet files --> iox parquet files (convert to parquet) +/// .gz --> treated as gzipped line protocol +/// .lp (or anything else) --> treated as raw line protocol +/// +async fn slurp_file(file_name: PathBuf) -> Result { + let file_name = &file_name; + + let extension = file_name + .extension() + .map(|extension| extension.to_ascii_lowercase()); + + match extension { + // Transform parquet to line protocol prior to upload + // Not the most efficient process, but it is expedient + Some(extension) if extension.to_string_lossy() == "parquet" => { + let mut lp_data = vec![]; + parquet_to_line_protocol::convert_file(file_name, &mut lp_data) + .await + .context(ConversionSnafu)?; + + let lp_data = String::from_utf8(lp_data).context(InvalidUtf8Snafu)?; + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from parquet file" + ); + Ok(lp_data) + } + // decompress as gz + Some(extension) if extension.to_string_lossy() == "gz" => { + let mut lp_data = String::new(); + let reader = + BufReader::new(File::open(&file_name).context(ReadingFileSnafu { file_name })?); + + flate2::read::GzDecoder::new(reader) + .read_to_string(&mut lp_data) + .context(GzSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol from gzipped line protocol file" + ); + Ok(lp_data) + } + // anything else, treat as line protocol + Some(_) | None => { + let lp_data = + std::fs::read_to_string(file_name).context(ReadingFileSnafu { file_name })?; + + info!( + ?file_name, + file_size_bytes = lp_data.len(), + "Buffered line protocol file" + ); + Ok(lp_data) + } + } +} + +#[cfg(test)] +mod test { + use clap::Parser; + use influxdb_iox_client::write::DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES; + + use super::*; + + #[test] + fn command_default_is_same_as_client_default() { + let config = Config::try_parse_from(vec!["my_db", 
"file1"]).unwrap(); + assert_eq!( + Some(config.max_request_payload_size_bytes), + DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES + ); + } +} diff --git a/influxdb_iox/tests/end_to_end_cases/cli.rs b/influxdb_iox/tests/end_to_end_cases/cli.rs index 89f868cae8..941a7437ee 100644 --- a/influxdb_iox/tests/end_to_end_cases/cli.rs +++ b/influxdb_iox/tests/end_to_end_cases/cli.rs @@ -6,7 +6,6 @@ use predicates::prelude::*; use serde_json::Value; use std::time::{Duration, Instant}; use tempfile::tempdir; -use test_helpers::make_temp_file; use test_helpers_end_to_end::{ maybe_skip_integration, AddAddrEnv, BindAddresses, MiniCluster, ServerType, Step, StepTest, StepTestState, @@ -526,9 +525,6 @@ async fn write_and_query() { vec![ Step::Custom(Box::new(|state: &mut StepTestState| { async { - // write line protocol to a temp file - let lp_file = make_temp_file("m,tag=1 v=2 12345"); - let lp_file_path = lp_file.path().to_string_lossy().to_string(); let router_addr = state.cluster().router().router_http_base().to_string(); let namespace = state.cluster().namespace(); @@ -537,53 +533,48 @@ async fn write_and_query() { // Validate the output of the schema CLI command Command::cargo_bin("influxdb_iox") .unwrap() + .arg("-v") .arg("-h") .arg(&router_addr) .arg("write") .arg(&namespace) - .arg(&lp_file_path) + // raw line protocol ('h2o_temperature' measurement) + .arg("../test_fixtures/lineproto/air_and_water.lp") + // gzipped line protocol ('m0') + .arg("../test_fixtures/lineproto/read_filter.lp.gz") + // iox formatted parquet ('cpu' measurement) + .arg("../test_fixtures/cpu.parquet") .assert() .success() - .stdout(predicate::str::contains("17 Bytes OK")); + // this number is the total size of + // uncompressed line protocol stored in all + // three files + .stdout(predicate::str::contains("1137058 Bytes OK")); } .boxed() })), Step::Custom(Box::new(|state: &mut StepTestState| { async { - let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); - let namespace = 
state.cluster().namespace(); + // data from 'air_and_water.lp' + wait_for_query_result( + state, + "SELECT * from h2o_temperature order by time desc limit 10", + "| 51.3 | coyote_creek | CA | 55.1 | 1970-01-01T00:00:01.568756160Z |" + ).await; - let max_wait_time = Duration::from_secs(10); - let expected = "| 1 | 1970-01-01T00:00:00.000012345Z | 2 |"; - println!("Waiting for {expected}"); + // data from 'read_filter.lp.gz' + wait_for_query_result( + state, + "SELECT * from m0 order by time desc limit 10;", + "| value1 | value9 | value9 | value49 | value0 | 2021-04-26T13:47:39.727574Z | 1 |" + ).await; - // Validate the output of running the query CLI command appears after at most max_wait_time - let end = Instant::now() + max_wait_time; - while Instant::now() < end { - let maybe_result = Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("-h") - .arg(&querier_addr) - .arg("query") - .arg(&namespace) - .arg("SELECT * from m") - .assert() - .success() - .try_stdout(predicate::str::contains(expected)); - - match maybe_result { - Err(e) => { - println!("Got err: {}, retrying", e); - } - Ok(r) => { - println!("Success: {:?}", r); - return; - } - } - // sleep and try again - tokio::time::sleep(Duration::from_millis(500)).await - } - panic!("Did not find expected output in allotted time"); + // data from 'cpu.parquet' + wait_for_query_result( + state, + "SELECT * from cpu where cpu = 'cpu2' order by time desc limit 10", + "cpu2 | MacBook-Pro-8.hsd1.ma.comcast.net | 2022-09-30T12:55:00Z" + ).await; } .boxed() })), @@ -593,6 +584,53 @@ async fn write_and_query() { .await } +/// Runs the specified query in a loop for up to 10 seconds, waiting +/// for the specified output to appear +async fn wait_for_query_result(state: &mut StepTestState<'_>, query_sql: &str, expected: &str) { + let querier_addr = state.cluster().querier().querier_grpc_base().to_string(); + let namespace = state.cluster().namespace(); + + let max_wait_time = Duration::from_secs(10); + println!("Waiting 
for {expected}"); + + // Validate the output of running the query CLI command appears after at most max_wait_time + let end = Instant::now() + max_wait_time; + while Instant::now() < end { + let assert = Command::cargo_bin("influxdb_iox") + .unwrap() + .arg("-h") + .arg(&querier_addr) + .arg("query") + .arg(&namespace) + .arg(query_sql) + .assert(); + + let assert = match assert.try_success() { + Err(e) => { + println!("Got err running command: {}, retrying", e); + continue; + } + Ok(a) => a, + }; + + match assert.try_stdout(predicate::str::contains(expected)) { + Err(e) => { + println!("No match: {}, retrying", e); + } + Ok(r) => { + println!("Success: {:?}", r); + return; + } + } + // sleep and try again + tokio::time::sleep(Duration::from_secs(1)).await + } + panic!( + "Did not find expected output {} within {:?}", + expected, max_wait_time + ); +} + /// Test the schema cli command #[tokio::test] async fn namespaces_cli() { diff --git a/influxdb_iox/tests/end_to_end_cases/ingester.rs b/influxdb_iox/tests/end_to_end_cases/ingester.rs index 07ecd8fbbe..edf93bb305 100644 --- a/influxdb_iox/tests/end_to_end_cases/ingester.rs +++ b/influxdb_iox/tests/end_to_end_cases/ingester.rs @@ -52,7 +52,6 @@ async fn ingester_flight_api() { partition_id, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None }) }, ); diff --git a/influxdb_iox/tests/end_to_end_cases/querier.rs b/influxdb_iox/tests/end_to_end_cases/querier.rs index d5f1cfbe0e..b189098a64 100644 --- a/influxdb_iox/tests/end_to_end_cases/querier.rs +++ b/influxdb_iox/tests/end_to_end_cases/querier.rs @@ -7,7 +7,8 @@ use futures::FutureExt; use predicates::prelude::*; use test_helpers::assert_contains; use test_helpers_end_to_end::{ - maybe_skip_integration, run_query, MiniCluster, Step, StepTest, StepTestState, TestConfig, + maybe_skip_integration, run_query, try_run_query, GrpcRequestBuilder, MiniCluster, Step, + StepTest, StepTestState, TestConfig, }; #[tokio::test] 
@@ -454,6 +455,87 @@ async fn issue_4631_b() { .await } +#[tokio::test] +async fn oom_protection() { + test_helpers::maybe_start_logging(); + let database_url = maybe_skip_integration!(); + + let table_name = "the_table"; + + // Set up the cluster ==================================== + let router_config = TestConfig::new_router(&database_url); + let ingester_config = TestConfig::new_ingester(&router_config); + let querier_config = + TestConfig::new_querier(&ingester_config).with_querier_max_table_query_bytes(1); + let mut cluster = MiniCluster::new() + .with_router(router_config) + .await + .with_ingester(ingester_config) + .await + .with_querier(querier_config) + .await; + + StepTest::new( + &mut cluster, + vec![ + Step::WriteLineProtocol(format!("{},tag1=A,tag2=B val=42i 123457", table_name)), + Step::WaitForReadable, + Step::AssertNotPersisted, + // SQL query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let sql = format!("select * from {}", table_name); + let err = try_run_query( + sql, + state.cluster().namespace(), + state.cluster().querier().querier_grpc_connection(), + ) + .await + .unwrap_err(); + + if let influxdb_iox_client::flight::Error::GrpcError(status) = err { + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } else { + panic!("Not a gRPC error: {err}"); + } + } + .boxed() + })), + // InfluxRPC/storage query + Step::Custom(Box::new(move |state: &mut StepTestState| { + async move { + let mut storage_client = state.cluster().querier_storage_client(); + + let read_filter_request = GrpcRequestBuilder::new() + .source(state.cluster()) + .build_read_filter(); + + let status = storage_client + .read_filter(read_filter_request) + .await + .unwrap_err(); + assert_eq!( + status.code(), + tonic::Code::ResourceExhausted, + "Wrong status code: {}\n\nStatus:\n{}", + status.code(), + status, + ); + } + .boxed() + })), + ], + ) + .run() + .await 
+} + /// This structure holds information for tests that need to force a parquet file to be persisted struct ForcePersistenceSetup { // Set up a cluster that will will persist quickly diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 9b674c4a33..42a886d98c 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -13,6 +13,7 @@ format = ["arrow", "arrow_util"] # Workspace dependencies, in alphabetical order arrow_util = { path = "../arrow_util", optional = true } client_util = { path = "../client_util" } +influxdb_line_protocol = { path = "../influxdb_line_protocol"} generated_types = { path = "../generated_types", default-features = false, features = ["data_types_conversions"] } # Crates.io dependencies, in alphabetical order @@ -23,9 +24,7 @@ futures-util = { version = "0.3", optional = true } prost = "0.11" rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } +tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } +tokio-stream = "0.1.11" thiserror = "1.0.37" tonic = { version = "0.8" } - -[dev-dependencies] # In alphabetical order -tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread"] } -mockito = "0.31" \ No newline at end of file diff --git a/influxdb_iox_client/src/client/write.rs b/influxdb_iox_client/src/client/write.rs index 1ee584d8a0..4771970f11 100644 --- a/influxdb_iox_client/src/client/write.rs +++ b/influxdb_iox_client/src/client/write.rs @@ -1,15 +1,16 @@ -/// Re-export generated_types -pub mod generated_types { - pub use generated_types::influxdata::pbdata::v1::*; -} +use std::{fmt::Debug, num::NonZeroUsize, sync::Arc}; use client_util::{connection::HttpConnection, namespace_translation::split_namespace}; +use futures_util::{future::BoxFuture, FutureExt, Stream, StreamExt, TryStreamExt}; use crate::{ connection::Connection, error::{translate_response, Error}, }; -use 
reqwest::Method; +use reqwest::{Body, Method}; + +/// The default value for the maximum size of each request, in bytes +pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option = Some(1024 * 1024); /// An IOx Write API client. /// @@ -37,18 +38,67 @@ use reqwest::Method; /// ``` #[derive(Debug, Clone)] pub struct Client { - inner: HttpConnection, + /// The inner client used to actually make requests. + /// + /// Uses a trait for test mocking. + /// + /// Does not expose the trait in the `Client` type to avoid + /// exposing an internal implementation detail (the trait) in the + /// public interface. + inner: Arc, + + /// If `Some`, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict + /// the amount sent per request. Defaults to `Some(1MB)` + /// + /// Splitting the upload size consumes a non trivial amount of CPU + /// to find line protocol boundaries. This can be disabled by + /// setting `max_request_payload_size_bytes` to `None`. + max_request_payload_size_bytes: Option, + + /// Makes this many concurrent requests at a time. Defaults to 1 + max_concurrent_uploads: NonZeroUsize, } impl Client { /// Creates a new client with the provided connection pub fn new(connection: Connection) -> Self { + Self::new_with_maker(Arc::new(connection.into_http_connection())) + } + + /// Creates a new client with the provided request maker + fn new_with_maker(inner: Arc) -> Self { Self { - inner: connection.into_http_connection(), + inner, + max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES, + max_concurrent_uploads: NonZeroUsize::new(1).unwrap(), } } - /// Write the [LineProtocol] formatted data in `lp_data` to + /// Override the default of sending 1MB of line protocol per request. + /// If `Some` is specified, restricts the maximum amount of line protocol + /// sent per request to this many bytes. If `None`, does not restrict the amount of + /// line protocol sent per request. 
+ pub fn with_max_request_payload_size_bytes( + self, + max_request_payload_size_bytes: Option, + ) -> Self { + Self { + max_request_payload_size_bytes, + ..self + } + } + + /// The client makes this many concurrent uploads at a + /// time. Defaults to 1. + pub fn with_max_concurrent_uploads(self, max_concurrent_uploads: NonZeroUsize) -> Self { + Self { + max_concurrent_uploads, + ..self + } + } + + /// Write the [LineProtocol] formatted string in `lp_data` to /// namespace `namespace`. /// /// Returns the number of bytes which were written to the database @@ -59,11 +109,24 @@ impl Client { namespace: impl AsRef + Send, lp_data: impl Into + Send, ) -> Result { - let lp_data = lp_data.into(); - let data_len = lp_data.len(); + let sources = futures_util::stream::iter([lp_data.into()]); - let write_url = format!("{}api/v2/write", self.inner.uri()); + self.write_lp_stream(namespace, sources).await + } + /// Write the stream of [LineProtocol] formatted strings in + /// `sources` to namespace `namespace`. 
It is assumed that + /// individual lines (points) do not cross these strings + /// + /// Returns the number of bytes, in total, which were written to + /// the database + /// + /// [LineProtocol]: https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/#data-types-and-format + pub async fn write_lp_stream( + &mut self, + namespace: impl AsRef + Send, + sources: impl Stream + Send, + ) -> Result { let (org_id, bucket_id) = split_namespace(namespace.as_ref()).map_err(|e| { Error::invalid_argument( "namespace", @@ -71,47 +134,302 @@ impl Client { ) })?; - let response = self - .inner - .client() - .request(Method::POST, &write_url) - .query(&[("bucket", bucket_id), ("org", org_id)]) - .body(lp_data) - .send() + let max_concurrent_uploads: usize = self.max_concurrent_uploads.into(); + let max_request_payload_size_bytes = self.max_request_payload_size_bytes; + + // make a stream and process in parallel + let results = sources + // split each input source in parallel, if possible + .flat_map(|source| { + split_lp( + source, + max_request_payload_size_bytes, + max_concurrent_uploads, + ) + }) + // do the actual write + .map(|source| { + let org_id = org_id.to_string(); + let bucket_id = bucket_id.to_string(); + let inner = Arc::clone(&self.inner); + + tokio::task::spawn( + async move { inner.write_source(org_id, bucket_id, source).await }, + ) + }) + // Do the uploads in parallel + .buffered(max_concurrent_uploads) + .try_collect::>() + // handle panics in tasks .await - .map_err(Error::client)?; + .map_err(Error::client)? + // find / return any errors + .into_iter() + .collect::, Error>>()?; - translate_response(response).await?; + Ok(results.into_iter().sum()) + } +} - Ok(data_len) +/// Something that knows how to send http data. 
Exists so it can be +/// mocked out for testing +trait RequestMaker: Debug + Send + Sync { + /// Write the body data to the specified org, bucket, and + /// returning the number of bytes written + /// + /// (this is implemented manually to avoid `async_trait`) + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result>; +} + +impl RequestMaker for HttpConnection { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let write_url = format!("{}api/v2/write", self.uri()); + + async move { + let body: Body = body.into(); + + let data_len = body.as_bytes().map(|b| b.len()).unwrap_or(0); + + let response = self + .client() + .request(Method::POST, &write_url) + .query(&[("bucket", bucket_id), ("org", org_id)]) + .body(body) + .send() + .await + .map_err(Error::client)?; + + translate_response(response).await?; + + Ok(data_len) + } + .boxed() + } +} + +/// splits input line protocol into one or more sizes of at most +/// `max_chunk` on line breaks in a separte tokio task +fn split_lp( + input: String, + max_chunk_size: Option, + max_concurrent_uploads: usize, +) -> impl Stream { + let (tx, rx) = tokio::sync::mpsc::channel(max_concurrent_uploads); + + tokio::task::spawn(async move { + match max_chunk_size { + None => { + // ignore errors (means the receiver hung up but nothing to communicate + tx.send(input).await.ok(); + } + Some(max_chunk_size) => { + // use the actual line protocol parser to split on valid boundaries + let mut acc = LineAccumulator::new(max_chunk_size); + for l in influxdb_line_protocol::split_lines(&input) { + if let Some(chunk) = acc.push(l) { + // abort if receiver has hungup + if tx.send(chunk).await.is_err() { + return; + } + } + } + if let Some(chunk) = acc.flush() { + tx.send(chunk).await.ok(); + } + } + } + }); + + tokio_stream::wrappers::ReceiverStream::new(rx) +} +#[derive(Debug)] +struct LineAccumulator { + current_chunk: 
String, + max_chunk_size: usize, +} + +impl LineAccumulator { + fn new(max_chunk_size: usize) -> Self { + Self { + current_chunk: String::with_capacity(max_chunk_size), + max_chunk_size, + } + } + + // Add data `l` to the current chunk being created, returning the + // current chunk if complete. + fn push(&mut self, l: &str) -> Option { + let chunk = if self.current_chunk.len() + l.len() + 1 > self.max_chunk_size { + self.flush() + } else { + None + }; + + if !self.current_chunk.is_empty() { + self.current_chunk += "\n"; + } + + self.current_chunk += l; + chunk + } + + /// allocate a new chunk with the right size, returning the currently built chunk if it has non zero length + /// `self.current_chunk.len()` is zero + fn flush(&mut self) -> Option { + if !self.current_chunk.is_empty() { + let mut new_chunk = String::with_capacity(self.max_chunk_size); + std::mem::swap(&mut new_chunk, &mut self.current_chunk); + Some(new_chunk) + } else { + None + } } } #[cfg(test)] mod tests { + use std::sync::Mutex; + use super::*; - use crate::connection::Builder; #[tokio::test] - /// Ensure the basic plumbing is hooked up correctly - async fn basic() { - let url = mockito::server_url(); - - let connection = Builder::new().build(&url).await.unwrap(); + async fn test() { + let mock = Arc::new(MockRequestMaker::new()); let namespace = "orgname_bucketname"; let data = "m,t=foo f=4"; - let m = mockito::mock("POST", "/api/v2/write?bucket=bucketname&org=orgname") - .with_status(201) - .match_body(data) - .create(); + let expected = vec![MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: data.into(), + }]; - let res = Client::new(connection).write_lp(namespace, data).await; - - m.assert(); - - let num_bytes = res.expect("Error making write request"); + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); assert_eq!(num_bytes, 11); } + + #[tokio::test] + 
async fn test_max_request_payload_size() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = "m,t=foo f=4\n\ + m,t=bar f=3\n\ + m,t=fooddddddd f=4"; + + // expect the data to be broken up into two chunks: + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4\nm,t=bar f=3".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=fooddddddd f=4".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + // enough to get first two lines, but not last + .with_max_request_payload_size_bytes(Some(30)) + .write_lp(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 41); + } + + #[tokio::test] + async fn test_write_lp_stream() { + let mock = Arc::new(MockRequestMaker::new()); + + let namespace = "orgname_bucketname"; + let data = futures_util::stream::iter( + vec!["m,t=foo f=4", "m,t=bar f=3"] + .into_iter() + .map(|s| s.to_string()), + ); + + // expect the data to come in two chunks + let expected = vec![ + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=foo f=4".into(), + }, + MockRequest { + org_id: "orgname".into(), + bucket_id: "bucketname".into(), + body: "m,t=bar f=3".into(), + }, + ]; + + let num_bytes = Client::new_with_maker(Arc::clone(&mock) as _) + .write_lp_stream(namespace, data) + .await + .unwrap(); + assert_eq!(expected, mock.requests()); + assert_eq!(num_bytes, 22); + } + + #[derive(Debug, Clone, PartialEq)] + struct MockRequest { + org_id: String, + bucket_id: String, + body: String, + } + + #[derive(Debug)] + struct MockRequestMaker { + requests: Mutex>, + } + + impl MockRequestMaker { + fn new() -> Self { + Self { + requests: Mutex::new(vec![]), + } + } + + /// get a copy of the requests that were made using this mock + fn requests(&self) -> Vec { + 
self.requests.lock().unwrap().clone() + } + } + + impl RequestMaker for MockRequestMaker { + fn write_source( + &self, + org_id: String, + bucket_id: String, + body: String, + ) -> BoxFuture<'_, Result> { + let sz = body.len(); + + self.requests.lock().unwrap().push(MockRequest { + org_id, + bucket_id, + body, + }); + + async move { Ok(sz) }.boxed() + } + } } diff --git a/influxdb_line_protocol/Cargo.toml b/influxdb_line_protocol/Cargo.toml index f82103288d..aae56dd1db 100644 --- a/influxdb_line_protocol/Cargo.toml +++ b/influxdb_line_protocol/Cargo.toml @@ -14,7 +14,7 @@ ffi = ["libc"] bytes = "1.2" libc = { version = "0.2", optional = true } nom = { version = "7", default-features = false, features = ["std"] } -smallvec = { version = "1.9.0", features = ["union"] } +smallvec = { version = "1.10.0", features = ["union"] } snafu = "0.7" observability_deps = { path = "../observability_deps" } workspace-hack = { path = "../workspace-hack"} diff --git a/influxdb_line_protocol/src/lib.rs b/influxdb_line_protocol/src/lib.rs index 07d9ca14ea..91c1c2077d 100644 --- a/influxdb_line_protocol/src/lib.rs +++ b/influxdb_line_protocol/src/lib.rs @@ -529,7 +529,7 @@ pub fn parse_lines(input: &str) -> impl Iterator>> /// logic duplication for scanning fields, duplicating it also means /// we can be more sure of the compatibility of the rust parser and /// the canonical Go parser. 
-fn split_lines(input: &str) -> impl Iterator { +pub fn split_lines(input: &str) -> impl Iterator { // NB: This is ported as closely as possibly from the original Go code: let mut quoted = false; let mut fields = false; diff --git a/influxrpc_parser/Cargo.toml b/influxrpc_parser/Cargo.toml index 152c099d2d..80a8496db6 100644 --- a/influxrpc_parser/Cargo.toml +++ b/influxrpc_parser/Cargo.toml @@ -4,8 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -sqlparser = "0.24.0" -snafu = "0.7.1" +sqlparser = "0.25.0" +snafu = "0.7.2" generated_types = { path = "../generated_types" } workspace-hack = { path = "../workspace-hack"} \ No newline at end of file diff --git a/ingester/Cargo.toml b/ingester/Cargo.toml index beb94c37e9..b12ed95c1d 100644 --- a/ingester/Cargo.toml +++ b/ingester/Cargo.toml @@ -24,7 +24,7 @@ iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch"} mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } @@ -45,6 +45,7 @@ write_buffer = { path = "../write_buffer" } write_summary = { path = "../write_summary" } tokio-util = { version = "0.7.4" } trace = { path = "../trace" } +rand = "0.8.5" [dev-dependencies] assert_matches = "1.5.0" @@ -52,4 +53,4 @@ bitflags = {version = "1.3.2"} once_cell = "1" paste = "1.0.9" test_helpers = { path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = {version = "0.1.10", default_features = false } +tokio-stream = {version = "0.1.11", default_features = false } diff --git a/ingester/src/compact.rs b/ingester/src/compact.rs index 040a1c983c..8a280cc751 100644 --- a/ingester/src/compact.rs +++ b/ingester/src/compact.rs @@ -18,7 +18,7 @@ use crate::{data::partition::PersistingBatch, query::QueryableBatch}; #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, 
missing_docs)] -pub enum Error { +pub(crate) enum Error { #[snafu(display("Error while building logical plan for Ingester's compaction"))] LogicalPlan { source: iox_query::frontend::reorg::Error, @@ -86,11 +86,8 @@ pub(crate) async fn compact_persisting_batch( namespace_id: i64, partition_info: &PartitionInfo, batch: Arc, -) -> Result> { - // Nothing to compact - if batch.data.data.is_empty() { - return Ok(None); - } +) -> Result { + assert!(!batch.data.data.is_empty()); let namespace_name = &partition_info.namespace_name; let table_name = &partition_info.table_name; @@ -141,11 +138,11 @@ pub(crate) async fn compact_persisting_batch( sort_key: Some(metadata_sort_key), }; - Ok(Some(CompactedStream { + Ok(CompactedStream { stream, iox_metadata, sort_key_update, - })) + }) } /// Compact a given Queryable Batch @@ -192,8 +189,8 @@ mod tests { create_batches_with_influxtype_same_columns_different_type, create_one_record_batch_with_influxtype_duplicates, create_one_record_batch_with_influxtype_no_duplicates, - create_one_row_record_batch_with_influxtype, create_tombstone, make_meta, - make_persisting_batch, make_queryable_batch, make_queryable_batch_with_deletes, + create_one_row_record_batch_with_influxtype, make_meta, make_persisting_batch, + make_queryable_batch, }; // this test was added to guard against https://github.com/influxdata/influxdb_iox/issues/3782 @@ -226,7 +223,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -254,7 +250,6 @@ mod tests { let CompactedStream { stream, .. 
} = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -297,7 +292,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -328,7 +322,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -394,7 +387,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -426,7 +418,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -494,7 +485,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -527,7 +517,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -595,7 +584,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -629,7 +617,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -700,7 +687,6 @@ mod tests { partition_id, uuid, batches, - vec![], ); // verify PK @@ -739,7 +725,6 @@ mod tests { sort_key_update, } = compact_persisting_batch(time_provider, &exc, 1, &partition_info, persisting_batch) .await - .unwrap() .unwrap(); let output_batches = datafusion::physical_plan::common::collect(stream) @@ -825,54 +810,6 @@ mod tests { assert_batches_eq!(&expected, &output_batches); } - #[tokio::test] - async fn test_compact_one_batch_no_dupilcates_with_deletes() { - 
test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - // verify no empty record batches - bug #3782 - assert_eq!(output_batches.len(), 2); - assert_eq!(output_batches[0].num_rows(), 1); - assert_eq!(output_batches[1].num_rows(), 1); - - // verify compacted data - // row with "tag1=UT" no longer available - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - } - #[tokio::test] async fn test_compact_one_batch_with_duplicates() { // create input data @@ -1019,23 +956,12 @@ mod tests { } #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_deletes( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates() { // create many-batches input data let batches = 
create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - )]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1058,7 +984,6 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") are also removed // CORRECT RESULT let expected = vec![ "+-----------+------+------+--------------------------------+", @@ -1067,73 +992,15 @@ mod tests { "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", - "+-----------+------+------+--------------------------------+", - ]; - - assert_batches_eq!(&expected, &output_batches); - } - - #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes( - ) { - // create many-batches input data - let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, - 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete 
predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1!=MT", // delete predicate - ), - ]; - - // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - // verify PK - let schema = compact_batch.schema(); - let pk = schema.primary_key(); - let expected_pk = vec!["tag1", "tag2", "time"]; - assert_eq!(expected_pk, pk); - - let sort_key = compute_sort_key( - &schema, - compact_batch.data.iter().map(|sb| sb.data.as_ref()), - ); - assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"])); - - // compact - let exc = Executor::new(1); - let stream = compact(&exc, compact_batch, sort_key).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify compacted data - // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1!=MT") are also removed - let expected = vec![ - "+-----------+------+------+--------------------------------+", - "| field_int | tag1 | tag2 | time |", - "+-----------+------+------+--------------------------------+", - "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", - "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; @@ -1142,31 +1009,12 @@ mod tests { // BUG #[tokio::test] - async fn test_compact_many_batches_different_columns_different_order_with_duplicates_with_many_deletes_2( - ) { + async fn test_compact_many_batches_different_columns_different_order_with_duplicates2() { // create many-batches input data let batches = create_batches_with_influxtype_different_columns_different_order().await; - let tombstones = vec![ - create_tombstone( - 1, - 1, 
- 1, - 100, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag2=CT and field_int=1000", // delete predicate - ), - create_tombstone( - 1, 1, 1, 101, // delete's seq_number - 0, // min time of data to get deleted - 200000, // max time of data to get deleted - "tag1=MT", // delete predicate - ), - ]; // build queryable batch from the input batches - let compact_batch = - make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); + let compact_batch = make_queryable_batch("test_table", 0, 1, batches); // verify PK let schema = compact_batch.schema(); @@ -1189,29 +1037,22 @@ mod tests { // verify compacted data // data is sorted and all duplicates are removed - // all rows with ("tag2=CT and field_int=1000") and ("tag1=MT") are also removed - // CORRECT RESULT - // let expected = vec![ - // "+-----------+------+------+--------------------------------+", - // "| field_int | tag1 | tag2 | time |", - // "+-----------+------+------+--------------------------------+", - // "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", - // "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", - // "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", - // "| 100 | | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", - // "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", - // "+-----------+------+------+--------------------------------+", - // ]; - // current WRONMG result: "tag1 is null" is also eliminated let expected = vec![ "+-----------+------+------+--------------------------------+", "| field_int | tag1 | tag2 | time |", "+-----------+------+------+--------------------------------+", + "| 5 | | AL | 1970-01-01T00:00:00.000005Z |", + "| 10 | | AL | 1970-01-01T00:00:00.000007Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | CT | 1970-01-01T00:00:00.000001Z |", + "| 100 | | MA | 
1970-01-01T00:00:00.000000050Z |", "| 10 | AL | MA | 1970-01-01T00:00:00.000000050Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000100Z |", "| 70 | CT | CT | 1970-01-01T00:00:00.000000500Z |", + "| 30 | MT | AL | 1970-01-01T00:00:00.000000005Z |", + "| 20 | MT | AL | 1970-01-01T00:00:00.000007Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000001Z |", + "| 1000 | MT | CT | 1970-01-01T00:00:00.000002Z |", "+-----------+------+------+--------------------------------+", ]; diff --git a/ingester/src/data.rs b/ingester/src/data.rs index 7c4a48386f..d1ec7d39a2 100644 --- a/ingester/src/data.rs +++ b/ingester/src/data.rs @@ -1,15 +1,12 @@ //! Data for the lifecycle of the Ingester -use std::{collections::BTreeMap, pin::Pin, sync::Arc}; +use std::{collections::BTreeMap, sync::Arc}; -use arrow::{error::ArrowError, record_batch::RecordBatch}; -use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use async_trait::async_trait; use backoff::{Backoff, BackoffConfig}; -use data_types::{PartitionId, SequenceNumber, ShardId, ShardIndex}; -use datafusion::physical_plan::SendableRecordBatchStream; +use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, ShardIndex, TableId}; + use dml::DmlOperation; -use futures::{Stream, StreamExt}; use iox_catalog::interface::{get_table_schema_by_id, Catalog}; use iox_query::exec::Executor; use iox_time::SystemProvider; @@ -25,16 +22,12 @@ use crate::{ lifecycle::LifecycleHandle, }; -pub mod namespace; +pub(crate) mod namespace; pub mod partition; -mod query_dedup; -pub mod shard; -pub mod table; +pub(crate) mod shard; +pub(crate) mod table; -use self::{ - partition::{resolver::PartitionProvider, PartitionStatus}, - shard::ShardData, -}; +use self::{partition::resolver::PartitionProvider, shard::ShardData, table::TableName}; #[cfg(test)] mod triggers; @@ -51,9 +44,6 @@ pub enum Error { #[snafu(display("Table {} not found in buffer", table_name))] TableNotFound { table_name: String }, - #[snafu(display("Table must be 
specified in delete"))] - TableNotPresent, - #[snafu(display("Error accessing catalog: {}", source))] Catalog { source: iox_catalog::interface::Error, @@ -186,7 +176,7 @@ impl IngesterData { .get(&shard_id) .context(ShardNotFoundSnafu { shard_id })?; shard_data - .buffer_operation(dml_operation, &self.catalog, lifecycle_handle, &self.exec) + .buffer_operation(dml_operation, &self.catalog, lifecycle_handle) .await } @@ -220,7 +210,13 @@ impl IngesterData { #[async_trait] pub trait Persister: Send + Sync + 'static { /// Persits the partition ID. Will retry forever until it succeeds. - async fn persist(&self, partition_id: PartitionId); + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ); /// Updates the shard's `min_unpersisted_sequence_number` in the catalog. /// This number represents the minimum that might be unpersisted, which is the @@ -235,7 +231,69 @@ pub trait Persister: Send + Sync + 'static { #[async_trait] impl Persister for IngesterData { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + // lookup the state from the ingester data. If something isn't found, + // it's unexpected. Crash so someone can take a look. 
+ let shard_data = self + .shards + .get(&shard_id) + .unwrap_or_else(|| panic!("shard state for {shard_id} not in ingester data")); + let namespace = shard_data + .namespace_by_id(namespace_id) + .unwrap_or_else(|| panic!("namespace {namespace_id} not in shard {shard_id} state")); + + let partition_key; + let batch; + { + let table_data = namespace.table_id(table_id).unwrap_or_else(|| { + panic!("table {table_id} in namespace {namespace_id} not in shard {shard_id} state") + }); + + let mut guard = table_data.write().await; + let partition = guard.get_partition(partition_id).unwrap_or_else(|| { + panic!( + "partition {partition_id} in table {table_id} in namespace {namespace_id} not in shard {shard_id} state" + ) + }); + + partition_key = partition.partition_key().clone(); + batch = partition.snapshot_to_persisting_batch(); + }; + + debug!(%shard_id, %namespace_id, %table_id, %partition_id, %partition_key, "persisting partition"); + + // Check if there is any data to persist. + let batch = match batch { + Some(v) if !v.data.data.is_empty() => v, + _ => { + warn!( + %shard_id, + %namespace_id, + %table_id, + %partition_id, + %partition_key, + "partition marked for persistence contains no data" + ); + return; + } + }; + + // lookup column IDs from catalog + // TODO: this can be removed once the ingester uses column IDs internally as well + let table_schema = Backoff::new(&self.backoff_config) + .retry_all_errors("get table schema", || async { + let mut repos = self.catalog.repositories().await; + get_table_schema_by_id(table_id, repos.as_mut()).await + }) + .await + .expect("retry forever"); + // lookup the partition_info from the catalog let partition_info = Backoff::new(&self.backoff_config) .retry_all_errors("get partition_info_by_id", || async { @@ -243,217 +301,159 @@ impl Persister for IngesterData { repos.partitions().partition_info_by_id(partition_id).await }) .await - .expect("retry forever"); + .expect("retry forever").unwrap_or_else(|| panic!("partition 
{partition_id} in table {table_id} in namespace {namespace_id} in shard {shard_id} has no partition info in catalog")); - // lookup the state from the ingester data. If something isn't found, it's unexpected. Crash - // so someone can take a look. - let partition_info = partition_info - .unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id)); - let shard_data = self - .shards - .get(&partition_info.partition.shard_id) - .unwrap_or_else(|| { - panic!( - "shard state for {} not in ingester data", - partition_info.partition.shard_id - ) - }); //{ - let namespace = shard_data - .namespace(&partition_info.namespace_name) - .unwrap_or_else(|| { - panic!( - "namespace {} not in shard {} state", - partition_info.namespace_name, partition_info.partition.shard_id - ) - }); - debug!(?partition_id, ?partition_info, "persisting partition"); + // do the CPU intensive work of compaction, de-duplication and sorting + let CompactedStream { + stream: record_stream, + iox_metadata, + sort_key_update, + } = compact_persisting_batch( + Arc::new(SystemProvider::new()), + &self.exec, + namespace.namespace_id().get(), + &partition_info, + Arc::clone(&batch), + ) + .await + .expect("unable to compact persisting batch"); - // lookup column IDs from catalog - // TODO: this can be removed once the ingester uses column IDs internally as well - let table_schema = Backoff::new(&self.backoff_config) - .retry_all_errors("get table schema", || async { - let mut repos = self.catalog.repositories().await; - let table = repos - .tables() - .get_by_namespace_and_name(namespace.namespace_id(), &partition_info.table_name) - .await? - .expect("table not found in catalog"); - get_table_schema_by_id(table.id, repos.as_mut()).await - }) + // Save the compacted data to a parquet file in object storage. + // + // This call retries until it completes. 
+ let (md, file_size) = self + .store + .upload(record_stream, &iox_metadata) .await - .expect("retry forever"); + .expect("unexpected fatal persist error"); - let persisting_batch = namespace - .snapshot_to_persisting( - &partition_info.table_name, - &partition_info.partition.partition_key, - ) - .await; - - if let Some(persisting_batch) = persisting_batch { - // do the CPU intensive work of compaction, de-duplication and sorting - let compacted_stream = match compact_persisting_batch( - Arc::new(SystemProvider::new()), - &self.exec, - namespace.namespace_id().get(), - &partition_info, - Arc::clone(&persisting_batch), - ) - .await - { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to compact persisting batch with error: {:?}", e); - } - Ok(Some(r)) => r, - Ok(None) => { - warn!("persist called with no data"); - return; - } - }; - let CompactedStream { - stream: record_stream, - iox_metadata, - sort_key_update, - } = compacted_stream; - - // Save the compacted data to a parquet file in object storage. - // - // This call retries until it completes. - let (md, file_size) = self - .store - .upload(record_stream, &iox_metadata) - .await - .expect("unexpected fatal persist error"); - - // Update the sort key in the catalog if there are - // additional columns BEFORE adding parquet file to the - // catalog. If the order is reversed, the querier or - // compactor may see a parquet file with an inconsistent - // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 - if let Some(new_sort_key) = sort_key_update { - let sort_key = new_sort_key.to_columns().collect::>(); - Backoff::new(&self.backoff_config) - .retry_all_errors("update_sort_key", || async { - let mut repos = self.catalog.repositories().await; - let _partition = repos - .partitions() - .update_sort_key(partition_id, &sort_key) - .await?; - // compiler insisted on getting told the type of the error :shrug: - Ok(()) as Result<(), iox_catalog::interface::Error> - }) - .await - .expect("retry forever"); - debug!( - ?partition_id, - table = partition_info.table_name, - ?new_sort_key, - "adjusted sort key during batch compact & persist" - ); - } - - // Add the parquet file to the catalog until succeed - let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { - table_schema.columns.get(name).expect("Unknown column").id - }); - - // Assert partitions are persisted in-order. - // - // It is an invariant that partitions are persisted in order so that - // both the per-shard, and per-partition watermarks are correctly - // advanced and accurate. - if let Some(last_persist) = partition_info.partition.persisted_sequence_number { - assert!( - parquet_file.max_sequence_number > last_persist, - "out of order partition persistence, persisting {}, previously persisted {}", - parquet_file.max_sequence_number.get(), - last_persist.get(), - ); - } - - // Add the parquet file to the catalog. - // - // This has the effect of allowing the queriers to "discover" the - // parquet file by polling / querying the catalog. + // Update the sort key in the catalog if there are + // additional columns BEFORE adding parquet file to the + // catalog. If the order is reversed, the querier or + // compactor may see a parquet file with an inconsistent + // sort key. 
https://github.com/influxdata/influxdb_iox/issues/5090 + if let Some(new_sort_key) = sort_key_update { + let sort_key = new_sort_key.to_columns().collect::>(); Backoff::new(&self.backoff_config) - .retry_all_errors("add parquet file to catalog", || async { + .retry_all_errors("update_sort_key", || async { let mut repos = self.catalog.repositories().await; - let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; - debug!( - ?partition_id, - table_id=?parquet_file.table_id, - parquet_file_id=?parquet_file.id, - table_name=%iox_metadata.table_name, - "parquet file written to catalog" - ); + let _partition = repos + .partitions() + .update_sort_key(partition_id, &sort_key) + .await?; // compiler insisted on getting told the type of the error :shrug: Ok(()) as Result<(), iox_catalog::interface::Error> }) .await .expect("retry forever"); - - // Update the per-partition persistence watermark, so that new - // ingester instances skip the just-persisted ops during replay. - // - // This could be transactional with the above parquet insert to - // maintain catalog consistency, though in practice it is an - // unnecessary overhead - the system can tolerate replaying the ops - // that lead to this parquet file being generated, and tolerate - // creating a parquet file containing duplicate data (remedied by - // compaction). - // - // This means it is possible to observe a parquet file with a - // max_persisted_sequence_number > - // partition.persisted_sequence_number, either in-between these - // catalog updates, or for however long it takes a crashed ingester - // to restart and replay the ops, and re-persist a file containing - // the same (or subset of) data. - // - // The above is also true of the per-shard persist marker that - // governs the ingester's replay start point, which is - // non-transactionally updated after all partitions have persisted. 
- Backoff::new(&self.backoff_config) - .retry_all_errors("set partition persist marker", || async { - self.catalog - .repositories() - .await - .partitions() - .update_persisted_sequence_number( - parquet_file.partition_id, - parquet_file.max_sequence_number, - ) - .await - }) - .await - .expect("retry forever"); - - // Record metrics - let attributes = Attributes::from([( - "shard_id", - format!("{}", partition_info.partition.shard_id).into(), - )]); - self.persisted_file_size_bytes - .recorder(attributes) - .record(file_size as u64); - - // and remove the persisted data from memory - namespace - .mark_persisted( - &partition_info.table_name, - &partition_info.partition.partition_key, - iox_metadata.max_sequence_number, - ) - .await; debug!( ?partition_id, - table_name=%partition_info.table_name, - partition_key=%partition_info.partition.partition_key, - max_sequence_number=%iox_metadata.max_sequence_number.get(), - "marked partition as persisted" + table = partition_info.table_name, + ?new_sort_key, + "adjusted sort key during batch compact & persist" ); } + + // Add the parquet file to the catalog until succeed + let parquet_file = iox_metadata.to_parquet_file(partition_id, file_size, &md, |name| { + table_schema.columns.get(name).expect("Unknown column").id + }); + + // Assert partitions are persisted in-order. + // + // It is an invariant that partitions are persisted in order so that + // both the per-shard, and per-partition watermarks are correctly + // advanced and accurate. + if let Some(last_persist) = partition_info.partition.persisted_sequence_number { + assert!( + parquet_file.max_sequence_number > last_persist, + "out of order partition persistence, persisting {}, previously persisted {}", + parquet_file.max_sequence_number.get(), + last_persist.get(), + ); + } + + // Add the parquet file to the catalog. + // + // This has the effect of allowing the queriers to "discover" the + // parquet file by polling / querying the catalog. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("add parquet file to catalog", || async { + let mut repos = self.catalog.repositories().await; + let parquet_file = repos.parquet_files().create(parquet_file.clone()).await?; + debug!( + ?partition_id, + table_id=?parquet_file.table_id, + parquet_file_id=?parquet_file.id, + table_name=%iox_metadata.table_name, + "parquet file written to catalog" + ); + // compiler insisted on getting told the type of the error :shrug: + Ok(()) as Result<(), iox_catalog::interface::Error> + }) + .await + .expect("retry forever"); + + // Update the per-partition persistence watermark, so that new + // ingester instances skip the just-persisted ops during replay. + // + // This could be transactional with the above parquet insert to + // maintain catalog consistency, though in practice it is an + // unnecessary overhead - the system can tolerate replaying the ops + // that lead to this parquet file being generated, and tolerate + // creating a parquet file containing duplicate data (remedied by + // compaction). + // + // This means it is possible to observe a parquet file with a + // max_persisted_sequence_number > + // partition.persisted_sequence_number, either in-between these + // catalog updates, or for however long it takes a crashed ingester + // to restart and replay the ops, and re-persist a file containing + // the same (or subset of) data. + // + // The above is also true of the per-shard persist marker that + // governs the ingester's replay start point, which is + // non-transactionally updated after all partitions have persisted. 
+ Backoff::new(&self.backoff_config) + .retry_all_errors("set partition persist marker", || async { + self.catalog + .repositories() + .await + .partitions() + .update_persisted_sequence_number( + parquet_file.partition_id, + parquet_file.max_sequence_number, + ) + .await + }) + .await + .expect("retry forever"); + + // Record metrics + let attributes = Attributes::from([( + "shard_id", + format!("{}", partition_info.partition.shard_id).into(), + )]); + self.persisted_file_size_bytes + .recorder(attributes) + .record(file_size as u64); + + // and remove the persisted data from memory + let table_name = TableName::from(&partition_info.table_name); + namespace + .mark_persisted( + &table_name, + &partition_info.partition.partition_key, + iox_metadata.max_sequence_number, + ) + .await; + debug!( + ?partition_id, + %table_name, + partition_key=%partition_info.partition.partition_key, + max_sequence_number=%iox_metadata.max_sequence_number.get(), + "marked partition as persisted" + ); } async fn update_min_unpersisted_sequence_number( @@ -475,172 +475,24 @@ impl Persister for IngesterData { } } -/// Stream of snapshots. -/// -/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. -pub(crate) type SnapshotStream = - Pin> + Send>>; - -/// Response data for a single partition. -pub(crate) struct IngesterQueryPartition { - /// Stream of snapshots. - snapshots: SnapshotStream, - - /// Partition ID. - id: PartitionId, - - /// Partition persistence status. 
- status: PartitionStatus, -} - -impl std::fmt::Debug for IngesterQueryPartition { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryPartition") - .field("snapshots", &"") - .field("id", &self.id) - .field("status", &self.status) - .finish() - } -} - -impl IngesterQueryPartition { - pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { - Self { - snapshots, - id, - status, - } - } -} - -/// Stream of partitions in this response. -pub(crate) type IngesterQueryPartitionStream = - Pin> + Send>>; - -/// Response streams for querier<>ingester requests. -/// -/// The data structure is constructed to allow lazy/streaming data generation. For easier -/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. -pub struct IngesterQueryResponse { - /// Stream of partitions. - partitions: IngesterQueryPartitionStream, -} - -impl std::fmt::Debug for IngesterQueryResponse { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IngesterQueryResponse") - .field("partitions", &"") - .finish() - } -} - -impl IngesterQueryResponse { - /// Make a response - pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { - Self { partitions } - } - - /// Flattens the data according to the wire protocol. 
- pub fn flatten(self) -> FlatIngesterQueryResponseStream { - self.partitions - .flat_map(|partition_res| match partition_res { - Ok(partition) => { - let head = futures::stream::once(async move { - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: partition.id, - status: partition.status, - }) - }); - let tail = partition - .snapshots - .flat_map(|snapshot_res| match snapshot_res { - Ok(snapshot) => { - let schema = Arc::new(optimize_schema(&snapshot.schema())); - - let schema_captured = Arc::clone(&schema); - let head = futures::stream::once(async { - Ok(FlatIngesterQueryResponse::StartSnapshot { - schema: schema_captured, - }) - }); - - let tail = snapshot.map(move |batch_res| match batch_res { - Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { - batch: optimize_record_batch(&batch, Arc::clone(&schema))?, - }), - Err(e) => Err(e), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }); - - head.chain(tail).boxed() - } - Err(e) => futures::stream::once(async { Err(e) }).boxed(), - }) - .boxed() - } -} - -/// Flattened version of [`IngesterQueryResponse`]. -pub(crate) type FlatIngesterQueryResponseStream = - Pin> + Send>>; - -/// Element within the flat wire protocol. -#[derive(Debug, PartialEq)] -pub enum FlatIngesterQueryResponse { - /// Start a new partition. - StartPartition { - /// Partition ID. - partition_id: PartitionId, - - /// Partition persistence status. - status: PartitionStatus, - }, - - /// Start a new snapshot. - /// - /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) - /// message. - StartSnapshot { - /// Snapshot schema. - schema: Arc, - }, - - /// Add a record batch to the snapshot that was announced by the last - /// [`StartSnapshot`](Self::StartSnapshot) message. - RecordBatch { - /// Record batch. 
- batch: RecordBatch, - }, -} - #[cfg(test)] mod tests { - use std::{ - ops::DerefMut, - sync::Arc, - task::{Context, Poll}, - time::Duration, - }; + use std::{ops::DerefMut, sync::Arc, time::Duration}; - use arrow::datatypes::SchemaRef; use assert_matches::assert_matches; use data_types::{ ColumnId, ColumnSet, CompactionLevel, DeletePredicate, NamespaceSchema, NonEmptyString, ParquetFileParams, Sequence, Timestamp, TimestampRange, }; - use datafusion::physical_plan::RecordBatchStream; + use dml::{DmlDelete, DmlMeta, DmlWrite}; use futures::TryStreamExt; use iox_catalog::{mem::MemCatalog, validate_or_insert_schema}; use iox_time::Time; use metric::{MetricObserver, Observation}; - use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch}; + use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; - use schema::selection::Selection; + use uuid::Uuid; use super::*; @@ -804,17 +656,20 @@ mod tests { // limits) assert!(!should_pause); - let partition_id = { + let (table_id, partition_id) = { let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("mem").is_some()); + let n = sd.namespace(&"foo".into()).unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"mem".into()).is_some()); let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); - p.id() + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); + (mem_table.table_id(), p.partition_id()) }; - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store @@ -945,17 +800,20 @@ mod tests { assert_progress(&data, shard_index, expected_progress).await; let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + 
let n = sd.namespace(&"foo".into()).unwrap(); let partition_id; let table_id; { - let mem_table = n.table_data("mem").unwrap(); - assert!(n.table_data("cpu").is_some()); - let mem_table = mem_table.write().await; - let p = mem_table.partition_data.get(&"1970-01-01".into()).unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); + assert!(n.table_data(&"cpu".into()).is_some()); + let mem_table = mem_table.write().await; table_id = mem_table.table_id(); - partition_id = p.id(); + + let p = mem_table + .get_partition_by_key(&"1970-01-01".into()) + .unwrap(); + partition_id = p.partition_id(); } { // verify the partition doesn't have a sort key before any data has been persisted @@ -969,7 +827,8 @@ mod tests { assert!(partition_info.partition.sort_key.is_empty()); } - data.persist(partition_id).await; + data.persist(shard1.id, namespace.id, table_id, partition_id) + .await; // verify that a file got put into object store let file_paths: Vec<_> = object_store @@ -1061,7 +920,7 @@ mod tests { .unwrap(); assert_eq!(partition_info.partition.sort_key, vec!["time"]); - let mem_table = n.table_data("mem").unwrap(); + let mem_table = n.table_data(&"mem".into()).unwrap(); let mem_table = mem_table.read().await; // verify that the parquet_max_sequence_number got updated @@ -1177,7 +1036,7 @@ mod tests { // Get the namespace let sd = data.shards.get(&shard1.id).unwrap(); - let n = sd.namespace("foo").unwrap(); + let n = sd.namespace(&"foo".into()).unwrap(); let expected_progress = ShardProgress::new().with_buffered(SequenceNumber::new(1)); assert_progress(&data, shard_index, expected_progress).await; @@ -1336,23 +1195,28 @@ mod tests { Arc::clone(&metrics), Arc::new(SystemProvider::new()), ); - let exec = Executor::new(1); let partition_provider = Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))); - let data = NamespaceData::new(namespace.id, shard.id, partition_provider, &*metrics); + let data = NamespaceData::new( + namespace.id, + "foo".into(), + 
shard.id, + partition_provider, + &*metrics, + ); // w1 should be ignored because the per-partition replay offset is set // to 1 already, so it shouldn't be buffered and the buffer should // remain empty. let should_pause = data - .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle(), &exec) + .buffer_operation(DmlOperation::Write(w1), &catalog, &manager.handle()) .await .unwrap(); { - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; - let p = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let p = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( p.max_persisted_sequence_number(), Some(SequenceNumber::new(1)) @@ -1362,13 +1226,13 @@ mod tests { assert!(!should_pause); // w2 should be in the buffer - data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle(), &exec) + data.buffer_operation(DmlOperation::Write(w2), &catalog, &manager.handle()) .await .unwrap(); - let table_data = data.table_data("mem").unwrap(); + let table_data = data.table_data(&"mem".into()).unwrap(); let table = table_data.read().await; - let partition = table.partition_data.get(&"1970-01-01".into()).unwrap(); + let partition = table.get_partition_by_key(&"1970-01-01".into()).unwrap(); assert_eq!( partition.data.buffer.as_ref().unwrap().min_sequence_number, SequenceNumber::new(2) @@ -1454,19 +1318,6 @@ mod tests { .await .unwrap(); - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name) - .unwrap() - .table_data("mem") - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - None, - ); - let predicate = DeletePredicate { range: TimestampRange::new(1, 2), exprs: vec![], @@ -1485,19 +1336,6 @@ mod tests { data.buffer_operation(shard1.id, DmlOperation::Delete(d1), &manager.handle()) .await .unwrap(); - - assert_eq!( - data.shard(shard1.id) - .unwrap() - .namespace(&namespace.name) - .unwrap() - 
.table_data("mem") - .unwrap() - .read() - .await - .tombstone_max_sequence_number(), - Some(SequenceNumber::new(2)), - ); } /// Verifies that the progress in data is the same as expected_progress @@ -1513,132 +1351,4 @@ mod tests { assert_eq!(progresses, expected_progresses); } - - #[tokio::test] - async fn test_ingester_query_response_flatten() { - let batch_1_1 = lp_to_batch("table x=1 0"); - let batch_1_2 = lp_to_batch("table x=2 1"); - let batch_2 = lp_to_batch("table y=1 10"); - let batch_3 = lp_to_batch("table z=1 10"); - - let schema_1 = batch_1_1.schema(); - let schema_2 = batch_2.schema(); - let schema_3 = batch_3.schema(); - - let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([ - Ok(Box::pin(TestRecordBatchStream::new( - vec![ - Ok(batch_1_1.clone()), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(batch_1_2.clone()), - ], - Arc::clone(&schema_1), - )) as _), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(Box::pin(TestRecordBatchStream::new( - vec![Ok(batch_2.clone())], - Arc::clone(&schema_2), - )) as _), - Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), - ])), - PartitionId::new(2), - PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: Some(SequenceNumber::new(1)), - }, - )), - Err(ArrowError::IoError("some io error".into())), - Ok(IngesterQueryPartition::new( - Box::pin(futures::stream::iter([])), - PartitionId::new(1), - PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, - }, - )), - ]))); - - let actual: Vec<_> = response.flatten().collect().await; - let expected = vec![ - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(2), - status: PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: Some(SequenceNumber::new(1)), - }, - }), - 
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), - Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), - Err(ArrowError::InvalidArgumentError("invalid arg".into())), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), - Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), - Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), - Err(ArrowError::IoError("some io error".into())), - Ok(FlatIngesterQueryResponse::StartPartition { - partition_id: PartitionId::new(1), - status: PartitionStatus { - parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, - }, - }), - ]; - - assert_eq!(actual.len(), expected.len()); - for (actual, expected) in actual.into_iter().zip(expected) { - match (actual, expected) { - (Ok(actual), Ok(expected)) => { - assert_eq!(actual, expected); - } - (Err(_), Err(_)) => { - // cannot compare `ArrowError`, but it's unlikely that someone changed the error - } - (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"), - (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), - } - } - } - - fn lp_to_batch(lp: &str) -> RecordBatch { - lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() - } - - pub struct TestRecordBatchStream { - schema: SchemaRef, - batches: Vec>, - } - - impl TestRecordBatchStream { - pub fn new(batches: Vec>, schema: SchemaRef) -> Self { - Self { schema, batches } - } - } - - impl RecordBatchStream for TestRecordBatchStream { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - } - - impl futures::Stream for TestRecordBatchStream { - type Item = Result; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - _: &mut Context<'_>, - ) -> Poll> { - if self.batches.is_empty() { - Poll::Ready(None) - } else { - Poll::Ready(Some(self.batches.remove(0))) - } - } - - fn 
size_hint(&self) -> (usize, Option) { - (self.batches.len(), Some(self.batches.len())) - } - } } diff --git a/ingester/src/data/namespace.rs b/ingester/src/data/namespace.rs index 6a5ddb9581..9aa414a535 100644 --- a/ingester/src/data/namespace.rs +++ b/ingester/src/data/namespace.rs @@ -1,36 +1,91 @@ //! Namespace level data buffer structures. -use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId}; +use data_types::{NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; +use observability_deps::tracing::warn; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use write_summary::ShardProgress; #[cfg(test)] use super::triggers::TestTriggers; use super::{ - partition::{resolver::PartitionProvider, PersistingBatch}, - table::TableData, + partition::resolver::PartitionProvider, + table::{TableData, TableName}, }; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`TableData`] can be looked up by name, or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_name: HashMap>>, + by_id: HashMap>>, +} + +impl DoubleRef { + fn insert(&mut self, t: TableData) -> Arc> { + let name = t.table_name().clone(); + let id = t.table_id(); + + let t = Arc::new(tokio::sync::RwLock::new(t)); + self.by_name.insert(name, Arc::clone(&t)); + self.by_id.insert(id, Arc::clone(&t)); + t + } + + fn by_name(&self, name: &TableName) -> Option>> { + self.by_name.get(name).map(Arc::clone) + } + + fn by_id(&self, id: TableId) -> Option>> { + self.by_id.get(&id).map(Arc::clone) + } +} + +/// The string name / identifier of a Namespace. +/// +/// A reference-counted, cheap clone-able string. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) struct NamespaceName(Arc); + +impl From for NamespaceName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::ops::Deref for NamespaceName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for NamespaceName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Data of a Namespace that belongs to a given Shard #[derive(Debug)] pub(crate) struct NamespaceData { namespace_id: NamespaceId, + namespace_name: NamespaceName, /// The catalog ID of the shard this namespace is being populated from. shard_id: ShardId, - tables: RwLock>>>, + tables: RwLock, table_count: U64Counter, /// The resolver of `(shard_id, table_id, partition_key)` to @@ -87,8 +142,9 @@ pub(crate) struct NamespaceData { impl NamespaceData { /// Initialize new tables with default partition template of daily - pub fn new( + pub(super) fn new( namespace_id: NamespaceId, + namespace_name: NamespaceName, shard_id: ShardId, partition_provider: Arc, metrics: &metric::Registry, @@ -102,6 +158,7 @@ impl NamespaceData { Self { namespace_id, + namespace_name, shard_id, tables: Default::default(), table_count, @@ -120,7 +177,6 @@ impl NamespaceData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { let sequence_number = dml_operation .meta() @@ -146,6 +202,7 @@ impl NamespaceData { .clone(); for (t, b) in write.into_tables() { + let t = TableName::from(t); let table_data = match self.table_data(&t) { Some(t) => t, None => self.insert_table(&t, catalog).await?, @@ -171,19 +228,17 @@ impl NamespaceData { Ok(pause_writes) } DmlOperation::Delete(delete) => { - let table_name = delete.table_name().context(super::TableNotPresentSnafu)?; - let table_data = match self.table_data(table_name) { - Some(t) => t, - None => 
self.insert_table(table_name, catalog).await?, - }; + // Deprecated delete support: + // https://github.com/influxdata/influxdb_iox/issues/5825 + warn!( + shard_id=%self.shard_id, + namespace_name=%self.namespace_name, + namespace_id=%self.namespace_id, + table_name=?delete.table_name(), + sequence_number=?delete.meta().sequence(), + "discarding unsupported delete op" + ); - let mut table_data = table_data.write().await; - - table_data - .buffer_delete(delete.predicate(), sequence_number, &**catalog, executor) - .await?; - - // don't pause writes since deletes don't count towards memory limits Ok(false) } } @@ -194,16 +249,16 @@ impl NamespaceData { #[cfg(test)] // Only used in tests pub(crate) async fn snapshot( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, ) -> Option<( Vec>, - Option>, + Option>, )> { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; - return t.partition_data.get_mut(partition_key).map(|p| { + return t.get_partition_by_key_mut(partition_key).map(|p| { p.data .generate_snapshot() .expect("snapshot on mutable batch should never fail"); @@ -217,17 +272,17 @@ impl NamespaceData { /// Snapshots the mutable buffer for the partition, which clears it out and then moves all /// snapshots over to a persisting batch, which is returned. If there is no data to snapshot /// or persist, None will be returned. 
+ #[cfg(test)] // Only used in tests pub(crate) async fn snapshot_to_persisting( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, - ) -> Option> { + ) -> Option> { if let Some(table_data) = self.table_data(table_name) { let mut table_data = table_data.write().await; return table_data - .partition_data - .get_mut(partition_key) + .get_partition_by_key_mut(partition_key) .and_then(|partition_data| partition_data.snapshot_to_persisting_batch()); } @@ -237,45 +292,55 @@ impl NamespaceData { /// Gets the buffered table data pub(crate) fn table_data( &self, - table_name: &str, + table_name: &TableName, ) -> Option>> { let t = self.tables.read(); - t.get(table_name).cloned() + t.by_name(table_name) + } + + /// Return the table data by ID. + pub(crate) fn table_id( + &self, + table_id: TableId, + ) -> Option>> { + let t = self.tables.read(); + t.by_id(table_id) } /// Inserts the table or returns it if it happens to be inserted by some other thread async fn insert_table( &self, - table_name: &str, + table_name: &TableName, catalog: &Arc, ) -> Result>, super::Error> { let mut repos = catalog.repositories().await; + let info = repos .tables() .get_table_persist_info(self.shard_id, self.namespace_id, table_name) .await .context(super::CatalogSnafu)? - .context(super::TableNotFoundSnafu { table_name })?; + .ok_or_else(|| super::Error::TableNotFound { + table_name: table_name.to_string(), + })?; let mut t = self.tables.write(); - let data = match t.entry(table_name.to_string()) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new( + Ok(match t.by_name(table_name) { + Some(v) => v, + None => { + self.table_count.inc(1); + + // Insert the table and then return a ref to it. 
+ t.insert(TableData::new( info.table_id, - table_name, + table_name.clone(), self.shard_id, self.namespace_id, - info.tombstone_max_sequence_number, Arc::clone(&self.partition_provider), - )))); - self.table_count.inc(1); - Arc::clone(v) + )) } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - - Ok(data) + }) } /// Walks down the table and partition and clears the persisting batch. The sequence number is @@ -283,13 +348,13 @@ impl NamespaceData { /// data buffer. pub(super) async fn mark_persisted( &self, - table_name: &str, + table_name: &TableName, partition_key: &PartitionKey, sequence_number: SequenceNumber, ) { if let Some(t) = self.table_data(table_name) { let mut t = t.write().await; - let partition = t.partition_data.get_mut(partition_key); + let partition = t.get_partition_by_key_mut(partition_key); if let Some(p) = partition { p.mark_persisted(sequence_number); @@ -299,7 +364,7 @@ impl NamespaceData { /// Return progress from this Namespace pub(super) async fn progress(&self) -> ShardProgress { - let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect(); + let tables: Vec<_> = self.tables.read().by_id.values().map(Arc::clone).collect(); // Consolidate progtress across partitions. let mut progress = ShardProgress::new() @@ -323,6 +388,12 @@ impl NamespaceData { pub(super) fn table_count(&self) -> &U64Counter { &self.table_count } + + /// Returns the [`NamespaceName`] for this namespace. 
+ #[cfg(test)] + pub(crate) fn namespace_name(&self) -> &NamespaceName { + &self.namespace_name + } } /// RAAI struct that sets buffering sequence number on creation and clears it on free @@ -357,3 +428,92 @@ impl<'a> Drop for ScopedSequenceNumber<'a> { *buffering_sequence_number = None; } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_namespace_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let ns = NamespaceData::new( + ns_id, + NAMESPACE_NAME.into(), + shard_id, + partition_provider, + &*metrics, + ); + + // Assert the namespace name was stored + assert_eq!(&**ns.namespace_name(), NAMESPACE_NAME); + + // Assert the namespace does not contain the test data + assert!(ns.table_data(&TABLE_NAME.into()).is_none()); + assert!(ns.table_id(table_id).is_none()); + + // Write some test data + ns.buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(ns.table_data(&TABLE_NAME.into()).is_some()); + assert!(ns.table_id(table_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_tables_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} diff --git a/ingester/src/data/partition.rs b/ingester/src/data/partition.rs index 1ec531fdbc..61dd4c36d2 100644 --- a/ingester/src/data/partition.rs +++ b/ingester/src/data/partition.rs @@ -3,18 +3,21 @@ use std::sync::Arc; use arrow::record_batch::RecordBatch; -use data_types::{ - NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, Tombstone, -}; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; -use schema::selection::Selection; +use 
observability_deps::tracing::*; +use schema::{selection::Selection, sort::SortKey}; use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; -use self::buffer::{BufferBatch, DataBuffer}; -use crate::{data::query_dedup::query, query::QueryableBatch}; +use self::{ + buffer::{BufferBatch, DataBuffer}, + resolver::DeferredSortKey, +}; +use crate::{querier_handler::PartitionStatus, query::QueryableBatch}; + +use super::table::TableName; mod buffer; pub mod resolver; @@ -28,20 +31,6 @@ pub(crate) struct UnpersistedPartitionData { pub(crate) partition_status: PartitionStatus, } -/// Status of a partition that has unpersisted data. -/// -/// Note that this structure is specific to a partition (which itself is bound to a table and -/// shard)! -#[derive(Debug, Clone, PartialEq, Eq)] -#[allow(missing_copy_implementations)] -pub struct PartitionStatus { - /// Max sequence number persisted - pub parquet_max_sequence_number: Option, - - /// Max sequence number for a tombstone - pub tombstone_max_sequence_number: Option, -} - /// PersistingBatch contains all needed info and data for creating /// a parquet file for given set of SnapshotBatches #[derive(Debug, PartialEq, Clone)] @@ -132,7 +121,28 @@ impl SnapshotBatch { } } -/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard +/// The load state of the [`SortKey`] for a given partition. +#[derive(Debug)] +pub(crate) enum SortKeyState { + /// The [`SortKey`] has not yet been fetched from the catalog, and will be + /// lazy loaded (or loaded in the background) by a call to + /// [`DeferredSortKey::get()`]. + Deferred(DeferredSortKey), + /// The sort key is known and specified. 
+ Provided(Option), +} + +impl SortKeyState { + async fn get(&self) -> Option { + match self { + Self::Deferred(v) => v.get().await, + Self::Provided(v) => v.clone(), + } + } +} + +/// Data of an IOx Partition of a given Table of a Namespace that belongs to a +/// given Shard #[derive(Debug)] pub struct PartitionData { /// The catalog ID of the partition this buffer is for. @@ -140,12 +150,23 @@ pub struct PartitionData { /// The string partition key for this partition. partition_key: PartitionKey, + /// The sort key of this partition. + /// + /// This can known, in which case this field will contain a + /// [`SortKeyState::Provided`] with the [`SortKey`], or unknown with a value + /// of [`SortKeyState::Deferred`] causing it to be loaded from the catalog + /// (potentially) in the background or at read time. + /// + /// Callers should use [`Self::sort_key()`] to be abstracted away from these + /// fetch details. + sort_key: SortKeyState, + /// The shard, namespace & table IDs for this partition. shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, /// The name of the table this partition is part of. 
- table_name: Arc, + table_name: TableName, pub(super) data: DataBuffer, @@ -156,18 +177,21 @@ pub struct PartitionData { impl PartitionData { /// Initialize a new partition data buffer + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: PartitionId, partition_key: PartitionKey, shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, + sort_key: SortKeyState, max_persisted_sequence_number: Option, ) -> Self { Self { id, partition_key, + sort_key, shard_id, namespace_id, table_id, @@ -209,100 +233,36 @@ impl PartitionData { sequence_number: SequenceNumber, mb: MutableBatch, ) -> Result<(), super::Error> { - match &mut self.data.buffer { + let (min_sequence_number, max_sequence_number) = match &mut self.data.buffer { Some(buf) => { buf.max_sequence_number = sequence_number.max(buf.max_sequence_number); buf.data.extend_from(&mb).context(super::BufferWriteSnafu)?; + (buf.min_sequence_number, buf.max_sequence_number) } None => { self.data.buffer = Some(BufferBatch { min_sequence_number: sequence_number, max_sequence_number: sequence_number, data: mb, - }) + }); + (sequence_number, sequence_number) } - } + }; + trace!( + min_sequence_number=?min_sequence_number, + max_sequence_number=?max_sequence_number, + "buffered write" + ); Ok(()) } - /// Buffers a new tombstone: - /// . All the data in the `buffer` and `snapshots` will be replaced with one - /// tombstone-applied snapshot - /// . 
The tombstone is only added in the `deletes_during_persisting` if the `persisting` - /// exists - pub(super) async fn buffer_tombstone(&mut self, executor: &Executor, tombstone: Tombstone) { - self.data.add_tombstone(tombstone.clone()); - - // ---------------------------------------------------------- - // First apply the tombstone on all in-memory & non-persisting data - // Make a QueryableBatch for all buffer + snapshots + the given tombstone - let max_sequence_number = tombstone.sequence_number; - let query_batch = match self.data.snapshot_to_queryable_batch( - &self.table_name, - self.id, - Some(tombstone.clone()), - ) { - Some(query_batch) if !query_batch.is_empty() => query_batch, - _ => { - // No need to proceed further - return; - } - }; - - let (min_sequence_number, _) = query_batch.min_max_sequence_numbers(); - assert!(min_sequence_number <= max_sequence_number); - - // Run query on the QueryableBatch to apply the tombstone. - let stream = match query(executor, Arc::new(query_batch)).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. - panic!("unable to apply tombstones on snapshots: {:?}", e); - } - Ok(stream) => stream, - }; - let record_batches = match datafusion::physical_plan::common::collect(stream).await { - Err(e) => { - // this should never error out. if it does, we need to crash hard so - // someone can take a look. 
- panic!("unable to collect record batches: {:?}", e); - } - Ok(batches) => batches, - }; - - // Merge all result record batches into one record batch - // and make a snapshot for it - let snapshot = if !record_batches.is_empty() { - let record_batch = - arrow::compute::concat_batches(&record_batches[0].schema(), &record_batches) - .unwrap_or_else(|e| { - panic!("unable to concat record batches: {:?}", e); - }); - let snapshot = SnapshotBatch { - min_sequence_number, - max_sequence_number, - data: Arc::new(record_batch), - }; - - Some(Arc::new(snapshot)) - } else { - None - }; - - // ---------------------------------------------------------- - // Add the tombstone-applied data back in as one snapshot - if let Some(snapshot) = snapshot { - self.data.snapshots.push(snapshot); - } - } - /// Return the progress from this Partition pub(super) fn progress(&self) -> ShardProgress { self.data.progress() } - pub(super) fn id(&self) -> PartitionId { + pub(super) fn partition_id(&self) -> PartitionId { self.id } @@ -347,6 +307,13 @@ impl PartitionData { pub fn namespace_id(&self) -> NamespaceId { self.namespace_id } + + /// Return the [`SortKey`] for this partition. + /// + /// NOTE: this MAY involve querying the catalog with unbounded retries. 
+ pub async fn sort_key(&self) -> Option { + self.sort_key.get().await + } } #[cfg(test)] @@ -355,7 +322,6 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use super::*; - use crate::test_util::create_tombstone; #[test] fn snapshot_buffer_different_but_compatible_schemas() { @@ -366,6 +332,7 @@ mod tests { NamespaceId::new(42), TableId::new(1), "foo".into(), + SortKeyState::Provided(None), None, ); @@ -401,7 +368,7 @@ mod tests { // Test deletes mixed with writes on a single parittion #[tokio::test] - async fn writes_and_deletes() { + async fn writes() { // Make a partition with empty DataBuffer let s_id = 1; let t_id = 1; @@ -413,9 +380,9 @@ mod tests { NamespaceId::new(42), TableId::new(t_id), "restaurant".into(), + SortKeyState::Provided(None), None, ); - let exec = Executor::new(1); // ------------------------------------------ // Fill `buffer` @@ -438,42 +405,8 @@ mod tests { SequenceNumber::new(2) ); assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 0); assert_eq!(p.data.persisting, None); - // ------------------------------------------ - // Delete - // --- seq_num: 3 - let ts = create_tombstone( - 1, // tombstone id - t_id, // table id - s_id, // shard id - 3, // delete's seq_number - 0, // min time of data to get deleted - 20, // max time of data to get deleted - "day=thu", // delete predicate - ); - // one row will get deleted, the other is moved to snapshot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has one row since the other one got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+--------+-----+------+--------------------------------+", - "| city | day | temp | time |", - 
"+--------+-----+------+--------------------------------+", - "| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |", - "+--------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 3); - // ------------------------------------------ // Fill `buffer` // --- seq_num: 4 @@ -493,50 +426,15 @@ mod tests { // verify data assert_eq!( p.data.buffer.as_ref().unwrap().min_sequence_number, - SequenceNumber::new(4) + SequenceNumber::new(1) ); assert_eq!( p.data.buffer.as_ref().unwrap().max_sequence_number, SequenceNumber::new(5) ); - assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot - assert_eq!(p.data.deletes_during_persisting().len(), 0); + assert_eq!(p.data.snapshots.len(), 0); assert_eq!(p.data.persisting, None); - - // ------------------------------------------ - // Delete - // --- seq_num: 6 - let ts = create_tombstone( - 2, // tombstone id - t_id, // table id - s_id, // shard id - 6, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "city=Boston", // delete predicate - ); - // two rows will get deleted, one from existing snapshot, one from the buffer being moved - // to snpashot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // one snpashot - assert_eq!(p.data.deletes_during_persisting().len(), 0); - assert_eq!(p.data.persisting, None); - // snapshot only has two rows since the other 2 rows with city=Boston have got deleted - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+---------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+---------+-----+------+--------------------------------+", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - 
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "+---------+-----+------+--------------------------------+", - ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 1); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 6); + assert!(p.data.buffer.is_some()); // ------------------------------------------ // Persisting @@ -545,32 +443,12 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // always empty after issuing persit assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit - assert_eq!(p.data.deletes_during_persisting().len(), 0); // deletes not happen yet assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // ------------------------------------------ - // Delete - // --- seq_num: 7 - let ts = create_tombstone( - 3, // tombstone id - t_id, // table id - s_id, // shard id - 7, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=55", // delete predicate - ); - // if a query come while persisting, the row with temp=55 will be deleted before - // data is sent back to Querier - p.buffer_tombstone(&exec, ts).await; - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - // no snpashots becasue buffer has not data yet and the - // snapshot was empty too - assert_eq!(p.data.snapshots.len(), 0); - assert_eq!(p.data.deletes_during_persisting().len(), 1); // tombstone added since data is - // persisting + assert!(p.data.buffer.is_none()); + assert_eq!(p.data.snapshots.len(), 0); // no snpashots becasue buffer has not data yet and the + // snapshot was empty too assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -591,7 +469,6 @@ mod tests { SequenceNumber::new(8) ); // 1 newly added mutable batch of 3 rows of data assert_eq!(p.data.snapshots.len(), 0); // still empty - 
assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // ------------------------------------------ @@ -600,7 +477,6 @@ mod tests { // verify data assert!(p.data.buffer.is_none()); // empty after snapshot assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer - assert_eq!(p.data.deletes_during_persisting().len(), 1); assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); // snapshot has three rows moved from buffer let data = (*p.data.snapshots[0].data).clone(); @@ -616,41 +492,5 @@ mod tests { assert_batches_sorted_eq!(&expected, &[data]); assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 8); - - // ------------------------------------------ - // Delete - // --- seq_num: 9 - let ts = create_tombstone( - 4, // tombstone id - t_id, // table id - s_id, // shard id - 9, // delete's seq_number - 10, // min time of data to get deleted - 50, // max time of data to get deleted - "temp=60", // delete predicate - ); - // the row with temp=60 will be removed from the sanphot - p.buffer_tombstone(&exec, ts).await; - - // verify data - assert!(p.data.buffer.is_none()); // always empty after delete - assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied - assert_eq!(p.data.deletes_during_persisting().len(), 2); // one more tombstone added make it 2 - assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch))); - // snapshot has only 2 rows because the row with tem=60 was removed - let data = (*p.data.snapshots[0].data).clone(); - let expected = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |", - "| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |", - "+------------+-----+------+--------------------------------+", 
- ]; - assert_batches_sorted_eq!(&expected, &[data]); - assert_eq!(p.data.snapshots[0].min_sequence_number.get(), 8); - assert_eq!(p.data.snapshots[0].max_sequence_number.get(), 9); - - exec.join().await; } } diff --git a/ingester/src/data/partition/buffer.rs b/ingester/src/data/partition/buffer.rs index 739da735fa..866e7a966c 100644 --- a/ingester/src/data/partition/buffer.rs +++ b/ingester/src/data/partition/buffer.rs @@ -2,13 +2,15 @@ use std::sync::Arc; -use data_types::{PartitionId, SequenceNumber, ShardId, TableId, Tombstone}; +use data_types::{PartitionId, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; use schema::selection::Selection; use snafu::ResultExt; use uuid::Uuid; use write_summary::ShardProgress; +use crate::data::table::TableName; + use super::{PersistingBatch, QueryableBatch, SnapshotBatch}; /// Data of an IOx partition split into batches @@ -38,14 +40,6 @@ pub(crate) struct DataBuffer { /// Buffer of incoming writes pub(crate) buffer: Option, - /// Buffer of tombstones whose time range may overlap with this partition. - /// All tombstones were already applied to corresponding snapshots. This list - /// only keep the ones that come during persisting. The reason - /// we keep them becasue if a query comes, we need to apply these tombstones - /// on the persiting data before sending it to the Querier - /// When the `persiting` is done and removed, this list will get empty, too - deletes_during_persisting: Vec, - /// Data in `buffer` will be moved to a `snapshot` when one of these happens: /// . A background persist is called /// . A read request from Querier @@ -70,14 +64,6 @@ pub(crate) struct DataBuffer { } impl DataBuffer { - /// Add a new tombstones into the [`DataBuffer`]. 
- pub(super) fn add_tombstone(&mut self, tombstone: Tombstone) { - // Only keep this tombstone if some data is being persisted - if self.persisting.is_some() { - self.deletes_during_persisting.push(tombstone); - } - } - /// If a [`BufferBatch`] exists, convert it to a [`SnapshotBatch`] and add /// it to the list of snapshots. /// @@ -109,9 +95,8 @@ impl DataBuffer { /// Both buffer and snapshots will be empty after this pub(super) fn snapshot_to_queryable_batch( &mut self, - table_name: &Arc, + table_name: &TableName, partition_id: PartitionId, - tombstone: Option, ) -> Option { self.generate_snapshot() .expect("This mutable batch snapshot error should be impossible."); @@ -119,21 +104,11 @@ impl DataBuffer { let mut data = vec![]; std::mem::swap(&mut data, &mut self.snapshots); - let mut tombstones = vec![]; - if let Some(tombstone) = tombstone { - tombstones.push(tombstone); - } - // only produce batch if there is any data if data.is_empty() { None } else { - Some(QueryableBatch::new( - Arc::clone(table_name), - partition_id, - data, - tombstones, - )) + Some(QueryableBatch::new(table_name.clone(), partition_id, data)) } } @@ -164,15 +139,13 @@ impl DataBuffer { shard_id: ShardId, table_id: TableId, partition_id: PartitionId, - table_name: &Arc, + table_name: &TableName, ) -> Option> { if self.persisting.is_some() { panic!("Unable to snapshot while persisting. 
This is an unexpected state.") } - if let Some(queryable_batch) = - self.snapshot_to_queryable_batch(table_name, partition_id, None) - { + if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, partition_id) { let persisting_batch = Arc::new(PersistingBatch { shard_id, table_id, @@ -197,12 +170,7 @@ impl DataBuffer { }; // persisting data - let mut queryable_batch = (*persisting.data).clone(); - - // Add new tombstones if any - queryable_batch.add_tombstones(&self.deletes_during_persisting); - - Some(queryable_batch) + Some((*persisting.data).clone()) } /// Return the progress in this DataBuffer @@ -239,12 +207,6 @@ impl DataBuffer { pub(crate) fn mark_persisted(&mut self) { self.persisting = None; - self.deletes_during_persisting.clear() - } - - #[cfg(test)] - pub(super) fn deletes_during_persisting(&self) -> &[Tombstone] { - self.deletes_during_persisting.as_ref() } } diff --git a/ingester/src/data/partition/resolver/cache.rs b/ingester/src/data/partition/resolver/cache.rs index 0dda53f057..7f282ae38c 100644 --- a/ingester/src/data/partition/resolver/cache.rs +++ b/ingester/src/data/partition/resolver/cache.rs @@ -1,13 +1,18 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use async_trait::async_trait; +use backoff::BackoffConfig; use data_types::{ NamespaceId, Partition, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId, }; +use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::{ + partition::{resolver::DeferredSortKey, PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -43,6 +48,18 @@ struct Entry { /// Each cache hit _removes_ the entry from the cache - this eliminates the /// memory overhead for items that were hit. This is the expected (only valid!) /// usage pattern. 
+/// +/// # Deferred Sort Key Loading +/// +/// This cache does NOT cache the [`SortKey`] for each [`PartitionData`], as the +/// sort key can be large and is likely unique per table, and thus not +/// share-able across instances / prohibitively expensive to cache. +/// +/// Instead cached instances are returned with a deferred sort key resolver +/// which attempts to fetch the sort key in the background some time after +/// construction. +/// +/// [`SortKey`]: schema::sort::SortKey #[derive(Debug)] pub(crate) struct PartitionCache { // The inner delegate called for a cache miss. @@ -59,13 +76,31 @@ pub(crate) struct PartitionCache { /// a faster search for cache misses. #[allow(clippy::type_complexity)] entries: Mutex>>>, + + /// Data needed to construct the [`DeferredSortKey`] for cached entries. + catalog: Arc, + backoff_config: BackoffConfig, + /// The maximum amount of time a [`DeferredSortKey`] may wait until + /// pre-fetching the sort key in the background. + max_smear: Duration, } impl PartitionCache { /// Initialise a [`PartitionCache`] containing the specified partitions. /// /// Any cache miss is passed through to `inner`. - pub(crate) fn new

(inner: T, partitions: P) -> Self + /// + /// Any cache hit returns a [`PartitionData`] configured with a + /// [`SortKeyState::Deferred`] for deferred key loading in the background. + /// The [`DeferredSortKey`] is initialised with the given `catalog`, + /// `backoff_config`, and `max_smear` maximal load wait duration. + pub(crate) fn new

( + inner: T, + partitions: P, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self where P: IntoIterator, { @@ -97,6 +132,9 @@ impl PartitionCache { Self { entries: Mutex::new(entries), inner, + catalog, + backoff_config, + max_smear, } } @@ -154,7 +192,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { // Use the cached PartitionKey instead of the caller's partition_key, // instead preferring to reuse the already-shared Arc in the cache. @@ -171,6 +209,12 @@ where namespace_id, table_id, table_name, + SortKeyState::Deferred(DeferredSortKey::new( + cached.partition_id, + self.max_smear, + Arc::clone(&self.catalog), + self.backoff_config.clone(), + )), cached.max_sequence_number, ); } @@ -186,6 +230,8 @@ where #[cfg(test)] mod tests { + use iox_catalog::mem::MemCatalog; + + use crate::data::partition::resolver::MockPartitionProvider; + use super::*; @@ -197,6 +243,22 @@ mod tests { const TABLE_ID: TableId = TableId::new(3); const TABLE_NAME: &str = "platanos"; + fn new_cache

( + inner: MockPartitionProvider, + partitions: P, + ) -> PartitionCache + where + P: IntoIterator, + { + PartitionCache::new( + inner, + partitions, + Duration::from_secs(10_000_000), + Arc::new(MemCatalog::new(Arc::new(metric::Registry::default()))), + BackoffConfig::default(), + ) + } + #[tokio::test] async fn test_miss() { let data = PartitionData::new( @@ -206,11 +268,12 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, ); let inner = MockPartitionProvider::default().with_partition(data); - let cache = PartitionCache::new(inner, []); + let cache = new_cache(inner, []); let got = cache .get_partition( PARTITION_KEY.into(), @@ -221,7 +284,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -238,11 +301,11 @@ mod tests { shard_id: SHARD_ID, table_id: TABLE_ID, partition_key: stored_partition_key.clone(), - sort_key: Default::default(), + sort_key: vec!["dos".to_string(), "bananas".to_string()], persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let callers_partition_key = PartitionKey::from(PARTITION_KEY); let got = cache @@ -255,7 +318,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -274,7 +337,7 @@ mod tests { } #[tokio::test] - async fn test_miss_partition_jey() { + async fn test_miss_partition_key() { let other_key = PartitionKey::from("test"); let other_key_id = PartitionId::new(99); let inner = MockPartitionProvider::default().with_partition(PartitionData::new( @@ -284,6 +347,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + 
SortKeyState::Provided(None), None, )); @@ -296,7 +360,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( other_key.clone(), @@ -307,7 +371,7 @@ mod tests { ) .await; - assert_eq!(got.id(), other_key_id); + assert_eq!(got.partition_id(), other_key_id); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); @@ -323,6 +387,7 @@ mod tests { NAMESPACE_ID, other_table, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -335,7 +400,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), @@ -346,7 +411,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), SHARD_ID); assert_eq!(got.table_id(), other_table); assert_eq!(got.table_name(), TABLE_NAME); @@ -362,6 +427,7 @@ mod tests { NAMESPACE_ID, TABLE_ID, TABLE_NAME.into(), + SortKeyState::Provided(None), None, )); @@ -374,7 +440,7 @@ mod tests { persisted_sequence_number: Default::default(), }; - let cache = PartitionCache::new(inner, [partition]); + let cache = new_cache(inner, [partition]); let got = cache .get_partition( PARTITION_KEY.into(), @@ -385,7 +451,7 @@ mod tests { ) .await; - assert_eq!(got.id(), PARTITION_ID); + assert_eq!(got.partition_id(), PARTITION_ID); assert_eq!(got.shard_id(), other_shard); assert_eq!(got.table_id(), TABLE_ID); assert_eq!(got.table_name(), TABLE_NAME); diff --git a/ingester/src/data/partition/resolver/catalog.rs b/ingester/src/data/partition/resolver/catalog.rs index 8035546be6..e42c4876c4 100644 --- a/ingester/src/data/partition/resolver/catalog.rs +++ b/ingester/src/data/partition/resolver/catalog.rs @@ -9,7 +9,10 @@ 
use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId}; use iox_catalog::interface::Catalog; use observability_deps::tracing::debug; -use crate::data::partition::PartitionData; +use crate::data::{ + partition::{PartitionData, SortKeyState}, + table::TableName, +}; use super::r#trait::PartitionProvider; @@ -55,7 +58,7 @@ impl PartitionProvider for CatalogPartitionResolver { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { debug!( %partition_key, @@ -78,6 +81,7 @@ impl PartitionProvider for CatalogPartitionResolver { namespace_id, table_id, table_name, + SortKeyState::Provided(p.sort_key()), p.persisted_sequence_number, ) } @@ -131,7 +135,7 @@ mod tests { }; let callers_partition_key = PartitionKey::from(PARTITION_KEY); - let table_name = TABLE_NAME.into(); + let table_name = TableName::from(TABLE_NAME); let resolver = CatalogPartitionResolver::new(Arc::clone(&catalog)); let got = resolver .get_partition( @@ -139,11 +143,12 @@ mod tests { shard_id, namespace_id, table_id, - Arc::clone(&table_name), + table_name.clone(), ) .await; assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); + assert_eq!(got.sort_key().await, None); assert_eq!(got.max_persisted_sequence_number(), None); assert!(got.partition_key.ptr_eq(&callers_partition_key)); diff --git a/ingester/src/data/partition/resolver/mock.rs b/ingester/src/data/partition/resolver/mock.rs index e65f127ef4..80f859c43e 100644 --- a/ingester/src/data/partition/resolver/mock.rs +++ b/ingester/src/data/partition/resolver/mock.rs @@ -1,12 +1,12 @@ //! A mock [`PartitionProvider`] to inject [`PartitionData`] for tests. 
-use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; use parking_lot::Mutex; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; use super::r#trait::PartitionProvider; @@ -58,7 +58,7 @@ impl PartitionProvider for MockPartitionProvider { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { let p = self .partitions diff --git a/ingester/src/data/partition/resolver/mod.rs b/ingester/src/data/partition/resolver/mod.rs index fcb5e5fb6a..904eb781f5 100644 --- a/ingester/src/data/partition/resolver/mod.rs +++ b/ingester/src/data/partition/resolver/mod.rs @@ -11,6 +11,9 @@ pub use r#trait::*; mod catalog; pub use catalog::*; +mod sort_key; +pub(crate) use sort_key::*; + #[cfg(test)] mod mock; #[cfg(test)] diff --git a/ingester/src/data/partition/resolver/sort_key.rs b/ingester/src/data/partition/resolver/sort_key.rs new file mode 100644 index 0000000000..36e3ee5f1a --- /dev/null +++ b/ingester/src/data/partition/resolver/sort_key.rs @@ -0,0 +1,331 @@ +//! An optimised resolver of a partition [`SortKey`]. + +use std::{sync::Arc, time::Duration}; + +use backoff::{Backoff, BackoffConfig}; +use data_types::PartitionId; +use iox_catalog::interface::Catalog; +use parking_lot::Mutex; +use rand::Rng; +use schema::sort::SortKey; +use tokio::task::JoinHandle; + +/// The states of a [`DeferredSortKey`] instance. +#[derive(Debug)] +enum State { + /// The value has not yet been fetched by the background task. + Unresolved, + /// The value was fetched by the background task and is ready to be consumed. + Resolved(Option), +} + +/// A resolver of [`SortKey`] from the catalog for a given partition.
+/// +/// This implementation combines lazy / deferred loading of the [`SortKey`] from +/// the [`Catalog`], and a background timer that pre-fetches the [`SortKey`] +/// after some random duration of time. Combined, these behaviours smear the +/// [`SortKey`] queries across the allowable time range, avoiding a large number +/// of queries from executing when multiple [`SortKey`] are needed in the system +/// at one point in time. +/// +/// If the [`DeferredSortKey`] is dropped and the background task is still +/// incomplete (sleeping / actively fetching the [`SortKey`]) it is aborted +/// immediately. The background task exits once it has successfully fetched the +/// [`SortKey`]. +/// +/// # Stale Cached Values +/// +/// This is effectively a cache that is pre-warmed in the background - this +/// necessitates that the caller can tolerate, or determine, stale values. +#[derive(Debug)] +pub(crate) struct DeferredSortKey { + value: Arc>, + partition_id: PartitionId, + + handle: JoinHandle<()>, + + backoff_config: BackoffConfig, + catalog: Arc, +} + +impl DeferredSortKey { + /// Construct a [`DeferredSortKey`] instance that fetches the [`SortKey`] + /// for the specified `partition_id`. + /// + /// The background task will wait a uniformly random duration of time + /// between `[0, max_smear)` before attempting to pre-fetch the [`SortKey`] + /// from `catalog`. + pub(crate) fn new( + partition_id: PartitionId, + max_smear: Duration, + catalog: Arc, + backoff_config: BackoffConfig, + ) -> Self { + // Init the value container the background thread populates. + let value = Arc::new(Mutex::new(State::Unresolved)); + + // Select random duration from a uniform distribution, up to the + // configured maximum. + let wait_for = rand::thread_rng().gen_range(Duration::ZERO..max_smear); + + // Spawn the background task, sleeping for the random duration of time + // before fetching the sort key.
+ let handle = tokio::spawn({ + let value = Arc::clone(&value); + let catalog = Arc::clone(&catalog); + let backoff_config = backoff_config.clone(); + async move { + // Sleep for the random duration + tokio::time::sleep(wait_for).await; + // Fetch the sort key from the catalog + let v = fetch(partition_id, &*catalog, &backoff_config).await; + // And attempt to update the value container, if it hasn't + // already resolved + let mut state = value.lock(); + *state = match *state { + State::Unresolved => State::Resolved(v), + State::Resolved(_) => return, + }; + } + }); + + Self { + value, + partition_id, + handle, + backoff_config, + catalog, + } + } + + /// Read the [`SortKey`] for the partition. + /// + /// If the [`SortKey`] was pre-fetched in the background, it is returned + /// immediately. If the [`SortKey`] has not yet been resolved, this call + /// blocks while it is read from the [`Catalog`]. + /// + /// # Concurrency + /// + /// If this method requires resolving the [`SortKey`], N concurrent callers + /// will cause N queries against the catalog. + /// + /// # Await Safety + /// + /// Cancelling the future returned by calling [`Self::get()`] before + /// completion will leave [`Self`] without a background task. The next call + /// to [`Self::get()`] will incur a catalog query (see concurrency above). + pub(crate) async fn get(&self) -> Option { + { + let state = self.value.lock(); + + // If there is a resolved value, return it. + if let State::Resolved(v) = &*state { + return v.clone(); + } + } + + // Otherwise resolve the value immediately, aborting the background + // task. + self.handle.abort(); + let sort_key = fetch(self.partition_id, &*self.catalog, &self.backoff_config).await; + + { + let mut state = self.value.lock(); + *state = State::Resolved(sort_key.clone()); + } + + sort_key + } +} + +impl Drop for DeferredSortKey { + fn drop(&mut self) { + // Attempt to abort the background task, regardless of it having + // completed or not. 
+ self.handle.abort() + } +} + +/// Fetch the [`SortKey`] from the [`Catalog`] for `partition_id`, retrying +/// endlessly when errors occur. +async fn fetch( + partition_id: PartitionId, + catalog: &dyn Catalog, + backoff_config: &BackoffConfig, +) -> Option { + Backoff::new(backoff_config) + .retry_all_errors("fetch partition sort key", || async { + let s = catalog + .repositories() + .await + .partitions() + .get_by_id(partition_id) + .await? + .expect("resolving sort key for non-existent partition") + .sort_key(); + + Result::<_, iox_catalog::interface::Error>::Ok(s) + }) + .await + .expect("retry forever") +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::ShardIndex; + use test_helpers::timeout::FutureTimeout; + + use crate::test_util::populate_catalog; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + + // A test that (most likely) exercises the "read on demand" code path. + // + // The background task is configured to run some time between now, and + // 10,000,000 seconds in the future - it most likely doesn't get to complete + // before the get() call is issued. + // + // If this test flakes, it is POSSIBLE but UNLIKELY that the background task + // has completed and the get() call reads a pre-fetched value. 
+ #[tokio::test] + async fn test_read_demand() { + const LONG_LONG_TIME: Duration = Duration::from_secs(10_000_000); + + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetched = DeferredSortKey::new( + partition_id, + Duration::from_secs(36_000_000), + Arc::clone(&catalog), + backoff_config.clone(), + ) + .get() + .await; + assert!(fetched.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetched = DeferredSortKey::new( + partition_id, + LONG_LONG_TIME, + Arc::clone(&catalog), + backoff_config, + ) + .get() + .await; + + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } + + // A test that deterministically exercises the "background pre-fetch" code path. 
+ #[tokio::test] + async fn test_read_pre_fetched() { + let metrics = Arc::new(metric::Registry::default()); + let backoff_config = BackoffConfig::default(); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, _ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + let partition_id = catalog + .repositories() + .await + .partitions() + .create_or_get(PARTITION_KEY.into(), shard_id, table_id) + .await + .expect("should create") + .id; + + // Read the just-created sort key (None) + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. + async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + assert!(fetcher.get().await.is_none()); + + // Set the sort key + let catalog_state = catalog + .repositories() + .await + .partitions() + .update_sort_key(partition_id, &["uno", "dos", "bananas"]) + .await + .expect("should update existing partition key"); + + // Read the updated sort key + let fetcher = DeferredSortKey::new( + partition_id, + Duration::from_nanos(1), + Arc::clone(&catalog), + backoff_config.clone(), + ); + + // Spin, waiting for the background task to show as complete. 
+ async { + loop { + if fetcher.handle.is_finished() { + return; + } + + tokio::task::yield_now().await + } + } + .with_timeout_panic(Duration::from_secs(5)) + .await; + + let fetched = fetcher.get().await; + assert!(fetched.is_some()); + assert_eq!(fetched, catalog_state.sort_key()); + } +} diff --git a/ingester/src/data/partition/resolver/trait.rs b/ingester/src/data/partition/resolver/trait.rs index c18ccdf1a2..4ca50ec949 100644 --- a/ingester/src/data/partition/resolver/trait.rs +++ b/ingester/src/data/partition/resolver/trait.rs @@ -3,7 +3,7 @@ use std::{fmt::Debug, sync::Arc}; use async_trait::async_trait; use data_types::{NamespaceId, PartitionKey, ShardId, TableId}; -use crate::data::partition::PartitionData; +use crate::data::{partition::PartitionData, table::TableName}; /// An infallible resolver of [`PartitionData`] for the specified shard, table, /// and partition key, returning an initialised [`PartitionData`] buffer for it. @@ -20,7 +20,7 @@ pub trait PartitionProvider: Send + Sync + Debug { shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData; } @@ -35,7 +35,7 @@ where shard_id: ShardId, namespace_id: NamespaceId, table_id: TableId, - table_name: Arc, + table_name: TableName, ) -> PartitionData { (**self) .get_partition(partition_key, shard_id, namespace_id, table_id, table_name) @@ -49,7 +49,7 @@ mod tests { use data_types::PartitionId; - use crate::data::partition::resolver::MockPartitionProvider; + use crate::data::partition::{resolver::MockPartitionProvider, SortKeyState}; use super::*; @@ -59,7 +59,7 @@ mod tests { let shard_id = ShardId::new(42); let namespace_id = NamespaceId::new(1234); let table_id = TableId::new(24); - let table_name = "platanos".into(); + let table_name = TableName::from("platanos"); let partition = PartitionId::new(4242); let data = PartitionData::new( partition, @@ -67,22 +67,17 @@ mod tests { shard_id, namespace_id, table_id, - 
Arc::clone(&table_name), + table_name.clone(), + SortKeyState::Provided(None), None, ); let mock = Arc::new(MockPartitionProvider::default().with_partition(data)); let got = mock - .get_partition( - key, - shard_id, - namespace_id, - table_id, - Arc::clone(&table_name), - ) + .get_partition(key, shard_id, namespace_id, table_id, table_name.clone()) .await; - assert_eq!(got.id(), partition); + assert_eq!(got.partition_id(), partition); assert_eq!(got.namespace_id(), namespace_id); assert_eq!(*got.table_name(), *table_name); } diff --git a/ingester/src/data/query_dedup.rs b/ingester/src/data/query_dedup.rs deleted file mode 100644 index 199e3ae14e..0000000000 --- a/ingester/src/data/query_dedup.rs +++ /dev/null @@ -1,159 +0,0 @@ -use std::sync::Arc; - -use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; -use iox_query::{ - exec::{Executor, ExecutorType}, - QueryChunk, QueryChunkMeta, ScanPlanBuilder, -}; -use observability_deps::tracing::debug; -use snafu::{ResultExt, Snafu}; - -use crate::query::QueryableBatch; - -#[derive(Debug, Snafu)] -#[allow(missing_copy_implementations, missing_docs)] -pub enum Error { - #[snafu(display("Error creating plan for querying Ingester data to send to Querier"))] - Frontend { - source: iox_query::frontend::common::Error, - }, - - #[snafu(display("Error building logical plan for querying Ingester data to send to Querier"))] - LogicalPlan { source: DataFusionError }, - - #[snafu(display( - "Error building physical plan for querying Ingester data to send to Querier: {}", - source - ))] - PhysicalPlan { source: DataFusionError }, - - #[snafu(display( - "Error executing the query for getting Ingester data to send to Querier: {}", - source - ))] - ExecutePlan { source: DataFusionError }, -} - -/// A specialized `Error` for Ingester's Query errors -pub type Result = std::result::Result; - -/// Query a given Queryable Batch, applying selection and filters as appropriate -/// Return stream of record batches 
-pub(crate) async fn query( - executor: &Executor, - data: Arc, -) -> Result { - // Build logical plan for filtering data - // Note that this query will also apply the delete predicates that go with the QueryableBatch - - // TODO: Since we have different type of servers (router, - // ingester, compactor, and querier), we may want to add more - // types into the ExecutorType to have better log and resource - // managment - let ctx = executor.new_context(ExecutorType::Query); - - // Creates an execution plan for a scan and filter data of a single chunk - let schema = data.schema(); - let table_name = data.table_name().to_string(); - - debug!(%table_name, "Creating single chunk scan plan"); - - let logical_plan = ScanPlanBuilder::new(schema, ctx.child_ctx("scan_and_filter planning")) - .with_chunks([data as _]) - .build() - .context(FrontendSnafu)? - .plan_builder - .build() - .context(LogicalPlanSnafu)?; - - debug!(%table_name, plan=%logical_plan.display_indent_schema(), - "created single chunk scan plan"); - - // Build physical plan - let physical_plan = ctx - .create_physical_plan(&logical_plan) - .await - .context(PhysicalPlanSnafu {})?; - - // Execute the plan and return the filtered stream - let output_stream = ctx - .execute_stream(physical_plan) - .await - .context(ExecutePlanSnafu {})?; - - Ok(output_stream) -} - -#[cfg(test)] -mod tests { - use arrow_util::assert_batches_eq; - - use super::*; - use crate::test_util::{ - create_one_record_batch_with_influxtype_no_duplicates, create_tombstone, - make_queryable_batch, make_queryable_batch_with_deletes, - }; - - #[tokio::test] - async fn test_query() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - - // build queryable batch from the input batches - let batch = make_queryable_batch("test_table", 0, 1, batches); - - // query without filters - let exc = Executor::new(1); - let stream = query(&exc, 
batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: all rows and columns should be returned - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 70 | UT | 1970-01-01T00:00:00.000020Z |", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } - - #[tokio::test] - async fn test_query_with_delete() { - test_helpers::maybe_start_logging(); - - // create input data - let batches = create_one_record_batch_with_influxtype_no_duplicates().await; - let tombstones = vec![create_tombstone(1, 1, 1, 1, 0, 200000, "tag1=UT")]; - - // build queryable batch from the input batches - let batch = make_queryable_batch_with_deletes("test_table", 0, 1, batches, tombstones); - - let exc = Executor::new(1); - let stream = query(&exc, batch).await.unwrap(); - let output_batches = datafusion::physical_plan::common::collect(stream) - .await - .unwrap(); - - // verify data: - let expected = vec![ - "+-----------+------+-----------------------------+", - "| field_int | tag1 | time |", - "+-----------+------+-----------------------------+", - "| 10 | VT | 1970-01-01T00:00:00.000010Z |", - "| 1000 | WA | 1970-01-01T00:00:00.000008Z |", - "+-----------+------+-----------------------------+", - ]; - assert_batches_eq!(&expected, &output_batches); - - exc.join().await; - } -} diff --git a/ingester/src/data/shard.rs b/ingester/src/data/shard.rs index 76fa44ab8b..b01504085f 100644 --- a/ingester/src/data/shard.rs +++ b/ingester/src/data/shard.rs @@ -1,22 +1,49 @@ //! Shard level data buffer structures. 
-use std::{ - collections::{btree_map::Entry, BTreeMap}, - sync::Arc, -}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{ShardId, ShardIndex}; +use data_types::{NamespaceId, ShardId, ShardIndex}; use dml::DmlOperation; use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; use metric::U64Counter; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; use write_summary::ShardProgress; -use super::{namespace::NamespaceData, partition::resolver::PartitionProvider}; +use super::{ + namespace::{NamespaceData, NamespaceName}, + partition::resolver::PartitionProvider, +}; use crate::lifecycle::LifecycleHandle; +/// A double-referenced map where [`NamespaceData`] can be looked up by name, or +/// ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_name: HashMap>, + by_id: HashMap>, +} + +impl DoubleRef { + fn insert(&mut self, name: NamespaceName, ns: NamespaceData) -> Arc { + let id = ns.namespace_id(); + + let ns = Arc::new(ns); + self.by_name.insert(name, Arc::clone(&ns)); + self.by_id.insert(id, Arc::clone(&ns)); + ns + } + + fn by_name(&self, name: &NamespaceName) -> Option> { + self.by_name.get(name).map(Arc::clone) + } + + fn by_id(&self, id: NamespaceId) -> Option> { + self.by_id.get(&id).map(Arc::clone) + } +} + /// Data of a Shard #[derive(Debug)] pub(crate) struct ShardData { @@ -32,7 +59,7 @@ pub(crate) struct ShardData { partition_provider: Arc, // New namespaces can come in at any time so we need to be able to add new ones - namespaces: RwLock>>, + namespaces: RwLock, metrics: Arc, namespace_count: U64Counter, @@ -72,9 +99,8 @@ impl ShardData { dml_operation: DmlOperation, catalog: &Arc, lifecycle_handle: &dyn LifecycleHandle, - executor: &Executor, ) -> Result { - let namespace_data = match self.namespace(dml_operation.namespace()) { + let namespace_data = match self.namespace(&NamespaceName::from(dml_operation.namespace())) { Some(d) => d, None => { 
self.insert_namespace(dml_operation.namespace(), &**catalog) @@ -83,14 +109,24 @@ impl ShardData { }; namespace_data - .buffer_operation(dml_operation, catalog, lifecycle_handle, executor) + .buffer_operation(dml_operation, catalog, lifecycle_handle) .await } /// Gets the namespace data out of the map - pub(crate) fn namespace(&self, namespace: &str) -> Option> { + pub(crate) fn namespace(&self, namespace: &NamespaceName) -> Option> { let n = self.namespaces.read(); - n.get(namespace).cloned() + n.by_name(namespace) + } + + /// Gets the namespace data out of the map + pub(crate) fn namespace_by_id(&self, namespace_id: NamespaceId) -> Option> { + // TODO: this should be the default once IDs are pushed over the wire. + // + // At which point the map should be indexed by IDs, instead of namespace + // names. + let n = self.namespaces.read(); + n.by_id(namespace_id) } /// Retrieves the namespace from the catalog and initializes an empty buffer, or @@ -101,6 +137,8 @@ impl ShardData { catalog: &dyn Catalog, ) -> Result, super::Error> { let mut repos = catalog.repositories().await; + + let ns_name = NamespaceName::from(namespace); let namespace = repos .namespaces() .get_by_name(namespace) @@ -110,26 +148,35 @@ impl ShardData { let mut n = self.namespaces.write(); - let data = match n.entry(namespace.name) { - Entry::Vacant(v) => { - let v = v.insert(Arc::new(NamespaceData::new( - namespace.id, - self.shard_id, - Arc::clone(&self.partition_provider), - &*self.metrics, - ))); + Ok(match n.by_name(&ns_name) { + Some(v) => v, + None => { self.namespace_count.inc(1); - Arc::clone(v) - } - Entry::Occupied(v) => Arc::clone(v.get()), - }; - Ok(data) + // Insert the table and then return a ref to it. 
+ n.insert( + ns_name.clone(), + NamespaceData::new( + namespace.id, + ns_name, + self.shard_id, + Arc::clone(&self.partition_provider), + &*self.metrics, + ), + ) + } + }) } /// Return the progress of this shard pub(super) async fn progress(&self) -> ShardProgress { - let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect(); + let namespaces: Vec<_> = self + .namespaces + .read() + .by_id + .values() + .map(Arc::clone) + .collect(); let mut progress = ShardProgress::new(); @@ -144,3 +191,89 @@ impl ShardData { self.shard_index } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use data_types::{PartitionId, PartitionKey, ShardIndex}; + use metric::{Attributes, Metric}; + + use crate::{ + data::partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + lifecycle::mock_handle::MockLifecycleHandle, + test_util::{make_write_op, populate_catalog}, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + + #[tokio::test] + async fn test_shard_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PartitionId::new(0), + PartitionKey::from("banana-split"), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let shard = ShardData::new( + SHARD_INDEX, + shard_id, + partition_provider, + Arc::clone(&metrics), + ); + + // Assert the namespace does not contain the test data + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_none()); + assert!(shard.namespace_by_id(ns_id).is_none()); + + // Write some test data + shard + .buffer_operation( + DmlOperation::Write(make_write_op( + &PartitionKey::from("banana-split"), + SHARD_INDEX, + NAMESPACE_NAME, + 0, + r#"bananas,city=Medford day="sun",temp=55 22"#, + )), + &catalog, + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + + // Both forms of referencing the table should succeed + assert!(shard.namespace(&NAMESPACE_NAME.into()).is_some()); + assert!(shard.namespace_by_id(ns_id).is_some()); + + // And the table counter metric should increase + let tables = metrics + .get_instrument::>("ingester_namespaces_total") + .expect("failed to read metric") + .get_observer(&Attributes::from([])) + .expect("failed to get observer") + .fetch(); + assert_eq!(tables, 1); + } +} diff --git a/ingester/src/data/table.rs b/ingester/src/data/table.rs index 89127d04bf..8ebaa7a192 100644 --- a/ingester/src/data/table.rs +++ b/ingester/src/data/table.rs @@ -1,41 +1,94 @@ //! Table level data buffer structures. 
-use std::{collections::BTreeMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc}; -use data_types::{ - DeletePredicate, NamespaceId, PartitionKey, SequenceNumber, ShardId, TableId, Timestamp, -}; -use iox_catalog::interface::Catalog; -use iox_query::exec::Executor; +use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId}; use mutable_batch::MutableBatch; -use snafu::ResultExt; +use observability_deps::tracing::*; use write_summary::ShardProgress; -use super::partition::{ - resolver::PartitionProvider, PartitionData, PartitionStatus, UnpersistedPartitionData, -}; -use crate::lifecycle::LifecycleHandle; +use super::partition::{resolver::PartitionProvider, PartitionData, UnpersistedPartitionData}; +use crate::{lifecycle::LifecycleHandle, querier_handler::PartitionStatus}; + +/// A double-referenced map where [`PartitionData`] can be looked up by +/// [`PartitionKey`], or ID. +#[derive(Debug, Default)] +struct DoubleRef { + // TODO(4880): this can be removed when IDs are sent over the wire. + by_key: HashMap, + by_id: HashMap, +} + +impl DoubleRef { + fn insert(&mut self, ns: PartitionData) { + let id = ns.partition_id(); + let key = ns.partition_key().clone(); + + assert!(self.by_key.insert(key.clone(), ns).is_none()); + assert!(self.by_id.insert(id, key).is_none()); + } + + #[cfg(test)] + fn by_key(&self, key: &PartitionKey) -> Option<&PartitionData> { + self.by_key.get(key) + } + + fn by_key_mut(&mut self, key: &PartitionKey) -> Option<&mut PartitionData> { + self.by_key.get_mut(key) + } + + fn by_id_mut(&mut self, id: PartitionId) -> Option<&mut PartitionData> { + let key = self.by_id.get(&id)?.clone(); + self.by_key_mut(&key) + } +} + +/// The string name / identifier of a Table. +/// +/// A reference-counted, cheap clone-able string. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TableName(Arc); + +impl From for TableName +where + T: AsRef, +{ + fn from(v: T) -> Self { + Self(Arc::from(v.as_ref())) + } +} + +impl std::fmt::Display for TableName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl std::ops::Deref for TableName { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} /// Data of a Table in a given Namesapce that belongs to a given Shard #[derive(Debug)] pub(crate) struct TableData { table_id: TableId, - table_name: Arc, + table_name: TableName, /// The catalog ID of the shard & namespace this table is being populated /// from. shard_id: ShardId, namespace_id: NamespaceId, - // the max sequence number for a tombstone associated with this table - tombstone_max_sequence_number: Option, - /// An abstract constructor of [`PartitionData`] instances for a given /// `(key, shard, table)` triplet. partition_provider: Arc, - // Map pf partition key to its data - pub(super) partition_data: BTreeMap, + // Map of partition key to its data + partition_data: DoubleRef, } impl TableData { @@ -51,18 +104,16 @@ impl TableData { /// for the first time. 
pub(super) fn new( table_id: TableId, - table_name: &str, + table_name: TableName, shard_id: ShardId, namespace_id: NamespaceId, - tombstone_max_sequence_number: Option, partition_provider: Arc, ) -> Self { Self { table_id, - table_name: table_name.into(), + table_name, shard_id, namespace_id, - tombstone_max_sequence_number, partition_data: Default::default(), partition_provider, } @@ -71,18 +122,13 @@ impl TableData { /// Return parquet_max_sequence_number pub(super) fn parquet_max_sequence_number(&self) -> Option { self.partition_data + .by_key .values() .map(|p| p.max_persisted_sequence_number()) .max() .flatten() } - /// Return tombstone_max_sequence_number - #[allow(dead_code)] // Used in tests - pub(super) fn tombstone_max_sequence_number(&self) -> Option { - self.tombstone_max_sequence_number - } - // buffers the table write and returns true if the lifecycle manager indicates that // ingest should be paused. pub(super) async fn buffer_table_write( @@ -92,7 +138,7 @@ impl TableData { partition_key: PartitionKey, lifecycle_handle: &dyn LifecycleHandle, ) -> Result { - let partition_data = match self.partition_data.get_mut(&partition_key) { + let partition_data = match self.partition_data.by_key.get_mut(&partition_key) { Some(p) => p, None => { let p = self @@ -102,86 +148,87 @@ impl TableData { self.shard_id, self.namespace_id, self.table_id, - Arc::clone(&self.table_name), + self.table_name.clone(), ) .await; - // Add the partition to the map. - assert!(self - .partition_data - .insert(partition_key.clone(), p) - .is_none()); - self.partition_data.get_mut(&partition_key).unwrap() + // Add the double-referenced partition to the map. 
+ self.partition_data.insert(p); + self.partition_data.by_key_mut(&partition_key).unwrap() } }; // skip the write if it has already been persisted if let Some(max) = partition_data.max_persisted_sequence_number() { if max >= sequence_number { + trace!( + shard_id=%self.shard_id, + op_sequence_number=?sequence_number, + "skipping already-persisted write" + ); return Ok(false); } } + let size = batch.size(); + let rows = batch.rows(); + partition_data.buffer_write(sequence_number, batch)?; + + // Record the write as having been buffered. + // + // This should happen AFTER the write is applied, because buffering the + // op may fail which would lead to a write being recorded, but not + // applied. let should_pause = lifecycle_handle.log_write( - partition_data.id(), + partition_data.partition_id(), self.shard_id, self.namespace_id, self.table_id, sequence_number, - batch.size(), - batch.rows(), + size, + rows, ); - partition_data.buffer_write(sequence_number, batch)?; Ok(should_pause) } - pub(super) async fn buffer_delete( + /// Return the [`PartitionData`] for the specified ID. + #[allow(unused)] + pub(crate) fn get_partition( &mut self, - predicate: &DeletePredicate, - sequence_number: SequenceNumber, - catalog: &dyn Catalog, - executor: &Executor, - ) -> Result<(), super::Error> { - let min_time = Timestamp::new(predicate.range.start()); - let max_time = Timestamp::new(predicate.range.end()); + partition_id: PartitionId, + ) -> Option<&mut PartitionData> { + self.partition_data.by_id_mut(partition_id) + } - let mut repos = catalog.repositories().await; - let tombstone = repos - .tombstones() - .create_or_get( - self.table_id, - self.shard_id, - sequence_number, - min_time, - max_time, - &predicate.expr_sql_string(), - ) - .await - .context(super::CatalogSnafu)?; + /// Return the [`PartitionData`] for the specified partition key. 
+ #[cfg(test)] + pub(crate) fn get_partition_by_key( + &self, + partition_key: &PartitionKey, + ) -> Option<&PartitionData> { + self.partition_data.by_key(partition_key) + } - // remember "persisted" state - self.tombstone_max_sequence_number = Some(sequence_number); - - // modify one partition at a time - for data in self.partition_data.values_mut() { - data.buffer_tombstone(executor, tombstone.clone()).await; - } - - Ok(()) + /// Return the [`PartitionData`] for the specified partition key. + pub(crate) fn get_partition_by_key_mut( + &mut self, + partition_key: &PartitionKey, + ) -> Option<&mut PartitionData> { + self.partition_data.by_key_mut(partition_key) } pub(crate) fn unpersisted_partition_data(&self) -> Vec { self.partition_data + .by_key .values() .map(|p| UnpersistedPartitionData { - partition_id: p.id(), + partition_id: p.partition_id(), non_persisted: p .get_non_persisting_data() .expect("get_non_persisting should always work"), persisting: p.get_persisting_data(), partition_status: PartitionStatus { parquet_max_sequence_number: p.max_persisted_sequence_number(), - tombstone_max_sequence_number: self.tombstone_max_sequence_number, }, }) .collect() @@ -196,14 +243,223 @@ impl TableData { }; self.partition_data + .by_key .values() .fold(progress, |progress, partition_data| { progress.combine(partition_data.progress()) }) } - #[cfg(test)] + /// Returns the table ID for this partition. pub(super) fn table_id(&self) -> TableId { self.table_id } + + /// Returns the name of this table. 
+ pub(crate) fn table_name(&self) -> &TableName { + &self.table_name + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use assert_matches::assert_matches; + use data_types::{PartitionId, ShardIndex}; + use iox_catalog::interface::Catalog; + use mutable_batch::writer; + use mutable_batch_lp::lines_to_batches; + use schema::{InfluxColumnType, InfluxFieldType}; + + use crate::{ + data::{ + partition::{resolver::MockPartitionProvider, PartitionData, SortKeyState}, + Error, + }, + lifecycle::mock_handle::{MockLifecycleCall, MockLifecycleHandle}, + test_util::populate_catalog, + }; + + use super::*; + + const SHARD_INDEX: ShardIndex = ShardIndex::new(24); + const TABLE_NAME: &str = "bananas"; + const NAMESPACE_NAME: &str = "platanos"; + const PARTITION_KEY: &str = "platanos"; + const PARTITION_ID: PartitionId = PartitionId::new(0); + + #[tokio::test] + async fn test_partition_double_ref() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME.into(), + shard_id, + ns_id, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &MockLifecycleHandle::default(), + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); + assert!(table.partition_data.by_id_mut(PARTITION_ID).is_some()); + } + + #[tokio::test] + async fn test_bad_write_memory_counting() { + let metrics = Arc::new(metric::Registry::default()); + let catalog: Arc = + Arc::new(iox_catalog::mem::MemCatalog::new(Arc::clone(&metrics))); + + // Populate the catalog with the shard / namespace / table + let (shard_id, ns_id, table_id) = + populate_catalog(&*catalog, SHARD_INDEX, NAMESPACE_NAME, TABLE_NAME).await; + + // Configure the mock partition provider to return a partition for this + // table ID. 
+ let partition_provider = Arc::new(MockPartitionProvider::default().with_partition( + PartitionData::new( + PARTITION_ID, + PARTITION_KEY.into(), + shard_id, + ns_id, + table_id, + TABLE_NAME.into(), + SortKeyState::Provided(None), + None, + ), + )); + + let mut table = TableData::new( + table_id, + TABLE_NAME.into(), + shard_id, + ns_id, + partition_provider, + ); + + let batch = lines_to_batches(r#"bananas,bat=man value=24 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + // Initialise the mock lifecycle handle and use it to inspect the calls + // made to the lifecycle manager during buffering. + let handle = MockLifecycleHandle::default(); + + // Assert the table does not contain the test partition + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_none()); + + // Write some test data + let pause = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect("buffer op should succeed"); + assert!(!pause); + + // Referencing the partition should succeed + assert!(table.partition_data.by_key(&PARTITION_KEY.into()).is_some()); + + // And the lifecycle handle was called with the expected values + assert_eq!( + handle.get_log_calls(), + &[MockLifecycleCall { + partition_id: PARTITION_ID, + shard_id, + namespace_id: ns_id, + table_id, + sequence_number: SequenceNumber::new(42), + bytes_written: 1131, + rows_written: 1, + }] + ); + + // Attempt to buffer the second op that contains a type conflict - this + // should return an error, and not make a call to the lifecycle handle + // (as no data was buffered) + // + // Note the type of value was numeric previously, and here it is a string. 
+ let batch = lines_to_batches(r#"bananas,bat=man value="platanos" 42"#, 0) + .unwrap() + .remove(TABLE_NAME) + .unwrap(); + + let err = table + .buffer_table_write( + SequenceNumber::new(42), + batch, + PARTITION_KEY.into(), + &handle, + ) + .await + .expect_err("type conflict should error"); + + // The buffer op should return a column type error + assert_matches!( + err, + Error::BufferWrite { + source: mutable_batch::Error::WriterError { + source: writer::Error::TypeMismatch { + existing: InfluxColumnType::Field(InfluxFieldType::Float), + inserted: InfluxColumnType::Field(InfluxFieldType::String), + column: col_name, + } + }, + } => { assert_eq!(col_name, "value") } + ); + + // And the lifecycle handle should not be called. + // + // It still contains the first call, so the desired length is 1 + // indicating no second call was made. + assert_eq!(handle.get_log_calls().len(), 1); + } } diff --git a/ingester/src/handler.rs b/ingester/src/handler.rs index dde159dc52..981a43cd57 100644 --- a/ingester/src/handler.rs +++ b/ingester/src/handler.rs @@ -30,17 +30,24 @@ use crate::{ data::{ partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider}, shard::ShardData, - IngesterData, IngesterQueryResponse, + IngesterData, }, lifecycle::{run_lifecycle_manager, LifecycleConfig, LifecycleManager}, poison::PoisonCabinet, - querier_handler::prepare_data_to_querier, + querier_handler::{prepare_data_to_querier, IngesterQueryResponse}, stream_handler::{ handler::SequencedStreamHandler, sink_adaptor::IngestSinkAdaptor, sink_instrumentation::SinkInstrumentation, PeriodicWatermarkFetcher, }, }; +/// The maximum duration of time between creating a [`PartitionData`] and its +/// [`SortKey`] being fetched from the catalog. 
+/// +/// [`PartitionData`]: crate::data::partition::PartitionData +/// [`SortKey`]: schema::sort::SortKey +const SORT_KEY_PRE_FETCH: Duration = Duration::from_secs(30); + #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] pub enum Error { @@ -160,7 +167,13 @@ impl IngestHandlerImpl { // Build the partition provider. let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog)); - let partition_provider = PartitionCache::new(partition_provider, recent_partitions); + let partition_provider = PartitionCache::new( + partition_provider, + recent_partitions, + SORT_KEY_PRE_FETCH, + Arc::clone(&catalog), + BackoffConfig::default(), + ); let partition_provider: Arc = Arc::new(partition_provider); // build the initial ingester data state @@ -432,7 +445,7 @@ mod tests { use write_buffer::mock::{MockBufferForReading, MockBufferSharedState}; use super::*; - use crate::data::partition::SnapshotBatch; + use crate::data::{partition::SnapshotBatch, table::TableName}; #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { @@ -499,13 +512,16 @@ mod tests { // give the writes some time to go through the buffer. Exit once we've verified there's // data in there from both writes. tokio::time::timeout(Duration::from_secs(2), async { + let ns_name = ingester.namespace.name.into(); + let table_name = TableName::from("a"); loop { let mut has_measurement = false; if let Some(data) = ingester.ingester.data.shard(ingester.shard.id) { - if let Some(data) = data.namespace(&ingester.namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("a", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + { if let Some(b) = b.first() { if b.data.num_rows() > 0 { has_measurement = true; @@ -740,13 +756,16 @@ mod tests { // give the writes some time to go through the buffer. 
Exit once we've verified there's // data in there tokio::time::timeout(Duration::from_secs(1), async move { + let ns_name = namespace.name.into(); + let table_name = TableName::from("cpu"); loop { let mut has_measurement = false; if let Some(data) = ingester.data.shard(shard.id) { - if let Some(data) = data.namespace(&namespace.name) { + if let Some(data) = data.namespace(&ns_name) { // verify there's data in the buffer - if let Some((b, _)) = data.snapshot("cpu", &"1970-01-01".into()).await { + if let Some((b, _)) = data.snapshot(&table_name, &"1970-01-01".into()).await + { if let Some(b) = b.first() { custom_batch_verification(b); diff --git a/ingester/src/lifecycle.rs b/ingester/src/lifecycle.rs index b46b84dde7..d15389ed60 100644 --- a/ingester/src/lifecycle.rs +++ b/ingester/src/lifecycle.rs @@ -12,7 +12,7 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration}; use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; use iox_time::{Time, TimeProvider}; use metric::{Metric, U64Counter}; -use observability_deps::tracing::{error, info, warn}; +use observability_deps::tracing::{error, info, trace, warn}; use parking_lot::Mutex; use tokio_util::sync::CancellationToken; use tracker::TrackedFutureExt; @@ -97,6 +97,18 @@ impl LifecycleHandle for LifecycleHandleImpl { stats.last_write = now; stats.rows_written += rows_written; + trace!( + shard_id=%stats.shard_id, + partition_id=%stats.partition_id, + namespace_id=%stats.namespace_id, + table_id=%stats.table_id, + first_write=%stats.first_write, + last_write=%stats.last_write, + bytes_written=%stats.bytes_written, + first_sequence_number=?stats.first_sequence_number, + "logged write" + ); + s.total_bytes += bytes_written; // Pause if the server has exceeded the configured memory limit. 
@@ -234,7 +246,7 @@ struct LifecycleStats { } /// The stats for a partition -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] struct PartitionLifecycleStats { /// The shard this partition is under shard_id: ShardId, @@ -469,6 +481,18 @@ impl LifecycleManager { let persist_tasks: Vec<_> = to_persist .into_iter() .map(|s| { + // BUG: TOCTOU: memory usage released may be incorrect. + // + // Here the amount of memory to be reduced is acquired, but this + // code does not prevent continued writes adding more data to + // the partition in another thread. + // + // This may lead to more actual data being persisted than the + // call below returns to the server pool - this would slowly + // starve the ingester of memory it thinks it has. + // + // See https://github.com/influxdata/influxdb_iox/issues/5777 + // Mark this partition as being persisted, and remember the // memory allocation it had accumulated. let partition_memory_usage = self @@ -483,7 +507,9 @@ impl LifecycleManager { let state = Arc::clone(&self.state); tokio::task::spawn(async move { - persister.persist(s.partition_id).await; + persister + .persist(s.shard_id, s.namespace_id, s.table_id, s.partition_id) + .await; // Now the data has been uploaded and the memory it was // using has been freed, released the memory capacity back // the ingester. 
@@ -524,6 +550,12 @@ impl LifecycleManager { .map(|s| s.first_sequence_number) .min() .unwrap_or(sequence_number); + trace!( + min_unpersisted_sequence_number=?min, + shard_id=%shard_id, + sequence_number=?sequence_number, + "updated min_unpersisted_sequence_number for persisted shard" + ); persister .update_min_unpersisted_sequence_number(shard_id, min) .await; @@ -602,7 +634,13 @@ mod tests { #[async_trait] impl Persister for TestPersister { - async fn persist(&self, partition_id: PartitionId) { + async fn persist( + &self, + _shard_id: ShardId, + _namespace_id: NamespaceId, + _table_id: TableId, + partition_id: PartitionId, + ) { let mut p = self.persist_called.lock(); p.insert(partition_id); } @@ -662,8 +700,16 @@ mod tests { #[async_trait] impl Persister for PausablePersister { - async fn persist(&self, partition_id: PartitionId) { - self.inner.persist(partition_id).await; + async fn persist( + &self, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + ) { + self.inner + .persist(shard_id, namespace_id, table_id, partition_id) + .await; if let Some(event) = self.event(partition_id) { event.before.wait().await; event.after.wait().await; diff --git a/ingester/src/lifecycle/mock_handle.rs b/ingester/src/lifecycle/mock_handle.rs index d5b889c4af..bec4af5ce0 100644 --- a/ingester/src/lifecycle/mock_handle.rs +++ b/ingester/src/lifecycle/mock_handle.rs @@ -1,26 +1,66 @@ //! A mock [`LifecycleHandle`] impl for testing. +use std::sync::Arc; + use data_types::{NamespaceId, PartitionId, SequenceNumber, ShardId, TableId}; +use parking_lot::Mutex; use super::LifecycleHandle; -/// Special [`LifecycleHandle`] that never persists and always accepts more data. -/// -/// This is useful to control persists manually. -#[derive(Debug, Default, Clone, Copy)] -pub struct NoopLifecycleHandle; +/// A set of arguments captured from a call to +/// [`MockLifecycleHandle::log_write()`]. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(missing_docs)] +pub struct MockLifecycleCall { + pub partition_id: PartitionId, + pub shard_id: ShardId, + pub namespace_id: NamespaceId, + pub table_id: TableId, + pub sequence_number: SequenceNumber, + pub bytes_written: usize, + pub rows_written: usize, +} -impl LifecycleHandle for NoopLifecycleHandle { +/// A mock [`LifecycleHandle`] implementation that records calls made to +/// [`Self::log_write()`] and never blocks ingest, always accepting more data. +/// +/// # Cloning +/// +/// Cloning a [`MockLifecycleHandle`] will clone the inner state - calls to all +/// cloned instances are reported in a call to [`Self::get_log_calls()`]. +#[derive(Debug, Default, Clone)] +pub struct MockLifecycleHandle { + log_calls: Arc>>, +} + +impl MockLifecycleHandle { + /// Returns the ordered [`Self::log_write()`] calls made to this mock. + pub fn get_log_calls(&self) -> Vec { + self.log_calls.lock().clone() + } +} + +impl LifecycleHandle for MockLifecycleHandle { fn log_write( &self, - _partition_id: PartitionId, - _shard_id: ShardId, - _namespace_id: NamespaceId, - _table_id: TableId, - _sequence_number: SequenceNumber, - _bytes_written: usize, - _rows_written: usize, + partition_id: PartitionId, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + sequence_number: SequenceNumber, + bytes_written: usize, + rows_written: usize, ) -> bool { + self.log_calls.lock().push(MockLifecycleCall { + partition_id, + shard_id, + namespace_id, + table_id, + sequence_number, + bytes_written, + rows_written, + }); + // do NOT pause ingest false } diff --git a/ingester/src/querier_handler.rs b/ingester/src/querier_handler.rs index d3c8e37e19..59996f94cf 100644 --- a/ingester/src/querier_handler.rs +++ b/ingester/src/querier_handler.rs @@ -1,10 +1,13 @@ //! 
Handle all requests from Querier -use std::sync::Arc; +use std::{pin::Pin, sync::Arc}; +use arrow::{error::ArrowError, record_batch::RecordBatch}; +use arrow_util::optimize::{optimize_record_batch, optimize_schema}; +use data_types::{PartitionId, SequenceNumber}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; -use futures::StreamExt; +use futures::{Stream, StreamExt}; use generated_types::ingester::IngesterQueryRequest; use observability_deps::tracing::debug; use schema::selection::Selection; @@ -12,8 +15,8 @@ use snafu::{ensure, Snafu}; use crate::{ data::{ - partition::UnpersistedPartitionData, IngesterData, IngesterQueryPartition, - IngesterQueryResponse, + namespace::NamespaceName, partition::UnpersistedPartitionData, table::TableName, + IngesterData, }, query::QueryableBatch, }; @@ -47,6 +50,159 @@ pub enum Error { /// A specialized `Error` for Ingester's Query errors pub type Result = std::result::Result; +/// Stream of snapshots. +/// +/// Every snapshot is a dedicated [`SendableRecordBatchStream`]. +pub(crate) type SnapshotStream = + Pin> + Send>>; + +/// Status of a partition that has unpersisted data. +/// +/// Note that this structure is specific to a partition (which itself is bound to a table and +/// shard)! +#[derive(Debug, Clone, PartialEq, Eq)] +#[allow(missing_copy_implementations)] +pub struct PartitionStatus { + /// Max sequence number persisted + pub parquet_max_sequence_number: Option, +} + +/// Response data for a single partition. +pub(crate) struct IngesterQueryPartition { + /// Stream of snapshots. + snapshots: SnapshotStream, + + /// Partition ID. + id: PartitionId, + + /// Partition persistence status. 
+ status: PartitionStatus, +} + +impl std::fmt::Debug for IngesterQueryPartition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryPartition") + .field("snapshots", &"") + .field("id", &self.id) + .field("status", &self.status) + .finish() + } +} + +impl IngesterQueryPartition { + pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self { + Self { + snapshots, + id, + status, + } + } +} + +/// Stream of partitions in this response. +pub(crate) type IngesterQueryPartitionStream = + Pin> + Send>>; + +/// Response streams for querier<>ingester requests. +/// +/// The data structure is constructed to allow lazy/streaming data generation. For easier +/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method. +pub struct IngesterQueryResponse { + /// Stream of partitions. + partitions: IngesterQueryPartitionStream, +} + +impl std::fmt::Debug for IngesterQueryResponse { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IngesterQueryResponse") + .field("partitions", &"") + .finish() + } +} + +impl IngesterQueryResponse { + /// Make a response + pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self { + Self { partitions } + } + + /// Flattens the data according to the wire protocol. 
+ pub fn flatten(self) -> FlatIngesterQueryResponseStream { + self.partitions + .flat_map(|partition_res| match partition_res { + Ok(partition) => { + let head = futures::stream::once(async move { + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: partition.id, + status: partition.status, + }) + }); + let tail = partition + .snapshots + .flat_map(|snapshot_res| match snapshot_res { + Ok(snapshot) => { + let schema = Arc::new(optimize_schema(&snapshot.schema())); + + let schema_captured = Arc::clone(&schema); + let head = futures::stream::once(async { + Ok(FlatIngesterQueryResponse::StartSnapshot { + schema: schema_captured, + }) + }); + + let tail = snapshot.map(move |batch_res| match batch_res { + Ok(batch) => Ok(FlatIngesterQueryResponse::RecordBatch { + batch: optimize_record_batch(&batch, Arc::clone(&schema))?, + }), + Err(e) => Err(e), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }); + + head.chain(tail).boxed() + } + Err(e) => futures::stream::once(async { Err(e) }).boxed(), + }) + .boxed() + } +} + +/// Flattened version of [`IngesterQueryResponse`]. +pub(crate) type FlatIngesterQueryResponseStream = + Pin> + Send>>; + +/// Element within the flat wire protocol. +#[derive(Debug, PartialEq)] +pub enum FlatIngesterQueryResponse { + /// Start a new partition. + StartPartition { + /// Partition ID. + partition_id: PartitionId, + + /// Partition persistence status. + status: PartitionStatus, + }, + + /// Start a new snapshot. + /// + /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition) + /// message. + StartSnapshot { + /// Snapshot schema. + schema: Arc, + }, + + /// Add a record batch to the snapshot that was announced by the last + /// [`StartSnapshot`](Self::StartSnapshot) message. + RecordBatch { + /// Record batch. 
+ batch: RecordBatch, + }, +} + /// Return data to send as a response back to the Querier per its request pub async fn prepare_data_to_querier( ingest_data: &Arc, @@ -57,7 +213,8 @@ pub async fn prepare_data_to_querier( let mut found_namespace = false; for (shard_id, shard_data) in ingest_data.shards() { debug!(shard_id=%shard_id.get()); - let namespace_data = match shard_data.namespace(&request.namespace) { + let namespace_name = NamespaceName::from(&request.namespace); + let namespace_data = match shard_data.namespace(&namespace_name) { Some(namespace_data) => { debug!(namespace=%request.namespace, "found namespace"); found_namespace = true; @@ -68,7 +225,8 @@ pub async fn prepare_data_to_querier( } }; - let table_data = match namespace_data.table_data(&request.table) { + let table_name = TableName::from(&request.table); + let table_data = match namespace_data.table_data(&table_name) { Some(table_data) => { debug!(table_name=%request.table, "found table"); table_data @@ -153,7 +311,6 @@ fn prepare_data_to_querier_for_partition( request.table.clone().into(), unpersisted_partition_data.partition_id, vec![], - vec![], ) }) .with_data(unpersisted_partition_data.non_persisted); @@ -188,22 +345,106 @@ fn prepare_data_to_querier_for_partition( #[cfg(test)] mod tests { - use arrow::{array::new_null_array, record_batch::RecordBatch}; + use std::task::{Context, Poll}; + + use arrow::{array::new_null_array, datatypes::SchemaRef, record_batch::RecordBatch}; use arrow_util::assert_batches_sorted_eq; use assert_matches::assert_matches; - use datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + physical_plan::RecordBatchStream, + }; use futures::TryStreamExt; + use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use predicate::Predicate; use schema::merge::SchemaMerger; use super::*; - use crate::{ - data::FlatIngesterQueryResponse, - test_util::{ - make_ingester_data, make_ingester_data_with_tombstones, DataLocation, TEST_NAMESPACE, - 
TEST_TABLE, - }, - }; + use crate::test_util::{make_ingester_data, DataLocation, TEST_NAMESPACE, TEST_TABLE}; + + #[tokio::test] + async fn test_ingester_query_response_flatten() { + let batch_1_1 = lp_to_batch("table x=1 0"); + let batch_1_2 = lp_to_batch("table x=2 1"); + let batch_2 = lp_to_batch("table y=1 10"); + let batch_3 = lp_to_batch("table z=1 10"); + + let schema_1 = batch_1_1.schema(); + let schema_2 = batch_2.schema(); + let schema_3 = batch_3.schema(); + + let response = IngesterQueryResponse::new(Box::pin(futures::stream::iter([ + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([ + Ok(Box::pin(TestRecordBatchStream::new( + vec![ + Ok(batch_1_1.clone()), + Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), + Ok(batch_1_2.clone()), + ], + Arc::clone(&schema_1), + )) as _), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + Ok(Box::pin(TestRecordBatchStream::new( + vec![Ok(batch_2.clone())], + Arc::clone(&schema_2), + )) as _), + Ok(Box::pin(TestRecordBatchStream::new(vec![], Arc::clone(&schema_3))) as _), + ])), + PartitionId::new(2), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + Err(ArrowError::IoError("some io error".into())), + Ok(IngesterQueryPartition::new( + Box::pin(futures::stream::iter([])), + PartitionId::new(1), + PartitionStatus { + parquet_max_sequence_number: None, + }, + )), + ]))); + + let actual: Vec<_> = response.flatten().collect().await; + let expected = vec![ + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(2), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_1 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_1 }), + Err(ArrowError::NotYetImplemented("not yet implemeneted".into())), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_1_2 }), + Err(ArrowError::InvalidArgumentError("invalid arg".into())), + 
Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_2 }), + Ok(FlatIngesterQueryResponse::RecordBatch { batch: batch_2 }), + Ok(FlatIngesterQueryResponse::StartSnapshot { schema: schema_3 }), + Err(ArrowError::IoError("some io error".into())), + Ok(FlatIngesterQueryResponse::StartPartition { + partition_id: PartitionId::new(1), + status: PartitionStatus { + parquet_max_sequence_number: None, + }, + }), + ]; + + assert_eq!(actual.len(), expected.len()); + for (actual, expected) in actual.into_iter().zip(expected) { + match (actual, expected) { + (Ok(actual), Ok(expected)) => { + assert_eq!(actual, expected); + } + (Err(_), Err(_)) => { + // cannot compare `ArrowError`, but it's unlikely that someone changed the error + } + (Ok(_), Err(_)) => panic!("Actual is Ok but expected is Err"), + (Err(_), Ok(_)) => panic!("Actual is Err but expected is Ok"), + } + } + } #[tokio::test] async fn test_prepare_data_to_querier() { @@ -360,180 +601,44 @@ mod tests { } } - #[tokio::test] - async fn test_prepare_data_to_querier_with_tombstones() { - test_helpers::maybe_start_logging(); + pub struct TestRecordBatchStream { + schema: SchemaRef, + batches: Vec>, + } - // make 7 scenarios for ingester data with tombstones - let mut scenarios = vec![]; - for loc in &[ - DataLocation::BUFFER, - DataLocation::BUFFER_SNAPSHOT, - DataLocation::BUFFER_PERSISTING, - DataLocation::BUFFER_SNAPSHOT_PERSISTING, - DataLocation::SNAPSHOT, - DataLocation::SNAPSHOT_PERSISTING, - DataLocation::PERSISTING, - ] { - let scenario = Arc::new(make_ingester_data_with_tombstones(*loc).await); - scenarios.push((loc, scenario)); + impl TestRecordBatchStream { + pub fn new(batches: Vec>, schema: SchemaRef) -> Self { + Self { schema, batches } } + } - // read data from all scenarios without any filters - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec![], - None, - )); - let expected_not_persisting = vec![ - 
"+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+-----+------+--------------------------------+", - "| city | day | temp | time |", - "+------------+-----+------+--------------------------------+", - "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", - "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", - "+------------+-----+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting + impl RecordBatchStream for TestRecordBatchStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } + + impl futures::Stream for TestRecordBatchStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + _: &mut Context<'_>, + ) -> Poll> { + 
if self.batches.is_empty() { + Poll::Ready(None) } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); + Poll::Ready(Some(self.batches.remove(0))) + } } - // read data from all scenarios and filter out column day - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - None, - )); - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. 
- let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); + fn size_hint(&self) -> (usize, Option) { + (self.batches.len(), Some(self.batches.len())) } + } - // read data from all scenarios, filter out column day, city Medford, time outside range [0, 42) - let expr = col("city").not_eq(lit("Medford")); - let pred = Predicate::default().with_expr(expr).with_range(0, 42); - let request = Arc::new(IngesterQueryRequest::new( - TEST_NAMESPACE.to_string(), - TEST_TABLE.to_string(), - vec!["city".to_string(), "temp".to_string(), "time".to_string()], - Some(pred), - )); - // predicates and de-dup are NOT applied!, otherwise this would look like this: - // let expected = vec![ - // "+------------+------+--------------------------------+", - // "| city | temp | time |", - // "+------------+------+--------------------------------+", - // "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - // "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - // "| Wilmington | | 
1970-01-01T00:00:00.000000035Z |", - // "+------------+------+--------------------------------+", - // ]; - let expected_not_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - // For "persisting" data locations the tombstones were NOT applied because they arrived AFTER the data - // transitioned into the "persisting" state. In this case, the ingester will apply the tombstones. - let expected_persisting = vec![ - "+------------+------+--------------------------------+", - "| city | temp | time |", - "+------------+------+--------------------------------+", - "| Andover | | 1970-01-01T00:00:00.000000046Z |", - "| Andover | 56 | 1970-01-01T00:00:00.000000030Z |", - "| Boston | | 1970-01-01T00:00:00.000000038Z |", - "| Boston | 60 | 1970-01-01T00:00:00.000000036Z |", - "| Medford | | 1970-01-01T00:00:00.000000026Z |", - "| Medford | 55 | 1970-01-01T00:00:00.000000022Z |", - "| Reading | 58 | 1970-01-01T00:00:00.000000040Z |", - "| Wilmington | | 1970-01-01T00:00:00.000000035Z |", - "+------------+------+--------------------------------+", - ]; - for (loc, scenario) in &scenarios { - println!("Location: {loc:?}"); - let expected = if loc.intersects(DataLocation::PERSISTING) { - &expected_persisting - } else { - &expected_not_persisting - }; - - let stream = prepare_data_to_querier(scenario, &request).await.unwrap(); - let result = ingester_response_to_record_batches(stream).await; - assert_batches_sorted_eq!(expected, &result); - } + fn lp_to_batch(lp: &str) -> RecordBatch 
{ + lp_to_mutable_batch(lp).1.to_arrow(Selection::All).unwrap() } /// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es. diff --git a/ingester/src/query.rs b/ingester/src/query.rs index 747ff4666c..dc38001e4f 100644 --- a/ingester/src/query.rs +++ b/ingester/src/query.rs @@ -6,26 +6,26 @@ use arrow::record_batch::RecordBatch; use arrow_util::util::ensure_schema; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, SequenceNumber, TableSummary, - TimestampMinMax, Tombstone, + TimestampMinMax, }; -use datafusion::physical_plan::{ - common::SizedRecordBatchStream, - metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, - SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + common::SizedRecordBatchStream, + metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}, + SendableRecordBatchStream, + }, }; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::trace; -use predicate::{ - delete_predicate::{tombstones_to_delete_predicates, tombstones_to_delete_predicates_iter}, - Predicate, -}; +use predicate::Predicate; use schema::{merge::merge_record_batch_schemas, selection::Selection, sort::SortKey, Schema}; use snafu::{ResultExt, Snafu}; -use crate::data::partition::SnapshotBatch; +use crate::data::{partition::SnapshotBatch, table::TableName}; #[allow(clippy::enum_variant_names)] #[derive(Debug, Snafu)] @@ -53,11 +53,8 @@ pub(crate) struct QueryableBatch { /// data pub(crate) data: Vec>, - /// Delete predicates of the tombstones - pub(crate) delete_predicates: Vec>, - /// This is needed to return a reference for a trait function - pub(crate) table_name: Arc, + pub(crate) table_name: TableName, /// Partition ID pub(crate) partition_id: PartitionId, @@ -66,15 +63,12 @@ pub(crate) struct QueryableBatch { impl QueryableBatch { /// Initilaize a QueryableBatch pub(crate) fn new( - 
table_name: Arc, + table_name: TableName, partition_id: PartitionId, data: Vec>, - deletes: Vec, ) -> Self { - let delete_predicates = tombstones_to_delete_predicates(&deletes); Self { data, - delete_predicates, table_name, partition_id, } @@ -86,12 +80,6 @@ impl QueryableBatch { self } - /// Add more tombstones - pub(crate) fn add_tombstones(&mut self, deletes: &[Tombstone]) { - let delete_predicates = tombstones_to_delete_predicates_iter(deletes); - self.delete_predicates.extend(delete_predicates); - } - /// return min and max of all the snapshots pub(crate) fn min_max_sequence_numbers(&self) -> (SequenceNumber, SequenceNumber) { let min = self @@ -110,11 +98,6 @@ impl QueryableBatch { (min, max) } - - /// return true if it has no data - pub(crate) fn is_empty(&self) -> bool { - self.data.is_empty() - } } impl QueryChunkMeta for QueryableBatch { @@ -144,16 +127,16 @@ impl QueryChunkMeta for QueryableBatch { None // Ingester data is not sorted } - fn delete_predicates(&self) -> &[Arc] { - self.delete_predicates.as_ref() - } - fn timestamp_min_max(&self) -> Option { // Note: we need to consider which option we want to go with // . Return None here and avoid taking time to compute time's min max of RecordBacthes (current choice) // . 
Compute time's min max here and avoid compacting non-overlapped QueryableBatches in the Ingester None } + + fn delete_predicates(&self) -> &[Arc] { + &[] + } } impl QueryChunk for QueryableBatch { @@ -185,7 +168,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -199,7 +182,7 @@ impl QueryChunk for QueryableBatch { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { Ok(None) } @@ -210,12 +193,16 @@ impl QueryChunk for QueryableBatch { mut ctx: IOxSessionContext, _predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { ctx.set_metadata("storage", "ingester"); ctx.set_metadata("projection", format!("{}", selection)); trace!(?selection, "selection"); - let schema = self.schema().select(selection).context(SchemaSnafu)?; + let schema = self + .schema() + .select(selection) + .context(SchemaSnafu) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Get all record batches from their snapshots let batches = self @@ -234,7 +221,8 @@ impl QueryChunk for QueryableBatch { .map(Arc::new); Some(batch) }) - .collect::, _>>()?; + .collect::, _>>() + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Return stream of data let dummy_metrics = ExecutionPlanMetricsSet::new(); @@ -257,165 +245,3 @@ impl QueryChunk for QueryableBatch { self } } - -#[cfg(test)] -mod tests { - use arrow::{ - array::{ - ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray, - TimestampNanosecondArray, UInt64Array, - }, - datatypes::{DataType, Int32Type, TimeUnit}, - }; - use data_types::{DeleteExpr, Op, Scalar, TimestampRange}; - - use super::*; - use crate::test_util::create_tombstone; - - #[tokio::test] - async fn test_merge_batch_schema() { - // Merge schema of the batches - // The fields in the schema are 
sorted by column name - let batches = create_batches(); - let merged_schema = (*merge_record_batch_schemas(&batches)).clone(); - - // Expected Arrow schema - let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![ - arrow::datatypes::Field::new( - "dict", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - arrow::datatypes::Field::new("int64", DataType::Int64, true), - arrow::datatypes::Field::new("string", DataType::Utf8, true), - arrow::datatypes::Field::new("bool", DataType::Boolean, true), - arrow::datatypes::Field::new( - "time", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - arrow::datatypes::Field::new("uint64", DataType::UInt64, false), - arrow::datatypes::Field::new("float64", DataType::Float64, true), - ])); - let expected_schema = Schema::try_from(arrow_schema) - .unwrap() - .sort_fields_by_name(); - - assert_eq!( - expected_schema, merged_schema, - "\nExpected:\n{:#?}\nActual:\n{:#?}", - expected_schema, merged_schema - ); - } - - #[tokio::test] - async fn test_tombstones_to_delete_predicates() { - // create tombstones - let tombstones = vec![ - create_tombstone(1, 1, 1, 1, 100, 200, "temp=10"), - create_tombstone(1, 1, 1, 2, 100, 350, "temp!=10 and city=Boston"), - ]; - - // This new queryable batch will convert tombstone to delete predicates - let query_batch = - QueryableBatch::new("test_table".into(), PartitionId::new(0), vec![], tombstones); - let predicates = query_batch.delete_predicates(); - let expected = vec![ - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 200), - exprs: vec![DeleteExpr { - column: String::from("temp"), - op: Op::Eq, - scalar: Scalar::I64(10), - }], - }), - Arc::new(DeletePredicate { - range: TimestampRange::new(100, 350), - exprs: vec![ - DeleteExpr { - column: String::from("temp"), - op: Op::Ne, - scalar: Scalar::I64(10), - }, - DeleteExpr { - column: String::from("city"), - op: Op::Eq, - scalar: Scalar::String(String::from(r#"Boston"#)), - 
}, - ], - }), - ]; - - assert_eq!(expected, predicates); - } - - // ---------------------------------------------------------------------------------------------- - // Data for testing - - // Create pure RecordBatches without knowledge of Influx datatype - fn create_batches() -> Vec> { - // Batch 1: & 3 rows - let dict_array: ArrayRef = Arc::new( - vec![Some("a"), None, Some("b")] - .into_iter() - .collect::>(), - ); - let int64_array: ArrayRef = - Arc::new([Some(-1), None, Some(2)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("and"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new( - [Some(true), None, Some(false)] - .iter() - .collect::(), - ); - let ts_array: ArrayRef = Arc::new( - [Some(150), Some(200), Some(1526823730000000000)] - .iter() - .collect::(), - ); - let batch1 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("int64", int64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - ]) - .unwrap(); - - // Batch 2: & 2 rows - let dict_array: ArrayRef = Arc::new( - vec![None, Some("d")] - .into_iter() - .collect::>(), - ); - let uint64_array: ArrayRef = Arc::new([Some(1), Some(2)].iter().collect::()); // not null - let float64_array: ArrayRef = - Arc::new([Some(1.0), Some(2.0)].iter().collect::()); - let string_array: ArrayRef = Arc::new( - vec![Some("foo"), Some("bar")] - .into_iter() - .collect::(), - ); - let bool_array: ArrayRef = Arc::new([Some(true), None].iter().collect::()); - let ts_array: ArrayRef = Arc::new( - [Some(100), Some(1626823730000000000)] // not null - .iter() - .collect::(), - ); - let batch2 = RecordBatch::try_from_iter_with_nullable(vec![ - ("dict", dict_array, true), - ("uint64", uint64_array, false), // not null - ("float64", float64_array, true), - ("string", string_array, true), - ("bool", bool_array, true), - ("time", ts_array, false), // not null - 
]) - .unwrap(); - - vec![Arc::new(batch1), Arc::new(batch2)] - } -} diff --git a/ingester/src/server/grpc.rs b/ingester/src/server/grpc.rs index 4f06a93a46..3bf785843d 100644 --- a/ingester/src/server/grpc.rs +++ b/ingester/src/server/grpc.rs @@ -30,8 +30,8 @@ use trace::ctx::SpanContext; use write_summary::WriteSummary; use crate::{ - data::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, handler::IngestHandler, + querier_handler::{FlatIngesterQueryResponse, FlatIngesterQueryResponseStream}, }; /// This type is responsible for managing all gRPC services exposed by @@ -410,9 +410,6 @@ impl Stream for GetStream { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }; prost::Message::encode(&app_metadata, &mut bytes) @@ -467,8 +464,9 @@ mod tests { use mutable_batch_lp::test_helpers::lp_to_mutable_batch; use schema::selection::Selection; + use crate::querier_handler::PartitionStatus; + use super::*; - use crate::data::partition::PartitionStatus; #[tokio::test] async fn test_get_stream_empty() { @@ -489,7 +487,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Ok(FlatIngesterQueryResponse::StartSnapshot { schema }), @@ -502,7 +499,6 @@ mod tests { partition_id: 1, status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), @@ -527,7 +523,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), Err(ArrowError::IoError("foo".into())), @@ -535,7 +530,6 @@ mod tests { partition_id: PartitionId::new(1), status: PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }, }), ], @@ -546,7 +540,6 @@ mod tests { partition_id: 1, 
status: Some(proto::PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, }), diff --git a/ingester/src/stream_handler/handler.rs b/ingester/src/stream_handler/handler.rs index 3fa563b188..9a52b10505 100644 --- a/ingester/src/stream_handler/handler.rs +++ b/ingester/src/stream_handler/handler.rs @@ -396,6 +396,12 @@ something clever.", if let Some(delta) = duration_since_production { // Update the TTBR metric before potentially sleeping. self.time_to_be_readable.set(delta); + trace!( + kafka_topic=%self.topic_name, + shard_index=%self.shard_index, + delta=%delta.as_millis(), + "reporting TTBR for shard (ms)" + ); } if should_pause { @@ -939,7 +945,7 @@ mod tests { Ok(DmlOperation::Write(make_write("good_op", 2))) ]], sink_rets = [ - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound{namespace: "bananas".to_string() }), Ok(true), ], want_ttbr = 2, diff --git a/ingester/src/stream_handler/mod.rs b/ingester/src/stream_handler/mod.rs index 296f158e1a..5e9a351fe4 100644 --- a/ingester/src/stream_handler/mod.rs +++ b/ingester/src/stream_handler/mod.rs @@ -17,7 +17,7 @@ //! [`LifecycleManager`]: crate::lifecycle::LifecycleManager //! 
[`LifecycleHandle::can_resume_ingest()`]: crate::lifecycle::LifecycleHandle::can_resume_ingest() -pub mod handler; +pub(crate) mod handler; mod periodic_watermark_fetcher; mod sink; @@ -25,8 +25,8 @@ mod sink; pub mod mock_sink; #[cfg(test)] pub mod mock_watermark_fetcher; -pub mod sink_adaptor; -pub mod sink_instrumentation; +pub(crate) mod sink_adaptor; +pub(crate) mod sink_instrumentation; -pub use periodic_watermark_fetcher::*; -pub use sink::*; +pub(crate) use periodic_watermark_fetcher::*; +pub(crate) use sink::*; diff --git a/ingester/src/stream_handler/periodic_watermark_fetcher.rs b/ingester/src/stream_handler/periodic_watermark_fetcher.rs index 43c8cf52c9..37f99663cc 100644 --- a/ingester/src/stream_handler/periodic_watermark_fetcher.rs +++ b/ingester/src/stream_handler/periodic_watermark_fetcher.rs @@ -24,7 +24,7 @@ use super::sink_instrumentation::WatermarkFetcher; /// Emits an error metric named `write_buffer_watermark_fetch_errors` that /// increments once per fetch error. #[derive(Debug)] -pub struct PeriodicWatermarkFetcher { +pub(crate) struct PeriodicWatermarkFetcher { last_watermark: Arc, poll_handle: JoinHandle<()>, } diff --git a/ingester/src/stream_handler/sink.rs b/ingester/src/stream_handler/sink.rs index 5f8220a942..825b012ce9 100644 --- a/ingester/src/stream_handler/sink.rs +++ b/ingester/src/stream_handler/sink.rs @@ -5,7 +5,7 @@ use dml::DmlOperation; /// A [`DmlSink`] handles [`DmlOperation`] instances read from a shard. #[async_trait] -pub trait DmlSink: Debug + Send + Sync { +pub(crate) trait DmlSink: Debug + Send + Sync { /// Apply `op` read from a shard, returning `Ok(true)` if ingest should /// be paused. 
async fn apply(&self, op: DmlOperation) -> Result; diff --git a/ingester/src/stream_handler/sink_instrumentation.rs b/ingester/src/stream_handler/sink_instrumentation.rs index 24b05cbf21..998e14bb48 100644 --- a/ingester/src/stream_handler/sink_instrumentation.rs +++ b/ingester/src/stream_handler/sink_instrumentation.rs @@ -414,11 +414,13 @@ mod tests { let got = test( op, &metrics, - Err(crate::data::Error::TableNotPresent), + Err(crate::data::Error::NamespaceNotFound { + namespace: "bananas".to_string(), + }), Some(12345), ) .await; - assert_matches!(got, Err(crate::data::Error::TableNotPresent)); + assert_matches!(got, Err(crate::data::Error::NamespaceNotFound { .. })); // Validate the various write buffer metrics assert_matches!( diff --git a/ingester/src/test_util.rs b/ingester/src/test_util.rs index 09045083e8..05dc226f90 100644 --- a/ingester/src/test_util.rs +++ b/ingester/src/test_util.rs @@ -9,17 +9,16 @@ use arrow::record_batch::RecordBatch; use arrow_util::assert_batches_eq; use bitflags::bitflags; use data_types::{ - CompactionLevel, NamespaceId, NonEmptyString, PartitionId, PartitionKey, Sequence, - SequenceNumber, ShardId, ShardIndex, TableId, Timestamp, Tombstone, TombstoneId, + CompactionLevel, NamespaceId, PartitionId, PartitionKey, Sequence, SequenceNumber, ShardId, + ShardIndex, TableId, }; -use dml::{DmlDelete, DmlMeta, DmlOperation, DmlWrite}; +use dml::{DmlMeta, DmlOperation, DmlWrite}; use iox_catalog::{interface::Catalog, mem::MemCatalog}; use iox_query::test::{raw_data, TestChunk}; use iox_time::{SystemProvider, Time}; use mutable_batch_lp::lines_to_batches; use object_store::memory::InMemory; use parquet_file::metadata::IoxMetadata; -use predicate::delete_predicate::parse_delete_predicate; use schema::sort::SortKey; use uuid::Uuid; @@ -28,31 +27,10 @@ use crate::{ partition::{resolver::CatalogPartitionResolver, PersistingBatch, SnapshotBatch}, IngesterData, }, - lifecycle::{LifecycleConfig, LifecycleHandle, LifecycleManager}, + 
lifecycle::{LifecycleConfig, LifecycleManager}, query::QueryableBatch, }; -/// Create tombstone for testing -pub(crate) fn create_tombstone( - id: i64, - table_id: i64, - shard_id: i64, - seq_num: i64, - min_time: i64, - max_time: i64, - predicate: &str, -) -> Tombstone { - Tombstone { - id: TombstoneId::new(id), - table_id: TableId::new(table_id), - shard_id: ShardId::new(shard_id), - sequence_number: SequenceNumber::new(seq_num), - min_time: Timestamp::new(min_time), - max_time: Timestamp::new(max_time), - serialized_predicate: predicate.to_string(), - } -} - #[allow(clippy::too_many_arguments)] pub(crate) fn make_meta( object_store_id: Uuid, @@ -93,15 +71,8 @@ pub(crate) fn make_persisting_batch( partition_id: i64, object_store_id: Uuid, batches: Vec>, - tombstones: Vec, ) -> Arc { - let queryable_batch = make_queryable_batch_with_deletes( - table_name, - partition_id, - seq_num_start, - batches, - tombstones, - ); + let queryable_batch = make_queryable_batch(table_name, partition_id, seq_num_start, batches); Arc::new(PersistingBatch { shard_id: ShardId::new(shard_id), table_id: TableId::new(table_id), @@ -116,16 +87,6 @@ pub(crate) fn make_queryable_batch( partition_id: i64, seq_num_start: i64, batches: Vec>, -) -> Arc { - make_queryable_batch_with_deletes(table_name, partition_id, seq_num_start, batches, vec![]) -} - -pub(crate) fn make_queryable_batch_with_deletes( - table_name: &str, - partition_id: i64, - seq_num_start: i64, - batches: Vec>, - tombstones: Vec, ) -> Arc { // make snapshots for the batches let mut snapshots = vec![]; @@ -140,7 +101,6 @@ pub(crate) fn make_queryable_batch_with_deletes( table_name.into(), PartitionId::new(partition_id), snapshots, - tombstones, )) } @@ -655,65 +615,24 @@ pub(crate) async fn make_ingester_data(two_partitions: bool, loc: DataLocation) let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot_to_persisting(TEST_TABLE, 
&PartitionKey::from(TEST_PARTITION_1)) + .snapshot_to_persisting(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } else if loc.contains(DataLocation::SNAPSHOT) { // move partition 1 data to snapshot let _ignored = ingester .shard(shard_id) .unwrap() - .namespace(TEST_NAMESPACE) + .namespace(&TEST_NAMESPACE.into()) .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) + .snapshot(&TEST_TABLE.into(), &PartitionKey::from(TEST_PARTITION_1)) .await; } ingester } -pub(crate) async fn make_ingester_data_with_tombstones(loc: DataLocation) -> IngesterData { - // Whatever data because they won't be used in the tests - let metrics: Arc = Default::default(); - let catalog: Arc = Arc::new(MemCatalog::new(Arc::clone(&metrics))); - let object_store = Arc::new(InMemory::new()); - let exec = Arc::new(iox_query::exec::Executor::new(1)); - let lifecycle = LifecycleManager::new( - LifecycleConfig::new( - 200_000_000, - 100_000_000, - 100_000_000, - Duration::from_secs(100_000_000), - Duration::from_secs(100_000_000), - 100_000_000, - ), - Arc::clone(&metrics), - Arc::new(SystemProvider::default()), - ); - - // Make data for one shard and two tables - let shard_index = ShardIndex::new(0); - let (shard_id, _, _) = - populate_catalog(&*catalog, shard_index, TEST_NAMESPACE, TEST_TABLE).await; - - let ingester = IngesterData::new( - object_store, - Arc::clone(&catalog), - [(shard_id, shard_index)], - exec, - Arc::new(CatalogPartitionResolver::new(catalog)), - backoff::BackoffConfig::default(), - metrics, - ); - - // Make partitions per requested - make_one_partition_with_tombstones(&ingester, &lifecycle.handle(), loc, shard_index, shard_id) - .await; - - ingester -} - /// Make data for one or two partitions per requested pub(crate) fn make_partitions(two_partitions: bool, shard_index: ShardIndex) -> Vec { // In-memory data includes these rows but split between 4 groups go into @@ -783,133 +702,6 @@ pub(crate) fn make_partitions(two_partitions: bool, 
shard_index: ShardIndex) -> ops } -/// Make data for one partition with tombstones -async fn make_one_partition_with_tombstones( - ingester: &IngesterData, - lifecycle_handle: &dyn LifecycleHandle, - loc: DataLocation, - shard_index: ShardIndex, - shard_id: ShardId, -) { - // In-memory data includes these rows but split between 4 groups go into - // different batches of parittion 1 or partittion 2 as requeted - // let expected = vec![ - // "+------------+-----+------+--------------------------------+", - // "| city | day | temp | time |", - // "+------------+-----+------+--------------------------------+", - // "| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |", // in group 1 - seq_num: 2 - // "| Andover | mon | | 1970-01-01T00:00:00.000000046Z |", // in group 2 - seq_num: 3 - // "| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |", // in group 1 - seq_num: 1 --> will get deleted - // "| Boston | mon | | 1970-01-01T00:00:00.000000038Z |", // in group 3 - seq_num: 5 --> will get deleted - // "| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 8 (after the tombstone's seq num) - // "| Medford | wed | | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4 - // "| Reading | mon | 58 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 9 - // "| Wilmington | mon | | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6 - // "+------------+-----+------+--------------------------------+", - // ]; - - let (ops, seq_num) = - make_first_partition_data(&PartitionKey::from(TEST_PARTITION_1), shard_index); - - // Apply all ops - for op in ops { - ingester - .buffer_operation(shard_id, op, lifecycle_handle) - .await - .unwrap(); - } - - if loc.contains(DataLocation::PERSISTING) { - // Move partition 1 data to persisting - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(TEST_NAMESPACE) - .unwrap() - .snapshot_to_persisting(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) - .await; - } else 
if loc.contains(DataLocation::SNAPSHOT) { - // move partition 1 data to snapshot - let _ignored = ingester - .shard(shard_id) - .unwrap() - .namespace(TEST_NAMESPACE) - .unwrap() - .snapshot(TEST_TABLE, &PartitionKey::from(TEST_PARTITION_1)) - .await; - } - - // Add tombstones - // Depending on where the existing data is, they (buffer & snapshot) will be either moved to a new snapshot after - // applying the tombstone or (persisting) stay where they are and the tombstones is kept to get applied later - // ------------------------------------------ - // Delete - let mut seq_num = seq_num.get(); - seq_num += 1; - - let delete = parse_delete_predicate( - "1970-01-01T00:00:00.000000010Z", - "1970-01-01T00:00:00.000000050Z", - "city=Boston", - ) - .unwrap(); - - ingester - .buffer_operation( - shard_id, - DmlOperation::Delete(DmlDelete::new( - TEST_NAMESPACE.to_string(), - delete, - NonEmptyString::new(TEST_TABLE), - DmlMeta::sequenced( - Sequence { - shard_index, - sequence_number: SequenceNumber::new(seq_num), - }, - Time::MIN, - None, - 42, - ), - )), - lifecycle_handle, - ) - .await - .unwrap(); - - // Group 4: in buffer of p1 after the tombstone - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Medford day="sun",temp=55 22"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); - seq_num += 1; - - ingester - .buffer_operation( - shard_id, - DmlOperation::Write(make_write_op( - &PartitionKey::from(TEST_PARTITION_1), - shard_index, - TEST_NAMESPACE, - seq_num, - r#"test_table,city=Reading day="mon",temp=58 40"#, - )), - lifecycle_handle, - ) - .await - .unwrap(); -} - pub(crate) fn make_write_op( partition_key: &PartitionKey, shard_index: ShardIndex, diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index 431c22cdb7..3aae75747d 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ 
-463,7 +463,10 @@ pub trait PartitionRepo: Send + Sync { partition_id: PartitionId, ) -> Result>; - /// Update the sort key for the partition + /// Update the sort key for the partition. + /// + /// NOTE: it is expected that ONLY the ingesters update sort keys for + /// existing partitions. async fn update_sort_key( &mut self, partition_id: PartitionId, diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 7544e65370..d28a5f310d 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1878,7 +1878,7 @@ LIMIT $4; sqlx::query_as::<_, PartitionParam>( r#" SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, - parquet_file.table_id, + parquet_file.table_id, count(case when to_delete is null then 1 end) total_count, max(case when compaction_level= $4 then parquet_file.created_at end) FROM parquet_file diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index 3ace171104..24d6baac34 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -11,7 +11,7 @@ chrono = { version = "0.4", default-features = false } chrono-english = "0.1.4" clap = { version = "4", features = ["derive", "env", "cargo"] } futures = "0.3" -handlebars = "4.3.4" +handlebars = "4.3.5" humantime = "2.1.0" influxdb2_client = { path = "../influxdb2_client" } itertools = "0.10.5" @@ -22,7 +22,7 @@ rand = { version = "0.8.3", features = ["small_rng"] } regex = "1.6" schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } toml = "0.5.9" diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs index 6c85358e4f..ca6be3acde 100644 --- a/iox_query/src/exec/seriesset/converter.rs +++ b/iox_query/src/exec/seriesset/converter.rs @@ -762,7 +762,7 @@ mod tests { 
.unwrap(); // Input has one row that has no value (NULL value) for tag_b, which is its own series - let input = stream_from_batch(batch); + let input = stream_from_batch(batch.schema(), batch); let table_name = "foo"; let tag_columns = ["tag_a", "tag_b"]; @@ -873,7 +873,8 @@ mod tests { .collect(); // stream from those batches - stream_from_batches(batches) + assert!(!batches.is_empty()); + stream_from_batches(batches[0].schema(), batches) }) .collect() } diff --git a/iox_query/src/frontend/influxrpc.rs b/iox_query/src/frontend/influxrpc.rs index 0940aff71b..1a8750c779 100644 --- a/iox_query/src/frontend/influxrpc.rs +++ b/iox_query/src/frontend/influxrpc.rs @@ -17,12 +17,14 @@ use arrow::datatypes::DataType; use data_types::ChunkId; use datafusion::{ error::DataFusionError, + logical_expr::utils::exprlist_to_columns, logical_plan::{col, when, DFSchemaRef, Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder}, + prelude::Column, }; use datafusion_util::AsExpr; use futures::{Stream, StreamExt, TryStreamExt}; use hashbrown::HashSet; -use observability_deps::tracing::{debug, trace}; +use observability_deps::tracing::{debug, trace, warn}; use predicate::{rpc_predicate::InfluxRpcPredicate, Predicate, PredicateMatch}; use query_functions::{ group_by::{Aggregate, WindowDuration}, @@ -31,39 +33,18 @@ use query_functions::{ }; use schema::{selection::Selection, InfluxColumnType, Schema, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::collections::HashSet as StdHashSet; use std::{cmp::Reverse, collections::BTreeSet, sync::Arc}; const CONCURRENT_TABLE_JOBS: usize = 10; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("gRPC planner got error making table_name plan for chunk: {}", source))] - TableNamePlan { - source: Box, - }, - - #[snafu(display("gRPC planner got error listing partition keys: {}", source))] - ListingPartitions { - source: Box, - }, - #[snafu(display("gRPC planner got error finding column names: {}", source))] - 
FindingColumnNames { - source: Box, - }, + FindingColumnNames { source: DataFusionError }, #[snafu(display("gRPC planner got error finding column values: {}", source))] - FindingColumnValues { - source: Box, - }, - - #[snafu(display( - "gRPC planner got internal error making table_name with default predicate: {}", - source - ))] - InternalTableNamePlanForDefault { - source: Box, - }, + FindingColumnValues { source: DataFusionError }, #[snafu(display( "gRPC planner got error fetching chunks for table '{}': {}", @@ -72,7 +53,7 @@ pub enum Error { ))] GettingChunks { table_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -82,19 +63,20 @@ pub enum Error { ))] CheckingChunkPredicate { chunk_id: ChunkId, - source: Box, + source: DataFusionError, }, #[snafu(display("gRPC planner got error creating string set plan: {}", source))] CreatingStringSet { source: StringSetError }, #[snafu(display("gRPC planner got error creating predicates: {}", source))] - CreatingPredicates { - source: datafusion::error::DataFusionError, - }, + CreatingPredicates { source: DataFusionError }, #[snafu(display("gRPC planner got error building plan: {}", source))] - BuildingPlan { + BuildingPlan { source: DataFusionError }, + + #[snafu(display("gRPC planner got error reading columns from expression: {}", source))] + ReadColumns { source: datafusion::error::DataFusionError, }, @@ -148,7 +130,7 @@ pub enum Error { CastingAggregates { agg: Aggregate, field_name: String, - source: datafusion::error::DataFusionError, + source: DataFusionError, }, #[snafu(display("Internal error: unexpected aggregate request for None aggregate",))] @@ -163,6 +145,35 @@ pub enum Error { pub type Result = std::result::Result; +impl Error { + pub fn to_df_error(self, method: &'static str) -> DataFusionError { + let msg = self.to_string(); + + match self { + Self::GettingChunks { source, .. } + | Self::CreatingPredicates { source, .. } + | Self::BuildingPlan { source, .. 
} + | Self::ReadColumns { source, .. } + | Self::CheckingChunkPredicate { source, .. } + | Self::FindingColumnNames { source, .. } + | Self::FindingColumnValues { source, .. } + | Self::CastingAggregates { source, .. } => { + DataFusionError::Context(format!("{method}: {msg}"), Box::new(source)) + } + e @ (Self::CreatingStringSet { .. } + | Self::TableRemoved { .. } + | Self::InvalidTagColumn { .. } + | Self::InternalInvalidTagType { .. } + | Self::DuplicateGroupColumn { .. } + | Self::GroupColumnNotFound { .. } + | Self::CreatingAggregates { .. } + | Self::CreatingScan { .. } + | Self::InternalUnexpectedNoneAggregate {} + | Self::InternalAggregateNotSelector { .. }) => DataFusionError::External(Box::new(e)), + } + } +} + impl From for Error { fn from(source: super::common::Error) -> Self { Self::CreatingScan { source } @@ -227,49 +238,50 @@ impl InfluxRpcPlanner { let table_predicates = rpc_predicate .table_predicates(database.as_meta()) .context(CreatingPredicatesSnafu)?; - let tables: Vec<_> = table_chunk_stream(Arc::clone(&database), &table_predicates, &ctx) - .try_filter_map(|(table_name, predicate, chunks)| async move { - // Identify which chunks can answer from its metadata and then record its table, - // and which chunks needs full plan and group them into their table - let mut chunks_full = vec![]; - for chunk in cheap_chunk_first(chunks) { - trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); + let tables: Vec<_> = + table_chunk_stream(Arc::clone(&database), false, &table_predicates, &ctx) + .try_filter_map(|(table_name, predicate, chunks)| async move { + // Identify which chunks can answer from its metadata and then record its table, + // and which chunks needs full plan and group them into their table + let mut chunks_full = vec![]; + for chunk in cheap_chunk_first(chunks) { + trace!(chunk_id=%chunk.id(), %table_name, "Considering table"); - // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate - // 
deleted data before we can determine if its table participates in the requested predicate. - if chunk.has_delete_predicates() { - chunks_full.push(chunk); - } else { - // Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + // If the chunk has delete predicates, we need to scan (do full plan) the data to eliminate + // deleted data before we can determine if its table participates in the requested predicate. + if chunk.has_delete_predicates() { + chunks_full.push(chunk); + } else { + // Try and apply the predicate using only metadata + let pred_result = chunk + .apply_predicate_to_metadata(predicate) + .context(CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + })?; - match pred_result { - PredicateMatch::AtLeastOneNonNullField => { - trace!("Metadata predicate: table matches"); - // Meta data of the table covers predicates of the request - return Ok(Some((table_name, None))); + match pred_result { + PredicateMatch::AtLeastOneNonNullField => { + trace!("Metadata predicate: table matches"); + // Meta data of the table covers predicates of the request + return Ok(Some((table_name, None))); + } + PredicateMatch::Unknown => { + trace!("Metadata predicate: unknown match"); + // We cannot match the predicate to get answer from meta data, let do full plan + chunks_full.push(chunk); + } + PredicateMatch::Zero => { + trace!("Metadata predicate: zero rows match"); + } // this chunk's table does not participate in the request } - PredicateMatch::Unknown => { - trace!("Metadata predicate: unknown match"); - // We cannot match the predicate to get answer from meta data, let do full plan - chunks_full.push(chunk); - } - PredicateMatch::Zero => { - trace!("Metadata predicate: zero rows match"); - } // this chunk's table does not participate in the request } } - } - Ok((!chunks_full.is_empty()) - .then_some((table_name, Some((predicate, 
chunks_full))))) - }) - .try_collect() - .await?; + Ok((!chunks_full.is_empty()) + .then_some((table_name, Some((predicate, chunks_full))))) + }) + .try_collect() + .await?; // Feed builder let mut builder = StringSetPlanBuilder::new(); @@ -341,84 +353,88 @@ impl InfluxRpcPlanner { } } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_need_chunks, &ctx) - .and_then(|(table_name, predicate, chunks)| { - let mut ctx = ctx.child_ctx("table"); - ctx.set_metadata("table", table_name.to_owned()); + let tables: Vec<_> = table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_need_chunks, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| { + let mut ctx = ctx.child_ctx("table"); + ctx.set_metadata("table", table_name.to_owned()); - async move { - let mut chunks_full = vec![]; - let mut known_columns = BTreeSet::new(); + async move { + let mut chunks_full = vec![]; + let mut known_columns = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // Try and apply the predicate using only metadata - let pred_result = chunk - .apply_predicate_to_metadata(predicate) - .context(CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - })?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // get only tag columns from metadata + let schema = chunk.schema(); + + let column_names: Vec<&str> = schema + .tags_iter() + .map(|f| f.name().as_str()) + .collect::>(); + + let selection = Selection::Some(&column_names); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag keys + if chunk.has_delete_predicates() { + debug!( + 
%table_name, + chunk_id=%chunk.id().get(), + "column names need full plan" + ); + chunks_full.push(chunk); + } else { + // filter the columns further from the predicate + let maybe_names = chunk + .column_names( + ctx.child_ctx("column_names execution"), + predicate, + selection, + ) + .context(FindingColumnNamesSnafu)?; + + match maybe_names { + Some(mut names) => { + debug!( + %table_name, + names=?names, + chunk_id=%chunk.id().get(), + "column names found from metadata", + ); + known_columns.append(&mut names); } - - // get only tag columns from metadata - let schema = chunk.schema(); - - let column_names: Vec<&str> = schema - .tags_iter() - .map(|f| f.name().as_str()) - .collect::>(); - - let selection = Selection::Some(&column_names); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag keys - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "column names need full plan" ); chunks_full.push(chunk); - } else { - // filter the columns further from the predicate - let maybe_names = chunk - .column_names( - ctx.child_ctx("column_names execution"), - predicate, - selection, - ) - .context(FindingColumnNamesSnafu)?; - - match maybe_names { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "column names found from metadata", - ); - known_columns.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "column names need full plan" - ); - chunks_full.push(chunk); - } - } } } - - Ok((table_name, predicate, chunks_full, known_columns)) } - }) - .try_collect() - .await?; + } + + Ok((table_name, predicate, chunks_full, known_columns)) + } + }) + .try_collect() + .await?; // At this point, we have a set of column names we know pass // in `known_columns`, and potentially some tables in chunks @@ -492,100 +508,104 @@ impl InfluxRpcPlanner { 
table_predicates_filtered.push((table_name, predicate)); } - let tables: Vec<_> = - table_chunk_stream(Arc::clone(&database), &table_predicates_filtered, &ctx) - .and_then(|(table_name, predicate, chunks)| async move { - let mut chunks_full = vec![]; - let mut known_values = BTreeSet::new(); + let tables: Vec<_> = table_chunk_stream( + Arc::clone(&database), + false, + &table_predicates_filtered, + &ctx, + ) + .and_then(|(table_name, predicate, chunks)| async move { + let mut chunks_full = vec![]; + let mut known_values = BTreeSet::new(); - for chunk in cheap_chunk_first(chunks) { - // Try and apply the predicate using only metadata - let pred_result = chunk.apply_predicate_to_metadata(predicate).context( - CheckingChunkPredicateSnafu { - chunk_id: chunk.id(), - }, - )?; + for chunk in cheap_chunk_first(chunks) { + // Try and apply the predicate using only metadata + let pred_result = chunk.apply_predicate_to_metadata(predicate).context( + CheckingChunkPredicateSnafu { + chunk_id: chunk.id(), + }, + )?; - if matches!(pred_result, PredicateMatch::Zero) { - continue; + if matches!(pred_result, PredicateMatch::Zero) { + continue; + } + + // use schema to validate column type + let schema = chunk.schema(); + + // Skip this table if the tag_name is not a column in this chunk + // Note: This may happen even when the table contains the tag_name, because some chunks may not + // contain all columns. 
+ let idx = if let Some(idx) = schema.find_index_of(tag_name) { + idx + } else { + continue; + }; + + // Validate that this really is a Tag column + let (influx_column_type, field) = schema.field(idx); + ensure!( + matches!(influx_column_type, Some(InfluxColumnType::Tag)), + InvalidTagColumnSnafu { + tag_name, + influx_column_type, + } + ); + ensure!( + influx_column_type + .unwrap() + .valid_arrow_type(field.data_type()), + InternalInvalidTagTypeSnafu { + tag_name, + data_type: field.data_type().clone(), + } + ); + + // If there are delete predicates, we need to scan (or do full plan) the data to eliminate + // deleted data before getting tag values + if chunk.has_delete_predicates() { + debug!( + %table_name, + chunk_id=%chunk.id().get(), + "need full plan to find tag values" + ); + + chunks_full.push(chunk); + } else { + // try and get the list of values directly from metadata + let mut ctx = self.ctx.child_ctx("tag_values execution"); + ctx.set_metadata("table", table_name.to_owned()); + + let maybe_values = chunk + .column_values(ctx, tag_name, predicate) + .context(FindingColumnValuesSnafu)?; + + match maybe_values { + Some(mut names) => { + debug!( + %table_name, + names=?names, + chunk_id=%chunk.id().get(), + "tag values found from metadata", + ); + known_values.append(&mut names); } - - // use schema to validate column type - let schema = chunk.schema(); - - // Skip this table if the tag_name is not a column in this chunk - // Note: This may happen even when the table contains the tag_name, because some chunks may not - // contain all columns. 
- let idx = if let Some(idx) = schema.find_index_of(tag_name) { - idx - } else { - continue; - }; - - // Validate that this really is a Tag column - let (influx_column_type, field) = schema.field(idx); - ensure!( - matches!(influx_column_type, Some(InfluxColumnType::Tag)), - InvalidTagColumnSnafu { - tag_name, - influx_column_type, - } - ); - ensure!( - influx_column_type - .unwrap() - .valid_arrow_type(field.data_type()), - InternalInvalidTagTypeSnafu { - tag_name, - data_type: field.data_type().clone(), - } - ); - - // If there are delete predicates, we need to scan (or do full plan) the data to eliminate - // deleted data before getting tag values - if chunk.has_delete_predicates() { + None => { debug!( %table_name, chunk_id=%chunk.id().get(), "need full plan to find tag values" ); - chunks_full.push(chunk); - } else { - // try and get the list of values directly from metadata - let mut ctx = self.ctx.child_ctx("tag_values execution"); - ctx.set_metadata("table", table_name.to_owned()); - - let maybe_values = chunk - .column_values(ctx, tag_name, predicate) - .context(FindingColumnValuesSnafu)?; - - match maybe_values { - Some(mut names) => { - debug!( - %table_name, - names=?names, - chunk_id=%chunk.id().get(), - "tag values found from metadata", - ); - known_values.append(&mut names); - } - None => { - debug!( - %table_name, - chunk_id=%chunk.id().get(), - "need full plan to find tag values" - ); - chunks_full.push(chunk); - } - } } } + } + } - Ok((table_name, predicate, chunks_full, known_values)) - }) - .try_collect() - .await?; + Ok((table_name, predicate, chunks_full, known_values)) + }) + .try_collect() + .await?; let mut builder = StringSetPlanBuilder::new(); @@ -1312,8 +1332,18 @@ impl InfluxRpcPlanner { } /// Stream of chunks for table predicates. +/// This function is used by influx grpc meta queries that want to know which table/tags/fields +/// that match the given predicates. 
+/// `need_fields` means the grpc queries will need to return field columns. If `need_fields` +/// is false, the grpc query does not need to return field columns but it still filters data on the +/// field columns in the predicate +/// +/// This function is directly invoked by `table_name, `tag_keys` and `tag_values` where need_fields should be false. +/// This function is indirectly invoked by `field_columns`, `read_filter`, `read_group` and `read_window_aggregate` +/// through the function `create_plans` where need_fields should be true. fn table_chunk_stream<'a>( database: Arc, + need_fields: bool, table_predicates: &'a [(String, Predicate)], ctx: &'a IOxSessionContext, ) -> impl Stream>)>> + 'a { @@ -1324,9 +1354,22 @@ fn table_chunk_stream<'a>( let database = Arc::clone(&database); + let table_schema = database.table_schema(table_name); + let projection = match table_schema { + Some(table_schema) => { + columns_in_predicates(need_fields, table_schema, table_name, predicate) + } + None => None, + }; + async move { let chunks = database - .chunks(table_name, predicate, ctx.child_ctx("table chunks")) + .chunks( + table_name, + predicate, + &projection, + ctx.child_ctx("table chunks"), + ) .await .context(GettingChunksSnafu { table_name })?; @@ -1336,6 +1379,89 @@ fn table_chunk_stream<'a>( .buffered(CONCURRENT_TABLE_JOBS) } +// Return all columns in predicate's field_columns, exprs and val_exprs. +// Return None means nothing is filtered in this function and all field columns should be used. +// None is returned when: +// - we cannot determine at least one column in the predicate +// - need_fields is true and the predicate does not have any field_columns. +// This signal that all fields are needed. +// Note that the returned columns can also include tag and time columns if they happen to be +// in the predicate. 
+fn columns_in_predicates( + need_fields: bool, + table_schema: Arc, + table_name: &String, + predicate: &Predicate, +) -> Option> { + let mut columns = StdHashSet::new(); + + // columns in field_columns + match &predicate.field_columns { + Some(field_columns) => { + for field in field_columns { + columns.insert(Column { + relation: None, + name: (*field).clone(), + }); + } + } + None => { + if need_fields { + // fields wanted and `field_columns` is empty mean al fields will be needed + return None; + } + } + } + + // columns in exprs + let expr_cols_result = + exprlist_to_columns(&predicate.exprs, &mut columns).context(ReadColumnsSnafu); + + // columns in val_exprs + let exprs: Vec = predicate + .value_expr + .iter() + .map(|e| Expr::from((*e).clone())) + .collect(); + let val_exprs_cols_result = exprlist_to_columns(&exprs, &mut columns).context(ReadColumnsSnafu); + + let projection = if expr_cols_result.is_err() || val_exprs_cols_result.is_err() { + if expr_cols_result.is_err() { + let error_message = expr_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.exprs, ?error_message, "cannot determine columns in predicate.exprs"); + } + if val_exprs_cols_result.is_err() { + let error_message = val_exprs_cols_result.err().unwrap().to_string(); + warn!(?table_name, ?predicate.value_expr, ?error_message, "cannot determine columns in predicate.value_expr"); + } + + None + } else { + // convert the column names into their corresponding indexes in the schema + let cols = columns + .iter() + .map(|c| table_schema.find_index_of(&c.name)) + .collect::>(); + + if cols.contains(&None) || cols.is_empty() { + // At least one column has no matching index, we do not know which + // columns to filter. 
Read all columns + warn!( + ?table_name, + ?predicate, + ?table_schema, + "cannot find index for at least one column in the table schema" + ); + None + } else { + // We know which columns to filter, read only those columns + Some(cols.into_iter().flatten().collect::>()) + } + }; + + projection +} + /// Create plans that fetch the data specified in table_predicates. /// /// table_predicates contains `(table_name, predicate_specialized_for_that_table)` @@ -1364,7 +1490,7 @@ where + Sync, P: Send, { - table_chunk_stream(Arc::clone(&database), table_predicates, &ctx) + table_chunk_stream(Arc::clone(&database), true, table_predicates, &ctx) .and_then(|(table_name, predicate, chunks)| async move { let chunks = prune_chunks_metadata(chunks, predicate)?; Ok((table_name, predicate, chunks)) @@ -1762,15 +1888,462 @@ fn cheap_chunk_first(mut chunks: Vec>) -> Vec all columns will be selected + let need_fields = true; + + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because we asked it return all fileds (and implicit PK) even though the predicate is on `foo` only + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + //////////////////////////// + // Test 2: no need_fields --> only PK + columns in predicate are return + let need_fields = false; + + let 
test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes only 3 columns of the table PK + cols in predicate + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 3); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_empty_pred() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // empty predicate + let predicate = Predicate::new(); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + ///////////// + // Test 1: empty predicate with need_fields + let need_fields = true; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of 
the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: empty predicate without need_fields + let need_fields = false; + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns of the table because the preidcate is empty + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_on_tag_no_data() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column(), // no row added for this chunk on purpose + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on a tag column `foo` + let expr = col("foo").eq(lit("some_thing")); + let predicate = 
Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // Since no data, we do not do pushdown in the test chunk. + // the no-data returned chunk will include all columns of the table + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn test_table_chunk_stream_pred_and_field_columns() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + let need_fields = false; + + ///////////// + // Test 1: predicate on field `i64_field_2` and `field_columns` is empty + // predicate on field column + let expr = col("i64_field_2").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + 
.try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_2 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(3).1.name(), TIME_COLUMN_NAME); + executor.join().await; + + ///////////// + // Test 2: predicate on tag `foo` and `field_columns` is not empty + let expr = col("bar").eq(lit(10)); + let predicate = Predicate::new() + .with_expr(expr) + .with_field_columns(vec!["i64_field".to_string()]); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes 4 columns: 3 cols of PK plus i64_field_1 + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 4); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + + #[tokio::test] + async fn 
test_table_chunk_stream_pred_on_unknown_field() { + let chunk0 = Arc::new( + TestChunk::new("h2o") + .with_id(0) + .with_tag_column("foo") + .with_tag_column("bar") + .with_i64_field_column("i64_field") + .with_i64_field_column("i64_field_2") + .with_time_column() + .with_one_row_of_data(), + ); + + let executor = Arc::new(Executor::new(1)); + let test_db = Arc::new(TestDatabase::new(Arc::clone(&executor))); + test_db.add_chunk("my_partition_key", Arc::clone(&chunk0)); + let ctx = test_db.new_query_context(None); + + // predicate on unknown column + let expr = col("unknown_name").eq(lit(10)); + let predicate = Predicate::new().with_expr(expr); + let table_predicates = vec![("h2o".to_string(), predicate)]; + + let need_fields = false; + let result = table_chunk_stream(test_db, need_fields, &table_predicates, &ctx) + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "h2o"); // table name + assert_eq!(result[0].2.len(), 1); // returned chunks + + // chunk schema includes all 5 columns since we hit the unknown columnd + let chunk = &result[0].2[0]; + let chunk_schema = (*chunk.schema()).clone(); + assert_eq!(chunk_schema.len(), 5); + let chunk_schema = chunk_schema.sort_fields_by_name(); + assert_eq!(chunk_schema.field(0).1.name(), "bar"); + assert_eq!(chunk_schema.field(1).1.name(), "foo"); + assert_eq!(chunk_schema.field(2).1.name(), "i64_field"); + assert_eq!(chunk_schema.field(3).1.name(), "i64_field_2"); + assert_eq!(chunk_schema.field(4).1.name(), TIME_COLUMN_NAME); + executor.join().await; + } + #[tokio::test] async fn test_predicate_rewrite_table_names() { run_test(|test_db, rpc_predicate| { diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index a0bd37a68b..7863e9750f 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -14,7 +14,7 @@ use async_trait::async_trait; use data_types::{ ChunkId, ChunkOrder, DeletePredicate, InfluxDbType, PartitionId, TableSummary, 
TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use exec::{stringset::StringSet, IOxSessionContext}; use hashbrown::HashMap; use observability_deps::tracing::{debug, trace}; @@ -141,9 +141,6 @@ impl Drop for QueryCompletedToken { /// This avoids storing potentially large strings pub type QueryText = Box; -/// Error type for [`QueryDatabase`] operations. -pub type QueryDatabaseError = Box; - /// A `Database` is the main trait implemented by the IOx subsystems /// that store actual data. /// @@ -154,12 +151,15 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { /// Returns a set of chunks within the partition with data that may match /// the provided predicate. If possible, chunks which have no rows that can /// possibly match the predicate may be omitted. + /// If projection is None, returned chunks will include all columns of its original data. Otherwise, + /// returned chunks will includs PK columns (tags and time) and columns specified in the projection. async fn chunks( &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError>; + ) -> Result>, DataFusionError>; /// Record that particular type of query was run / planned fn record_query( @@ -175,9 +175,6 @@ pub trait QueryDatabase: QueryDatabaseMeta + Debug + Send + Sync { fn as_meta(&self) -> &dyn QueryDatabaseMeta; } -/// Error type for [`QueryChunk`] operations. -pub type QueryChunkError = Box; - /// Collection of data that shares the same partition key pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { /// returns the Id of this chunk. 
Ids are unique within a @@ -200,7 +197,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { Ok(self .summary() .map(|summary| predicate.apply_to_table_summary(&summary, self.schema().as_arrow())) @@ -216,7 +213,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Return a set of Strings containing the distinct values in the /// specified columns. If the predicate can be evaluated entirely @@ -228,7 +225,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError>; + ) -> Result, DataFusionError>; /// Provides access to raw `QueryChunk` data as an /// asynchronous stream of `RecordBatch`es filtered by a *required* @@ -248,7 +245,7 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync + 'static { ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result; + ) -> Result; /// Returns chunk type. Useful in tests and debug logs. 
fn chunk_type(&self) -> &str; diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs index 23cb2e2f6a..cf143dcb57 100644 --- a/iox_query/src/provider/adapter.rs +++ b/iox_query/src/provider/adapter.rs @@ -262,7 +262,7 @@ mod tests { let batch = make_batch(); let output_schema = batch.schema(); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -291,7 +291,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -321,7 +321,7 @@ mod tests { Field::new("d", DataType::Float32, true), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let adapter_stream = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()).unwrap(); @@ -349,7 +349,7 @@ mod tests { Field::new("c", DataType::Utf8, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!( @@ -368,7 +368,7 @@ mod tests { Field::new("b", DataType::Int32, false), Field::new("a", DataType::Int32, false), ])); - let input_stream = stream_from_batch(batch); + let input_stream = stream_from_batch(batch.schema(), batch); let res = SchemaAdapterStream::try_new(input_stream, output_schema, baseline_metrics()); assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which 
had type 'Float32'"); diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index dee2d1120b..e7a0503f1c 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -8,8 +8,8 @@ use crate::{ stringset::{StringSet, StringSetRef}, ExecutionContextProvider, Executor, ExecutorType, IOxSessionContext, }, - Predicate, PredicateMatch, QueryChunk, QueryChunkError, QueryChunkMeta, QueryCompletedToken, - QueryDatabase, QueryDatabaseError, QueryText, + Predicate, PredicateMatch, QueryChunk, QueryChunkMeta, QueryCompletedToken, QueryDatabase, + QueryText, }; use arrow::{ array::{ @@ -24,7 +24,7 @@ use data_types::{ ChunkId, ChunkOrder, ColumnSummary, DeletePredicate, InfluxDbType, PartitionId, StatValues, Statistics, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream}; use datafusion_util::stream_from_batches; use futures::StreamExt; use hashbrown::HashSet; @@ -108,18 +108,54 @@ impl QueryDatabase for TestDatabase { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, _ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { // save last predicate *self.chunks_predicate.lock() = predicate.clone(); - let partitions = self.partitions.lock(); - Ok(partitions + let partitions = self.partitions.lock().clone(); + let chunks = partitions .values() .flat_map(|x| x.values()) .filter(|x| x.table_name == table_name) - .map(|x| Arc::clone(x) as _) - .collect()) + .map(|x| Arc::clone(x) as Arc) + .collect::>(); + + // Return chunks with fewer columns if a projection is specified + let mut new_chunks = Vec::with_capacity(chunks.len()); + for c in chunks { + let schema = c.schema(); + let cols = schema.select_given_and_pk_columns(projection); + let cols = cols.iter().map(|c| c.as_str()).collect::>(); + let selection = Selection::Some(&cols); + + let read_result = + 
c.read_filter(IOxSessionContext::with_testing(), predicate, selection); + if read_result.is_err() { + return Err(read_result.err().unwrap()); + } + let mut stream = read_result.unwrap(); + + let mut new_chunk = TestChunk::new(c.table_name()); + while let Some(b) = stream.next().await { + let b = b.expect("Error in stream"); + new_chunk.table_data.push(Arc::new(b)); + } + + let new_chunk = if !new_chunk.table_data.is_empty() { + let new_schema = Schema::try_from(new_chunk.table_data[0].schema()).unwrap(); + let new_chunk = new_chunk.add_schema_to_table(new_schema, true, None); + Arc::new(new_chunk) as _ + } else { + // No data, return the original empty chunk with the original schema + c + }; + + new_chunks.push(new_chunk); + } + + Ok(new_chunks) } fn record_query( @@ -327,9 +363,9 @@ impl TestChunk { } /// Checks the saved error, and returns it if any, otherwise returns OK - fn check_error(&self) -> Result<(), QueryChunkError> { + fn check_error(&self) -> Result<(), DataFusionError> { if let Some(message) = self.saved_error.as_ref() { - Err(message.clone().into()) + Err(DataFusionError::External(message.clone().into())) } else { Ok(()) } @@ -509,12 +545,8 @@ impl TestChunk { mut self, new_column_schema: Schema, add_column_summary: bool, - stats: Option, + input_stats: Option, ) -> Self { - // assume the new schema has exactly a single table - assert_eq!(new_column_schema.len(), 1); - let (col_type, new_field) = new_column_schema.field(0); - let mut merger = SchemaMerger::new(); merger = merger.merge(&new_column_schema).unwrap(); merger = merger @@ -522,34 +554,38 @@ impl TestChunk { .expect("merging was successful"); self.schema = merger.build(); - if add_column_summary { - let influxdb_type = col_type.map(|t| match t { - InfluxColumnType::Tag => InfluxDbType::Tag, - InfluxColumnType::Field(_) => InfluxDbType::Field, - InfluxColumnType::Timestamp => InfluxDbType::Timestamp, - }); + for i in 0..new_column_schema.len() { + let (col_type, new_field) = 
new_column_schema.field(i); + if add_column_summary { + let influxdb_type = col_type.map(|t| match t { + InfluxColumnType::Tag => InfluxDbType::Tag, + InfluxColumnType::Field(_) => InfluxDbType::Field, + InfluxColumnType::Timestamp => InfluxDbType::Timestamp, + }); - let stats = stats.unwrap_or_else(|| match new_field.data_type() { - DataType::Boolean => Statistics::Bool(StatValues::default()), - DataType::Int64 => Statistics::I64(StatValues::default()), - DataType::UInt64 => Statistics::U64(StatValues::default()), - DataType::Utf8 => Statistics::String(StatValues::default()), - DataType::Dictionary(_, value_type) => { - assert!(matches!(**value_type, DataType::Utf8)); - Statistics::String(StatValues::default()) - } - DataType::Float64 => Statistics::F64(StatValues::default()), - DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), - _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), - }); + let stats = input_stats.clone(); + let stats = stats.unwrap_or_else(|| match new_field.data_type() { + DataType::Boolean => Statistics::Bool(StatValues::default()), + DataType::Int64 => Statistics::I64(StatValues::default()), + DataType::UInt64 => Statistics::U64(StatValues::default()), + DataType::Utf8 => Statistics::String(StatValues::default()), + DataType::Dictionary(_, value_type) => { + assert!(matches!(**value_type, DataType::Utf8)); + Statistics::String(StatValues::default()) + } + DataType::Float64 => Statistics::F64(StatValues::default()), + DataType::Timestamp(_, _) => Statistics::I64(StatValues::default()), + _ => panic!("Unsupported type in TestChunk: {:?}", new_field.data_type()), + }); - let column_summary = ColumnSummary { - name: new_field.name().clone(), - influxdb_type, - stats, - }; + let column_summary = ColumnSummary { + name: new_field.name().clone(), + influxdb_type, + stats, + }; - self.table_summary.columns.push(column_summary); + self.table_summary.columns.push(column_summary); + } } self @@ -921,13 +957,17 @@ 
impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate self.predicates.lock().push(predicate.clone()); - let batches = match self.schema.df_projection(selection)? { + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? + { None => self.table_data.clone(), Some(projection) => self .table_data @@ -938,7 +978,8 @@ impl QueryChunk for TestChunk { }) .collect::, ArrowError>>()?, }; - Ok(stream_from_batches(batches)) + + Ok(stream_from_batches(self.schema().as_arrow(), batches)) } fn chunk_type(&self) -> &str { @@ -948,7 +989,7 @@ impl QueryChunk for TestChunk { fn apply_predicate_to_metadata( &self, predicate: &Predicate, - ) -> Result { + ) -> Result { self.check_error()?; // save the predicate @@ -967,7 +1008,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // Model not being able to get column values from metadata Ok(None) } @@ -977,7 +1018,7 @@ impl QueryChunk for TestChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { self.check_error()?; // save the predicate diff --git a/iox_tests/Cargo.toml b/iox_tests/Cargo.toml index 8760728d4e..514bfb5754 100644 --- a/iox_tests/Cargo.toml +++ b/iox_tests/Cargo.toml @@ -14,7 +14,7 @@ iox_catalog = { path = "../iox_catalog" } iox_time = { path = "../iox_time" } metric = { path = "../metric" } mutable_batch_lp = { path = "../mutable_batch_lp" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } once_cell = { version = "1.15.0", features = ["parking_lot"] } parquet_file = { path = "../parquet_file" } diff --git a/ioxd_common/Cargo.toml b/ioxd_common/Cargo.toml index 
eb41af0c4f..26d9d3fdeb 100644 --- a/ioxd_common/Cargo.toml +++ b/ioxd_common/Cargo.toml @@ -40,7 +40,7 @@ log = "0.4" parking_lot = "0.12" reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" serde_urlencoded = "0.7.0" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } diff --git a/ioxd_compactor/Cargo.toml b/ioxd_compactor/Cargo.toml index 3fae827159..6cbe04119c 100644 --- a/ioxd_compactor/Cargo.toml +++ b/ioxd_compactor/Cargo.toml @@ -15,7 +15,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } iox_query = { path = "../iox_query" } -object_store = "0.5.0" +object_store = "0.5.1" iox_time = { path = "../iox_time" } trace = { path = "../trace" } diff --git a/ioxd_ingester/Cargo.toml b/ioxd_ingester/Cargo.toml index db8f65e202..11e3118c2d 100644 --- a/ioxd_ingester/Cargo.toml +++ b/ioxd_ingester/Cargo.toml @@ -11,7 +11,7 @@ ingester = { path = "../ingester" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" iox_query = { path = "../iox_query" } trace = { path = "../trace" } write_buffer = { path = "../write_buffer" } diff --git a/ioxd_querier/Cargo.toml b/ioxd_querier/Cargo.toml index e90a4a68df..60574ed73d 100644 --- a/ioxd_querier/Cargo.toml +++ b/ioxd_querier/Cargo.toml @@ -11,7 +11,7 @@ generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" querier = { path = "../querier" } iox_query = { path = "../iox_query" } router = { path = "../router" } diff --git a/ioxd_router/Cargo.toml b/ioxd_router/Cargo.toml index 
5797a9cf01..1ae3d3ab2a 100644 --- a/ioxd_router/Cargo.toml +++ b/ioxd_router/Cargo.toml @@ -11,7 +11,7 @@ iox_catalog = { path = "../iox_catalog" } ioxd_common = { path = "../ioxd_common" } metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } router = { path = "../router" } sharder = { path = "../sharder" } diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 60838a8e28..f04cb909ef 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -10,7 +10,7 @@ bytes = "1.2" futures = "0.3" iox_time = { version = "0.1.0", path = "../iox_time" } metric = { version = "0.1.0", path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" pin-project = "1.0.12" tokio = { version = "1.21", features = ["io-util"] } workspace-hack = { path = "../workspace-hack" } diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 6fd9bafa4f..783b1ddca4 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -14,7 +14,7 @@ datafusion_util = { path = "../datafusion_util" } futures = "0.3" generated_types = { path = "../generated_types" } iox_time = { path = "../iox_time" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet = {version = "23.0.0", features = ["experimental"]} diff --git a/parquet_to_line_protocol/Cargo.toml b/parquet_to_line_protocol/Cargo.toml index 9b4cc08004..5273a01dd3 100644 --- a/parquet_to_line_protocol/Cargo.toml +++ b/parquet_to_line_protocol/Cargo.toml @@ -10,7 +10,7 @@ datafusion = { path = "../datafusion" } influxdb_line_protocol = { path = "../influxdb_line_protocol" } futures = {version = "0.3"} num_cpus = "1.13.1" -object_store = { version = "0.5.0" } +object_store = { version = "0.5.1" } parquet_file = { path = "../parquet_file" } schema = { path = "../schema" } 
tokio = "1.0" diff --git a/predicate/Cargo.toml b/predicate/Cargo.toml index 9bf303b6c1..e1d423255f 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -13,9 +13,9 @@ itertools = "0.10" observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" -sqlparser = "0.24.0" +sqlparser = "0.25.0" workspace-hack = { path = "../workspace-hack"} [dev-dependencies] diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index 03b52e521d..633a345e50 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -12,7 +12,6 @@ pub mod delete_expr; pub mod delete_predicate; -pub mod rewrite; pub mod rpc_predicate; use arrow::{ diff --git a/predicate/src/rpc_predicate.rs b/predicate/src/rpc_predicate.rs index 2836a6e57e..833dfdc063 100644 --- a/predicate/src/rpc_predicate.rs +++ b/predicate/src/rpc_predicate.rs @@ -1,19 +1,23 @@ +mod column_rewrite; mod field_rewrite; mod measurement_rewrite; +mod rewrite; mod value_rewrite; -use crate::{rewrite, Predicate}; +use crate::Predicate; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::lit; use datafusion::logical_plan::{ - Column, Expr, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, + Column, Expr, ExprRewritable, ExprSchema, ExprSchemable, ExprSimplifiable, SimplifyInfo, }; +use observability_deps::tracing::{debug, trace}; use schema::Schema; use std::collections::BTreeSet; use std::sync::Arc; +use self::column_rewrite::MissingColumnRewriter; use self::field_rewrite::FieldProjectionRewriter; use self::measurement_rewrite::rewrite_measurement_references; use self::value_rewrite::rewrite_field_value_references; @@ -187,6 +191,7 @@ fn normalize_predicate( let mut predicate = predicate.clone(); let mut field_projections = FieldProjectionRewriter::new(Arc::clone(&schema)); + let 
mut missing_columums = MissingColumnRewriter::new(Arc::clone(&schema)); let mut field_value_exprs = vec![]; @@ -194,24 +199,38 @@ fn normalize_predicate( .exprs .into_iter() .map(|e| { - rewrite_measurement_references(table_name, e) + debug!(?e, "rewriting expr"); + + let e = rewrite_measurement_references(table_name, e) + .map(|e| log_rewrite(e, "rewrite_measurement_references")) // Rewrite any references to `_value = some_value` to literal true values. // Keeps track of these expressions, which can then be used to // augment field projections with conditions using `CASE` statements. .and_then(|e| rewrite_field_value_references(&mut field_value_exprs, e)) + .map(|e| log_rewrite(e, "rewrite_field_value_references")) // Rewrite any references to `_field` with a literal // and keep track of referenced field names to add to // the field column projection set. .and_then(|e| field_projections.rewrite_field_exprs(e)) + .map(|e| log_rewrite(e, "field_projections")) + // remove references to columns that don't exist in this schema + .and_then(|e| e.rewrite(&mut missing_columums)) + .map(|e| log_rewrite(e, "missing_columums")) // apply IOx specific rewrites (that unlock other simplifications) .and_then(rewrite::rewrite) - // Call the core DataFusion simplification logic + .map(|e| log_rewrite(e, "rewrite")) + // Call DataFusion simplification logic .and_then(|e| { let adapter = SimplifyAdapter::new(schema.as_ref()); // simplify twice to ensure "full" cleanup e.simplify(&adapter)?.simplify(&adapter) }) + .map(|e| log_rewrite(e, "simplify_expr")) .and_then(rewrite::simplify_predicate) + .map(|e| log_rewrite(e, "simplify_expr")); + + debug!(?e, "rewritten expr"); + e }) // Filter out literal true so is_empty works correctly .filter(|f| match f { @@ -227,6 +246,11 @@ fn normalize_predicate( field_projections.add_to_predicate(predicate) } +fn log_rewrite(expr: Expr, description: &str) -> Expr { + trace!(?expr, %description, "After rewrite"); + expr +} + struct 
SimplifyAdapter<'a> { schema: &'a Schema, execution_props: ExecutionProps, @@ -290,9 +314,27 @@ mod tests { use super::*; use arrow::datatypes::DataType; - use datafusion::logical_plan::{col, lit}; + use datafusion::{ + logical_plan::{col, lit}, + scalar::ScalarValue, + }; use test_helpers::assert_contains; + #[test] + fn test_normalize_predicate_coerced() { + let schema = schema(); + let predicate = normalize_predicate( + "table", + Arc::clone(&schema), + &Predicate::new().with_expr(col("t1").eq(lit("f1"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(col("t1").eq(lit("f1"))); + + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite() { let predicate = normalize_predicate( @@ -336,6 +378,20 @@ mod tests { assert_eq!(predicate, expected); } + #[test] + fn test_normalize_predicate_field_non_tag() { + // should treat + let predicate = normalize_predicate( + "table", + schema(), + &Predicate::new().with_expr(col("not_a_tag").eq(lit("blarg"))), + ) + .unwrap(); + + let expected = Predicate::new().with_expr(lit(ScalarValue::Boolean(None))); + assert_eq!(predicate, expected); + } + #[test] fn test_normalize_predicate_field_rewrite_multi_field_unsupported() { let err = normalize_predicate( diff --git a/predicate/src/rpc_predicate/column_rewrite.rs b/predicate/src/rpc_predicate/column_rewrite.rs new file mode 100644 index 0000000000..7a29331fca --- /dev/null +++ b/predicate/src/rpc_predicate/column_rewrite.rs @@ -0,0 +1,99 @@ +use std::sync::Arc; + +use datafusion::{ + error::Result as DataFusionResult, logical_plan::ExprRewriter, prelude::*, scalar::ScalarValue, +}; +use schema::Schema; + +/// Logic for rewriting expressions from influxrpc that reference non +/// existent columns to NULL +#[derive(Debug)] +pub(crate) struct MissingColumnRewriter { + /// The input schema + schema: Arc, +} + +impl MissingColumnRewriter { + /// Create a new [`MissingColumnRewriter`] targeting the given schema + pub(crate) fn new(schema: 
Arc) -> Self { + Self { schema } + } + + fn column_exists(&self, col: &Column) -> DataFusionResult { + // todo a real error here (rpc_predicates shouldn't have table/relation qualifiers) + assert!(col.relation.is_none()); + + if self.schema.find_index_of(&col.name).is_some() { + Ok(true) + } else { + Ok(false) + } + } +} + +fn lit_null() -> Expr { + lit(ScalarValue::Utf8(None)) +} + +impl ExprRewriter for MissingColumnRewriter { + fn mutate(&mut self, expr: Expr) -> DataFusionResult { + Ok(match expr { + Expr::Column(col) if !self.column_exists(&col)? => lit_null(), + expr => expr, + }) + } +} + +#[cfg(test)] +mod tests { + use datafusion::{arrow::datatypes::DataType, logical_plan::ExprRewritable}; + use schema::SchemaBuilder; + + use super::*; + + #[test] + fn all_columns_defined_no_rewrite() { + // t1 = "foo" + let expr = col("t1").eq(lit("foo")); + assert_eq!(rewrite(expr.clone()), expr); + + // f1 > 1.0 + let expr = col("f1").gt(lit(1.0)); + assert_eq!(rewrite(expr.clone()), expr); + } + + #[test] + fn all_columns_not_defined() { + // non_defined = "foo" --> NULL = "foo" + let expr = col("non_defined").eq(lit("foo")); + let expected = lit_null().eq(lit("foo")); + assert_eq!(rewrite(expr), expected); + + // non_defined = 1.4 --> NULL = 1.4 + let expr = col("non_defined").eq(lit(1.4)); + // No type is inferred so this is a literal null string (even though it maybe should be a literal float) + let expected = lit_null().eq(lit(1.4)); + assert_eq!(rewrite(expr), expected); + } + + #[test] + fn some_columns_not_defined() { + // t1 = "foo" AND non_defined = "bar" --> t1 = "foo" and NULL = "bar" + let expr = col("t1") + .eq(lit("foo")) + .and(col("non_defined").eq(lit("bar"))); + let expected = col("t1").eq(lit("foo")).and(lit_null().eq(lit("bar"))); + assert_eq!(rewrite(expr), expected); + } + + fn rewrite(expr: Expr) -> Expr { + let schema = SchemaBuilder::new() + .tag("t1") + .field("f1", DataType::Int64) + .build() + .unwrap(); + + let mut rewriter = 
MissingColumnRewriter::new(Arc::new(schema)); + expr.rewrite(&mut rewriter).unwrap() + } +} diff --git a/predicate/src/rpc_predicate/field_rewrite.rs b/predicate/src/rpc_predicate/field_rewrite.rs index 3cccfa219a..3f983a28e7 100644 --- a/predicate/src/rpc_predicate/field_rewrite.rs +++ b/predicate/src/rpc_predicate/field_rewrite.rs @@ -55,8 +55,8 @@ impl FieldProjectionRewriter { } } - // Rewrites the predicate. See the description on - // [`FieldProjectionRewriter`] for more details. + /// Rewrites the predicate. See the description on + /// [`FieldProjectionRewriter`] for more details. pub(crate) fn rewrite_field_exprs(&mut self, expr: Expr) -> DataFusionResult { // for predicates like `A AND B AND C` // rewrite `A`, `B` and `C` separately and put them back together diff --git a/predicate/src/rewrite.rs b/predicate/src/rpc_predicate/rewrite.rs similarity index 100% rename from predicate/src/rewrite.rs rename to predicate/src/rpc_predicate/rewrite.rs diff --git a/querier/Cargo.toml b/querier/Cargo.toml index 9d55643c4b..02fe680c07 100644 --- a/querier/Cargo.toml +++ b/querier/Cargo.toml @@ -18,7 +18,7 @@ generated_types = { path = "../generated_types" } influxdb_iox_client = { path = "../influxdb_iox_client" } iox_catalog = { path = "../iox_catalog" } metric = { path = "../metric" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" parquet_file = { path = "../parquet_file" } diff --git a/querier/src/cache/read_buffer.rs b/querier/src/cache/read_buffer.rs index 4c68bcac9d..63138e242a 100644 --- a/querier/src/cache/read_buffer.rs +++ b/querier/src/cache/read_buffer.rs @@ -470,9 +470,9 @@ mod tests { .into_iter() .map(lp_to_record_batch) .map(Arc::new) - .collect(); + .collect::>(); - let stream = stream_from_batches(batches); + let stream = stream_from_batches(batches[0].schema(), batches); let metric_registry = metric::Registry::new(); diff --git a/querier/src/chunk/query_access.rs 
b/querier/src/chunk/query_access.rs index 0edf477ec7..dc94a55b69 100644 --- a/querier/src/chunk/query_access.rs +++ b/querier/src/chunk/query_access.rs @@ -7,13 +7,16 @@ use arrow::{ use data_types::{ ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary, TimestampMinMax, }; -use datafusion::physical_plan::{ - stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, +use datafusion::{ + error::DataFusionError, + physical_plan::{ + stream::RecordBatchStreamAdapter, RecordBatchStream, SendableRecordBatchStream, + }, }; use futures::{Stream, TryStreamExt}; use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use observability_deps::tracing::debug; use predicate::Predicate; @@ -114,7 +117,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, columns: Selection<'_>, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("projection", format!("{}", columns)); ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -161,10 +164,10 @@ impl QueryChunk for QuerierChunk { None } Err(other) => { - return Err(Box::new(Error::RBChunk { + return Err(DataFusionError::External(Box::new(Error::RBChunk { source: other, chunk_id: self.id(), - })) + }))) } }; @@ -178,7 +181,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, column_name: &str, predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { ctx.set_metadata("column_name", column_name.to_string()); ctx.set_metadata("predicate", format!("{}", &predicate)); @@ -205,11 +208,13 @@ impl QueryChunk for QuerierChunk { }; ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate)); - let mut values = rb_chunk.column_values( - rb_predicate, - Selection::Some(&[column_name]), - BTreeMap::new(), - )?; + let mut values = rb_chunk + .column_values( + rb_predicate, + 
Selection::Some(&[column_name]), + BTreeMap::new(), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // The InfluxRPC frontend only supports getting column values // for one column at a time (this is a restriction on the Influx @@ -221,7 +226,8 @@ impl QueryChunk for QuerierChunk { .context(ColumnNameNotFoundSnafu { chunk_id: self.id(), column_name, - })?; + }) + .map_err(|e| DataFusionError::External(Box::new(e)))?; ctx.set_metadata("output_values", values.len() as i64); Ok(Some(values)) @@ -234,7 +240,7 @@ impl QueryChunk for QuerierChunk { mut ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { let span_recorder = SpanRecorder::new( ctx.span() .map(|span| span.child("QuerierChunk::read_filter")), diff --git a/querier/src/ingester/mod.rs b/querier/src/ingester/mod.rs index aac2635c29..9c9f7a8910 100644 --- a/querier/src/ingester/mod.rs +++ b/querier/src/ingester/mod.rs @@ -11,6 +11,7 @@ use data_types::{ ChunkId, ChunkOrder, IngesterMapping, PartitionId, SequenceNumber, ShardId, ShardIndex, TableSummary, TimestampMinMax, }; +use datafusion::error::DataFusionError; use datafusion_util::MemoryStream; use futures::{stream::FuturesUnordered, TryStreamExt}; use generated_types::{ @@ -24,7 +25,7 @@ use influxdb_iox_client::flight::{ use iox_query::{ exec::{stringset::StringSet, IOxSessionContext}, util::compute_timenanosecond_min_max, - QueryChunk, QueryChunkError, QueryChunkMeta, + QueryChunk, QueryChunkMeta, }; use iox_time::{Time, TimeProvider}; use metric::{DurationHistogram, Metric}; @@ -612,9 +613,7 @@ impl IngesterStreamDecoder { partition_id, shard_id, status.parquet_max_sequence_number.map(SequenceNumber::new), - status - .tombstone_max_sequence_number - .map(SequenceNumber::new), + None, partition_sort_key, ); self.current_partition = Some(partition); @@ -1097,7 +1096,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _predicate: &Predicate, _columns: Selection<'_>, - ) -> Result, 
QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1107,7 +1106,7 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, _column_name: &str, _predicate: &Predicate, - ) -> Result, QueryChunkError> { + ) -> Result, DataFusionError> { // TODO maybe some special handling? Ok(None) } @@ -1117,11 +1116,15 @@ impl QueryChunk for IngesterChunk { _ctx: IOxSessionContext, predicate: &Predicate, selection: Selection<'_>, - ) -> Result { + ) -> Result { trace!(?predicate, ?selection, input_batches=?self.batches, "Reading data"); // Apply selection to in-memory batch - let batches = match self.schema.df_projection(selection)? { + let batches = match self + .schema + .df_projection(selection) + .map_err(|e| DataFusionError::External(Box::new(e)))? + { None => self.batches.clone(), Some(projection) => self .batches @@ -1333,7 +1336,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, ))], @@ -1389,7 +1391,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1399,7 +1400,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1409,7 +1409,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: None, - tombstone_max_sequence_number: None, }), }, )), @@ -1489,7 +1488,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1519,7 +1517,6 @@ mod tests { partition_id: 2, status: Some(PartitionStatus { parquet_max_sequence_number: Some(21), - tombstone_max_sequence_number: Some(22), }), }, )), @@ -1544,7 +1541,6 @@ mod tests { partition_id: 3, status: Some(PartitionStatus { 
parquet_max_sequence_number: Some(31), - tombstone_max_sequence_number: Some(32), }), }, )), @@ -1574,10 +1570,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 2); assert_eq!(p1.chunks[0].schema().as_arrow(), schema_1_1); assert_eq!(p1.chunks[0].batches.len(), 2); @@ -1594,10 +1587,7 @@ mod tests { p2.parquet_max_sequence_number, Some(SequenceNumber::new(21)) ); - assert_eq!( - p2.tombstone_max_sequence_number, - Some(SequenceNumber::new(22)) - ); + assert_eq!(p2.tombstone_max_sequence_number, None); assert_eq!(p2.chunks.len(), 1); assert_eq!(p2.chunks[0].schema().as_arrow(), schema_2_1); assert_eq!(p2.chunks[0].batches.len(), 1); @@ -1610,10 +1600,7 @@ mod tests { p3.parquet_max_sequence_number, Some(SequenceNumber::new(31)) ); - assert_eq!( - p3.tombstone_max_sequence_number, - Some(SequenceNumber::new(32)) - ); + assert_eq!(p3.tombstone_max_sequence_number, None); assert_eq!(p3.chunks.len(), 1); assert_eq!(p3.chunks[0].schema().as_arrow(), schema_3_1); assert_eq!(p3.chunks[0].batches.len(), 1); @@ -1733,7 +1720,6 @@ mod tests { partition_id: 1, status: Some(PartitionStatus { parquet_max_sequence_number: Some(11), - tombstone_max_sequence_number: Some(12), }), }, )), @@ -1773,10 +1759,7 @@ mod tests { p1.parquet_max_sequence_number, Some(SequenceNumber::new(11)) ); - assert_eq!( - p1.tombstone_max_sequence_number, - Some(SequenceNumber::new(12)) - ); + assert_eq!(p1.tombstone_max_sequence_number, None); assert_eq!(p1.chunks.len(), 1); } diff --git a/querier/src/namespace/query_access.rs b/querier/src/namespace/query_access.rs index b7451000b3..30b9975a06 100644 --- a/querier/src/namespace/query_access.rs +++ b/querier/src/namespace/query_access.rs @@ -11,10 +11,11 @@ use data_types::NamespaceId; use datafusion::{ catalog::{catalog::CatalogProvider, 
schema::SchemaProvider}, datasource::TableProvider, + error::DataFusionError, }; use iox_query::{ exec::{ExecutionContextProvider, ExecutorType, IOxSessionContext}, - QueryChunk, QueryCompletedToken, QueryDatabase, QueryDatabaseError, QueryText, DEFAULT_SCHEMA, + QueryChunk, QueryCompletedToken, QueryDatabase, QueryText, DEFAULT_SCHEMA, }; use observability_deps::tracing::{debug, trace}; use predicate::{rpc_predicate::QueryDatabaseMeta, Predicate}; @@ -40,8 +41,9 @@ impl QueryDatabase for QuerierNamespace { &self, table_name: &str, predicate: &Predicate, + projection: &Option>, ctx: IOxSessionContext, - ) -> Result>, QueryDatabaseError> { + ) -> Result>, DataFusionError> { debug!(%table_name, %predicate, "Finding chunks for table"); // get table metadata let table = match self.tables.get(table_name).map(Arc::clone) { @@ -57,7 +59,7 @@ impl QueryDatabase for QuerierNamespace { .chunks( predicate, ctx.span().map(|span| span.child("querier table chunks")), - &None, // todo: pushdown projection to chunks + projection, ) .await?; @@ -627,7 +629,7 @@ mod tests { .unwrap_err(); assert_eq!( err.to_string(), - format!("Cannot build plan: External error: Chunk pruning failed: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. Try adjusting your compactor settings or increasing the per query memory limit."), + format!("Cannot build plan: Resources exhausted: Query would scan at least {total_size} bytes, more than configured maximum {limit} bytes. 
Try adjusting your compactor settings or increasing the per query memory limit."), ); } diff --git a/querier/src/table/mod.rs b/querier/src/table/mod.rs index 19835fde6f..767fa6c83a 100644 --- a/querier/src/table/mod.rs +++ b/querier/src/table/mod.rs @@ -8,6 +8,7 @@ use crate::{ IngesterConnection, }; use data_types::{ColumnId, PartitionId, ShardIndex, TableId, TimestampMinMax}; +use datafusion::error::DataFusionError; use futures::{join, StreamExt}; use iox_query::pruning::prune_summaries; use iox_query::{exec::Executor, provider, provider::ChunkPruner, QueryChunk}; @@ -65,6 +66,17 @@ pub enum Error { pub type Result = std::result::Result; +impl From for DataFusionError { + fn from(err: Error) -> Self { + match err { + Error::ChunkPruning { + source: err @ provider::Error::TooMuchData { .. }, + } => Self::ResourcesExhausted(err.to_string()), + _ => Self::External(Box::new(err) as _), + } + } +} + /// Args to create a [`QuerierTable`]. pub struct QuerierTableArgs { pub sharder: Arc>>, diff --git a/querier/src/table/query_access/mod.rs b/querier/src/table/query_access/mod.rs index 5665f79171..e16830577b 100644 --- a/querier/src/table/query_access/mod.rs +++ b/querier/src/table/query_access/mod.rs @@ -66,8 +66,7 @@ impl TableProvider for QuerierTable { ctx.child_span("querier table chunks"), projection, ) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; + .await?; for chunk in chunks { builder = builder.add_chunk(chunk); diff --git a/querier/src/table/state_reconciler.rs b/querier/src/table/state_reconciler.rs index baa2935911..d5fe4cced6 100644 --- a/querier/src/table/state_reconciler.rs +++ b/querier/src/table/state_reconciler.rs @@ -23,6 +23,7 @@ use crate::{ use self::interface::{IngesterPartitionInfo, ParquetFileInfo, TombstoneInfo}; #[derive(Snafu, Debug)] +#[allow(missing_copy_implementations)] pub enum ReconcileError { #[snafu(display("Compactor processed file that the querier would need to split apart which is not yet implemented"))] 
CompactorConflict, diff --git a/query_tests/cases/in/delete_all.expected b/query_tests/cases/in/delete_all.expected deleted file mode 100644 index ba828eab9a..0000000000 --- a/query_tests/cases/in/delete_all.expected +++ /dev/null @@ -1,25 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunkDeleteAll --- SQL: SELECT * from cpu; -++ -++ --- SQL: SELECT time from cpu; -++ -++ --- SQL: SELECT count(*), count(bar), count(time) from cpu; -+-----------------+----------------+-----------------+ -| COUNT(UInt8(1)) | COUNT(cpu.bar) | COUNT(cpu.time) | -+-----------------+----------------+-----------------+ -| 0 | 0 | 0 | -+-----------------+----------------+-----------------+ --- SQL: SELECT min(bar), max(bar), min(time), max(time) from cpu; -+--------------+--------------+---------------+---------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+--------------+--------------+---------------+---------------+ -| | | | | -+--------------+--------------+---------------+---------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| | -+--------------+ diff --git a/query_tests/cases/in/delete_all.sql b/query_tests/cases/in/delete_all.sql deleted file mode 100644 index b79612846e..0000000000 --- a/query_tests/cases/in/delete_all.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunkDeleteAll - --- select * -SELECT * from cpu; - --- select one specific column -SELECT time from cpu; - --- select aggregate of every column inlcuding star -SELECT count(*), count(bar), count(time) from cpu; - --- select aggregate of every column -SELECT min(bar), max(bar), min(time), max(time) from cpu; - --- select aggregate of one column -SELECT max(bar) from cpu; \ No newline at end of file diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_multi_expr_one_chunk.expected deleted file mode 100644 
index f0765f7c16..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.expected +++ /dev/null @@ -1,207 +0,0 @@ --- Test Setup: OneDeleteMultiExprsOneChunk --- SQL: SELECT * from cpu order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by time, bar; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000020Z | 2 | -| 1970-01-01T00:00:00.000000040Z | 1 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 2 | -+-----+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 2 | 2 | 2 | 1 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000040Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 2 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 2 | -+----------------+ --- SQL: 
SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 2 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu; --- Results After Sorting -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT min(foo) as min_foo from cpu order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu order by max_foo; -+---------+ -| max_foo | -+---------+ -| you | -+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by time order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo; -+--------------------------------+---------+ -| time | max_foo | -+--------------------------------+---------+ -| 1970-01-01T00:00:00.000000020Z | you | -| 1970-01-01T00:00:00.000000040Z | me | -+--------------------------------+---------+ --- SQL: SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -+---------+ -| min_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -+-----+---------+ -| bar | max_foo | -+-----+---------+ -| 1 | me | -| 2 | you | -+-----+---------+ --- SQL: SELECT max(foo) as max_foo from cpu group by time order by max_foo; -+---------+ -| max_foo | -+---------+ -| me | -| you | -+---------+ --- SQL: SELECT min(time) as min_time from cpu order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu order by max_time; 
-+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT min(time) as min_time from cpu group by bar order by min_time; -+--------------------------------+ -| min_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -+-----+--------------------------------+ -| bar | min_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000040Z | -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by foo order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; -+-----+--------------------------------+ -| foo | max_time | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time from cpu; --- Results After Sorting -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu order by 1; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 2 | -+--------------+ --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 2 | you | 1970-01-01T00:00:00.000000020Z | 
-+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 2 | you | 1970-01-01T00:00:00.000000020Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_multi_expr_one_chunk.sql deleted file mode 100644 index 5295c53055..0000000000 --- a/query_tests/cases/in/delete_multi_expr_one_chunk.sql +++ /dev/null @@ -1,61 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteMultiExprsOneChunk - --- select * -SELECT * from cpu order by bar, foo, time; - -SELECT time, bar from cpu order by time, bar; - -SELECT bar from cpu order by bar; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT count(*) from cpu; - -SELECT min(bar) from cpu; - --- IOX_COMPARE: sorted -SELECT foo from cpu; - -SELECT min(foo) as min_foo from cpu order by min_foo; -SELECT max(foo) as max_foo from cpu order by max_foo; - -SELECT min(foo) as min_foo from cpu group by time order by min_foo; 
-SELECT max(foo) as max_foo from cpu group by time order by max_foo; -SELECT time, max(foo) as max_foo from cpu group by time order by time, max_foo; - -SELECT min(foo) as min_foo from cpu group by bar order by min_foo; -SELECT bar, max(foo) as max_foo from cpu group by bar order by bar, max_foo; -SELECT max(foo) as max_foo from cpu group by time order by max_foo; - -SELECT min(time) as min_time from cpu order by min_time; -SELECT max(time) as max_time from cpu order by max_time; - -SELECT min(time) as min_time from cpu group by bar order by min_time; -SELECT bar, min(time) as min_time from cpu group by bar order by bar, min_time; -SELECT max(time) as max_time from cpu group by foo order by max_time; -SELECT foo, max(time) as max_time from cpu group by foo order by foo, max_time; - --- IOX_COMPARE: sorted -SELECT time from cpu; - -SELECT max(bar) from cpu order by 1; - --------------------------------------------------------- --- With selection predicate - -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.expected b/query_tests/cases/in/delete_simple_pred_one_chunk.expected deleted file mode 100644 index f367cdefef..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.expected +++ /dev/null @@ -1,91 +0,0 @@ --- Test Setup: OneDeleteSimpleExprOneChunk --- SQL: SELECT * from cpu; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 
1970-01-01T00:00:00.000000020Z | 2 | -+--------------------------------+-----+ --- SQL: SELECT min(bar), max(bar) from cpu; -+--------------+--------------+ -| MIN(cpu.bar) | MAX(cpu.bar) | -+--------------+--------------+ -| 2 | 2 | -+--------------+--------------+ --- SQL: SELECT time from cpu; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; -+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT min(time) from cpu group by bar; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar, min(time) from cpu group by bar; -+-----+--------------------------------+ -| bar | MIN(cpu.time) | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT count(time), max(time) from cpu; -+-----------------+--------------------------------+ -| COUNT(cpu.time) | MAX(cpu.time) | -+-----------------+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000020Z | -+-----------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 1 | -+-----------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | 
-+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT * from cpu where bar = 2.0; -+-----+--------------------------------+ -| bar | time | -+-----+--------------------------------+ -| 2 | 1970-01-01T00:00:00.000000020Z | -+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar != 2.0; -++ -++ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ -| 1 | 1 | 1 | 2 | 2 | 1970-01-01T00:00:00.000000020Z | 1970-01-01T00:00:00.000000020Z | -+-----------------+-----------------+----------------+--------------+--------------+--------------------------------+--------------------------------+ --- SQL: SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| COUNT(cpu.time) | COUNT(UInt8(1)) | COUNT(cpu.bar) | MIN(cpu.bar) | MAX(cpu.bar) | MIN(cpu.time) | MAX(cpu.time) | -+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ -| 0 | 0 | 0 | | | | | 
-+-----------------+-----------------+----------------+--------------+--------------+---------------+---------------+ --- SQL: SELECT time from cpu where bar=2; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000020Z | -+--------------------------------+ --- SQL: SELECT bar from cpu where bar!= 2; -++ -++ diff --git a/query_tests/cases/in/delete_simple_pred_one_chunk.sql b/query_tests/cases/in/delete_simple_pred_one_chunk.sql deleted file mode 100644 index 7b22641c63..0000000000 --- a/query_tests/cases/in/delete_simple_pred_one_chunk.sql +++ /dev/null @@ -1,37 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: OneDeleteSimpleExprOneChunk - --- select * -SELECT * from cpu; - -SELECT time, bar from cpu; - -SELECT min(bar), max(bar) from cpu; - -SELECT time from cpu; - -SELECT max(time) from cpu; -SELECT min(time) from cpu group by bar; -SELECT bar, min(time) from cpu group by bar; - -SELECT count(time), max(time) from cpu; - -SELECT count(time) from cpu; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu; - ----------------------------------------------------------------- --- Now add selection predicate -SELECT * from cpu where bar = 2.0; - -SELECT * from cpu where bar != 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar= 2.0; - -SELECT count(time), count(*), count(bar), min(bar), max(bar), min(time), max(time) from cpu where bar != 2.0; - -SELECT time from cpu where bar=2; - -SELECT bar from cpu where bar!= 2; - - diff --git a/query_tests/cases/in/delete_three_chunks_1.expected b/query_tests/cases/in/delete_three_chunks_1.expected deleted file mode 100644 index 47ec3d3de4..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.expected +++ /dev/null @@ -1,85 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu order by foo, bar, time; 
-+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT time, bar from cpu order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT bar from cpu order by bar; -+-----+ -| bar | -+-----+ -| 1 | -| 1 | -| 1 | -| 3 | -| 4 | -| 5 | -| 7 | -+-----+ --- SQL: SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; -+---+---+---+----+--------------------------------+--------------------------------+ -| t | c | b | mi | mt | mat | -+---+---+---+----+--------------------------------+--------------------------------+ -| 7 | 7 | 7 | 1 | 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+---+---+---+----+--------------------------------+--------------------------------+ --- SQL: SELECT count(time) from cpu; -+-----------------+ -| COUNT(cpu.time) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT count(foo) from cpu; -+----------------+ -| COUNT(cpu.foo) | -+----------------+ -| 7 | -+----------------+ --- SQL: SELECT count(bar) from cpu; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 7 | -+----------------+ --- 
SQL: SELECT count(*) from cpu; -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 7 | -+-----------------+ --- SQL: SELECT min(bar) from cpu; -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT foo from cpu order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ diff --git a/query_tests/cases/in/delete_three_chunks_1.sql b/query_tests/cases/in/delete_three_chunks_1.sql deleted file mode 100644 index c0105412e9..0000000000 --- a/query_tests/cases/in/delete_three_chunks_1.sql +++ /dev/null @@ -1,23 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --- select * -SELECT * from cpu order by foo, bar, time; - -SELECT time, bar from cpu order by bar, time; - -SELECT bar from cpu order by bar; - -SELECT count(time) as t, count(*) as c, count(bar) as b, min(bar) as mi, min(time) as mt, max(time) as mat from cpu order by t, c, b, mi, mt, mat; - -SELECT count(time) from cpu; - -SELECT count(foo) from cpu; - -SELECT count(bar) from cpu; - -SELECT count(*) from cpu; - -SELECT min(bar) from cpu; - -SELECT foo from cpu order by foo; diff --git a/query_tests/cases/in/delete_three_chunks_2.expected b/query_tests/cases/in/delete_three_chunks_2.expected deleted file mode 100644 index 99fda88e70..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.expected +++ /dev/null @@ -1,77 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT min(foo) from cpu; -+--------------+ -| MIN(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT max(foo) from cpu; -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| you | -+--------------+ --- SQL: SELECT min(time) from cpu; -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT max(time) from cpu; 
-+--------------------------------+ -| MAX(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT foo, min(time) from cpu group by foo; --- Results After Sorting -+-----+--------------------------------+ -| foo | MIN(cpu.time) | -+-----+--------------------------------+ -| me | 1970-01-01T00:00:00.000000040Z | -| you | 1970-01-01T00:00:00.000000070Z | -+-----+--------------------------------+ --- SQL: SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -+-----+--------------------------------+ -| bar | max_time | -+-----+--------------------------------+ -| 1 | 1970-01-01T00:00:00.000000062Z | -| 3 | 1970-01-01T00:00:00.000000070Z | -| 4 | 1970-01-01T00:00:00.000000050Z | -| 5 | 1970-01-01T00:00:00.000000060Z | -| 7 | 1970-01-01T00:00:00.000000080Z | -+-----+--------------------------------+ --- SQL: SELECT max(time) as max_time from cpu group by bar order by max_time; -+--------------------------------+ -| max_time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z | -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT time from cpu order by time; -+--------------------------------+ -| time | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -| 1970-01-01T00:00:00.000000042Z | -| 1970-01-01T00:00:00.000000050Z | -| 1970-01-01T00:00:00.000000060Z | -| 1970-01-01T00:00:00.000000062Z | -| 1970-01-01T00:00:00.000000070Z | -| 1970-01-01T00:00:00.000000080Z | -+--------------------------------+ --- SQL: SELECT max(bar) from cpu; -+--------------+ -| MAX(cpu.bar) | -+--------------+ -| 7 | -+--------------+ --- SQL: SELECT min(time), max(time) from cpu; -+--------------------------------+--------------------------------+ -| MIN(cpu.time) | MAX(cpu.time) | 
-+--------------------------------+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | 1970-01-01T00:00:00.000000080Z | -+--------------------------------+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_2.sql b/query_tests/cases/in/delete_three_chunks_2.sql deleted file mode 100644 index bb35711393..0000000000 --- a/query_tests/cases/in/delete_three_chunks_2.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - -SELECT min(foo) from cpu; -SELECT max(foo) from cpu; - -SELECT min(time) from cpu; -SELECT max(time) from cpu; - --- IOX_COMPARE: sorted -SELECT foo, min(time) from cpu group by foo; -SELECT bar, max(time) as max_time from cpu group by bar order by bar, max_time; -SELECT max(time) as max_time from cpu group by bar order by max_time; - -SELECT time from cpu order by time; - -SELECT max(bar) from cpu; - -SELECT min(time), max(time) from cpu; diff --git a/query_tests/cases/in/delete_three_chunks_3.expected b/query_tests/cases/in/delete_three_chunks_3.expected deleted file mode 100644 index 3e0c5fb2f6..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.expected +++ /dev/null @@ -1,76 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar != 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 
1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where bar = 1 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; -++ -++ --- SQL: SELECT min(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MIN(cpu.bar) | -+--------------+ -| 1 | -+--------------+ --- SQL: SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------+ -| MAX(cpu.foo) | -+--------------+ -| me | -+--------------+ --- SQL: SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+--------------------------------+ -| MIN(cpu.time) | -+--------------------------------+ -| 1970-01-01T00:00:00.000000040Z | -+--------------------------------+ --- SQL: SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 6 | -+----------------+ --- SQL: SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(cpu.time) | 
-+-----------------+ -| 6 | -+-----------------+ --- SQL: SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); -+-----------------+ -| COUNT(UInt8(1)) | -+-----------------+ -| 6 | -+-----------------+ diff --git a/query_tests/cases/in/delete_three_chunks_3.sql b/query_tests/cases/in/delete_three_chunks_3.sql deleted file mode 100644 index 146fcaf95e..0000000000 --- a/query_tests/cases/in/delete_three_chunks_3.sql +++ /dev/null @@ -1,27 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - --------------------------------------------------------- --- With selection predicate - -SELECT * from cpu where bar != 1.0 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and bar > 2.0 order by bar, foo, time; - -SELECT * from cpu where bar = 1 order by bar, foo, time; - -SELECT * from cpu where foo = 'me' and (bar > 2 or bar = 1.0) order by bar, foo, time; - -SELECT * from cpu where foo = 'you' and (bar > 3.0 or bar = 1) order by bar, foo, time; - -SELECT min(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT max(foo) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT min(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(bar) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(time) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); - -SELECT count(*) from cpu where foo = 'me' and (bar > 2 or bar = 1.0); diff --git a/query_tests/cases/in/delete_three_chunks_4.expected b/query_tests/cases/in/delete_three_chunks_4.expected deleted file mode 100644 index 2283d15375..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.expected +++ /dev/null @@ -1,49 +0,0 @@ --- Test Setup: ThreeDeleteThreeChunks --- SQL: SELECT * from cpu where bar >= 1.0 order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 
1970-01-01T00:00:00.000000040Z | -| 1 | me | 1970-01-01T00:00:00.000000042Z | -| 1 | me | 1970-01-01T00:00:00.000000062Z | -| 3 | you | 1970-01-01T00:00:00.000000070Z | -| 4 | me | 1970-01-01T00:00:00.000000050Z | -| 5 | me | 1970-01-01T00:00:00.000000060Z | -| 7 | me | 1970-01-01T00:00:00.000000080Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu where bar >= 1.0 order by foo; -+-----+ -| foo | -+-----+ -| me | -| me | -| me | -| me | -| me | -| me | -| you | -+-----+ --- SQL: SELECT time, bar from cpu where bar >= 1.0 order by bar, time; -+--------------------------------+-----+ -| time | bar | -+--------------------------------+-----+ -| 1970-01-01T00:00:00.000000040Z | 1 | -| 1970-01-01T00:00:00.000000042Z | 1 | -| 1970-01-01T00:00:00.000000062Z | 1 | -| 1970-01-01T00:00:00.000000070Z | 3 | -| 1970-01-01T00:00:00.000000050Z | 4 | -| 1970-01-01T00:00:00.000000060Z | 5 | -| 1970-01-01T00:00:00.000000080Z | 7 | -+--------------------------------+-----+ --- SQL: SELECT * from cpu where foo = 'you' order by bar, foo, time; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 3 | you | 1970-01-01T00:00:00.000000070Z | -+-----+-----+--------------------------------+ --- SQL: SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; -+----+--------------------------------+ -| mi | ma | -+----+--------------------------------+ -| 3 | 1970-01-01T00:00:00.000000070Z | -+----+--------------------------------+ diff --git a/query_tests/cases/in/delete_three_chunks_4.sql b/query_tests/cases/in/delete_three_chunks_4.sql deleted file mode 100644 index 95442f6b07..0000000000 --- a/query_tests/cases/in/delete_three_chunks_4.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: ThreeDeleteThreeChunks - ----------- -SELECT * from cpu where bar >= 1.0 order by bar, foo, time; - -SELECT foo from cpu 
where bar >= 1.0 order by foo; - -SELECT time, bar from cpu where bar >= 1.0 order by bar, time; - -SELECT * from cpu where foo = 'you' order by bar, foo, time; - -SELECT min(bar) as mi, max(time) as ma from cpu where foo = 'you' order by mi, ma; diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected deleted file mode 100644 index 6871fa7358..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.expected +++ /dev/null @@ -1,34 +0,0 @@ --- Test Setup: TwoDeletesMultiExprsOneChunk --- SQL: SELECT * from cpu; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT foo from cpu; -+-----+ -| foo | -+-----+ -| me | -+-----+ --- SQL: SELECT * from cpu where cast(time as bigint) > 30; -+-----+-----+--------------------------------+ -| bar | foo | time | -+-----+-----+--------------------------------+ -| 1 | me | 1970-01-01T00:00:00.000000040Z | -+-----+-----+--------------------------------+ --- SQL: SELECT count(bar) from cpu where cast(time as bigint) > 30; -+----------------+ -| COUNT(cpu.bar) | -+----------------+ -| 1 | -+----------------+ --- SQL: SELECT * from cpu where cast(time as bigint) > 40; -++ -++ --- SQL: SELECT max(time) from cpu where cast(time as bigint) > 40; -+---------------+ -| MAX(cpu.time) | -+---------------+ -| | -+---------------+ diff --git a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql b/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql deleted file mode 100644 index 132d6f42cf..0000000000 --- a/query_tests/cases/in/delete_two_del_multi_expr_one_chunk.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Demonstrate soft deleted rows will not be return to queries --- IOX_SETUP: TwoDeletesMultiExprsOneChunk - --- select * -SELECT * from cpu; - -SELECT foo 
from cpu; - -SELECT * from cpu where cast(time as bigint) > 30; - -SELECT count(bar) from cpu where cast(time as bigint) > 30; - -SELECT * from cpu where cast(time as bigint) > 40; - -SELECT max(time) from cpu where cast(time as bigint) > 40; diff --git a/query_tests/src/cases.rs b/query_tests/src/cases.rs index 9946819fac..69caf0dfe5 100644 --- a/query_tests/src/cases.rs +++ b/query_tests/src/cases.rs @@ -1,8 +1,7 @@ - //! This file is auto generated by query_tests/generate. //! Do not edit manually --> will result in sadness -use std::path::Path; use crate::runner::Runner; +use std::path::Path; #[tokio::test] // Tests from "basic.sql", @@ -11,141 +10,8 @@ async fn test_cases_basic_sql() { let input_path = Path::new("cases").join("in").join("basic.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_all.sql", -async fn test_cases_delete_all_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_all.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_multi_expr_one_chunk.sql", -async fn test_cases_delete_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_simple_pred_one_chunk.sql", -async fn test_cases_delete_simple_pred_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_simple_pred_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - 
.expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_1.sql", -async fn test_cases_delete_three_chunks_1_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_1.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_2.sql", -async fn test_cases_delete_three_chunks_2_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_2.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_3.sql", -async fn test_cases_delete_three_chunks_3_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_3.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_three_chunks_4.sql", -async fn test_cases_delete_three_chunks_4_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_three_chunks_4.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} - -#[tokio::test] -// Tests from "delete_two_del_multi_expr_one_chunk.sql", -async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { - test_helpers::maybe_start_logging(); - - let input_path = Path::new("cases").join("in").join("delete_two_del_multi_expr_one_chunk.sql"); - let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner 
- .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -153,15 +19,12 @@ async fn test_cases_delete_two_del_multi_expr_one_chunk_sql() { async fn test_cases_duplicates_ingester_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("duplicates_ingester.sql"); + let input_path = Path::new("cases") + .join("in") + .join("duplicates_ingester.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -171,13 +34,8 @@ async fn test_cases_duplicates_parquet_sql() { let input_path = Path::new("cases").join("in").join("duplicates_parquet.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -185,15 +43,12 @@ async fn test_cases_duplicates_parquet_sql() { async fn test_cases_new_sql_system_tables_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("new_sql_system_tables.sql"); + let input_path = Path::new("cases") + .join("in") + .join("new_sql_system_tables.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -203,13 +58,8 @@ async fn test_cases_pushdown_sql() { let input_path = Path::new("cases").join("in").join("pushdown.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + 
runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -219,13 +69,8 @@ async fn test_cases_selectors_sql() { let input_path = Path::new("cases").join("in").join("selectors.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -235,13 +80,8 @@ async fn test_cases_several_chunks_sql() { let input_path = Path::new("cases").join("in").join("several_chunks.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -249,15 +89,12 @@ async fn test_cases_several_chunks_sql() { async fn test_cases_sql_information_schema_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("sql_information_schema.sql"); + let input_path = Path::new("cases") + .join("in") + .join("sql_information_schema.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -267,13 +104,8 @@ async fn test_cases_timestamps_sql() { let input_path = Path::new("cases").join("in").join("timestamps.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -283,13 +115,8 @@ async fn test_cases_two_chunks_sql() { let input_path = Path::new("cases").join("in").join("two_chunks.sql"); let mut runner = 
Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); } #[tokio::test] @@ -297,13 +124,10 @@ async fn test_cases_two_chunks_sql() { async fn test_cases_two_chunks_missing_columns_sql() { test_helpers::maybe_start_logging(); - let input_path = Path::new("cases").join("in").join("two_chunks_missing_columns.sql"); + let input_path = Path::new("cases") + .join("in") + .join("two_chunks_missing_columns.sql"); let mut runner = Runner::new(); - runner - .run(input_path) - .await - .expect("test failed"); - runner - .flush() - .expect("flush worked"); -} \ No newline at end of file + runner.run(input_path).await.expect("test failed"); + runner.flush().expect("flush worked"); +} diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index eecb583e6b..8d7339dafc 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -56,8 +56,6 @@ async fn test_field_columns_no_predicate() { run_field_columns_test_case(TwoMeasurementsManyFields {}, predicate, expected_fields).await; } -// NGA todo: add delete tests when the TwoMeasurementsManyFieldsWithDelete available - #[tokio::test] async fn test_field_columns_with_pred() { // get only fields from h20 (but both chunks) @@ -201,86 +199,6 @@ async fn test_field_name_plan() { run_field_columns_test_case(OneMeasurementManyFields {}, predicate, expected_fields).await; } -#[tokio::test] -async fn test_field_name_plan_with_delete() { - test_helpers::maybe_start_logging(); - - let predicate = Predicate::default().with_range(0, 2000); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - Field { - name: "field2".into(), - data_type: 
DataType::Utf8, - last_timestamp: 100, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 100, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - -#[tokio::test] -async fn test_field_name_plan_with_delete_all_time() { - test_helpers::maybe_start_logging(); - - let predicate = Predicate::default(); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_fields = FieldList { - fields: vec![ - Field { - name: "field1".into(), - data_type: DataType::Float64, - last_timestamp: 0, // all time queries are optimized but do not return timestamps - }, - Field { - name: "field2".into(), - data_type: DataType::Utf8, - last_timestamp: 0, - }, - Field { - name: "field3".into(), - data_type: DataType::Float64, - last_timestamp: 0, - }, - Field { - name: "field4".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - Field { - name: "field5".into(), - data_type: DataType::Boolean, - last_timestamp: 0, - }, - ], - }; - - run_field_columns_test_case( - OneMeasurementManyFieldsWithDelete {}, - predicate, - expected_fields, - ) - .await; -} - #[tokio::test] async fn list_field_columns_all_time() { let predicate = Predicate::default().with_range(MIN_NANO_TIME, MAX_NANO_TIME); diff --git a/query_tests/src/influxrpc/read_filter.rs b/query_tests/src/influxrpc/read_filter.rs index c0485f42aa..7f32084f46 100644 --- a/query_tests/src/influxrpc/read_filter.rs +++ b/query_tests/src/influxrpc/read_filter.rs @@ -4,15 +4,13 @@ use std::sync::Arc; #[cfg(test)] use crate::scenarios::{ DbScenario, DbSetup, EndToEndTest, TwoMeasurements, TwoMeasurementsManyFields, - TwoMeasurementsWithDelete, TwoMeasurementsWithDeleteAll, }; use crate::{ db::AbstractDb, influxrpc::util::run_series_set_plan_maybe_error, scenarios::{ MeasurementStatusCode, MeasurementsForDefect2845, MeasurementsSortableTags, - MeasurementsSortableTagsWithDelete, 
TwoMeasurementsMultiSeries, - TwoMeasurementsMultiSeriesWithDelete, TwoMeasurementsMultiSeriesWithDeleteAll, + TwoMeasurementsMultiSeries, }, }; use datafusion::{ @@ -205,12 +203,12 @@ async fn test_read_filter_invalid_predicate_case() { #[tokio::test] async fn test_read_filter_unknown_column_in_predicate() { let predicate = Predicate::new() - // mystery_region is not a real column, so this predicate is + // mystery_region and bar are not real columns, so this predicate is // invalid but IOx should be able to handle it (and produce no results) .with_expr( - col("baz") - .eq(lit(4i32)) - .or(col("bar").and(col("mystery_region").gt(lit(5i32)))), + col("baz").eq(lit(4i32)).or(col("bar") + .eq(lit("baz")) + .and(col("mystery_region").gt(lit(5i32)))), ); let predicate = InfluxRpcPredicate::new(None, predicate); @@ -220,39 +218,6 @@ async fn test_read_filter_unknown_column_in_predicate() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete() { - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_filter_data_no_pred_with_delete_all() { - // nothing from h2o table because all rows were deleted - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: 
[100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter() { // filter out one row in h20 @@ -281,58 +246,6 @@ async fn test_read_filter_data_filter() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_filter_with_delete() { - // filter out one row in h20 but the leftover row was deleted to nothing will be returned - let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").eq(lit("CA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results.clone(), - ) - .await; - - // Same results via a != predicate. 
- let predicate = Predicate::default() - .with_range(200, 300) - .with_expr(col("state").not_eq(lit("MA"))); // state=CA - - let predicate = InfluxRpcPredicate::new(None, predicate); - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Use different predicate to have data returned - let predicate = Predicate::default() - .with_range(100, 300) - .with_expr(col("state").eq(lit("MA"))) // state=MA - .with_expr(col("_measurement").eq(lit("h2o"))); - - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [100], values: [70.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_filter_fields() { // filter out one row in h20 @@ -350,8 +263,6 @@ async fn test_read_filter_data_filter_fields() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -// NGA todo: add delete tests here after we have delete scenarios for 2 chunks for 1 table - #[tokio::test] async fn test_read_filter_data_filter_measurement_pred() { // use an expr on table name to pick just the last row from o2 @@ -378,16 +289,6 @@ async fn test_read_filter_data_pred_refers_to_non_existent_column() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_non_existent_column_with_delete() { - let predicate = Predicate::default().with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - #[tokio::test] async fn test_read_filter_data_pred_no_columns() { // 
predicate with no columns, @@ -402,59 +303,6 @@ async fn test_read_filter_data_pred_no_columns() { run_read_filter_test_case(TwoMeasurements {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=user, _measurement=cpu, region=west}\n FloatPoints timestamps: [100], values: [23.2]", - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDelete {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_no_columns_with_delete_all() { - // predicate with no columns, - let predicate = Predicate::default().with_expr(lit("foo").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Only table disk has no deleted data - let expected_results = vec![ - "Series tags={_field=bytes, _measurement=disk, region=east}\n IntegerPoints timestamps: [200], values: [99]", - ]; - - run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - -#[tokio::test] -async fn test_read_filter_data_pred_refers_to_good_and_non_existent_columns() { - // predicate with both a column that does and does not appear - let predicate = Predicate::default() - .with_expr(col("state").eq(lit("MA"))) - .with_expr(col("tag_not_in_h20").eq(lit("foo"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![] as Vec<&str>; - - run_read_filter_test_case( - TwoMeasurements {}, - predicate.clone(), - expected_results.clone(), - ) - .await; - run_read_filter_test_case( - TwoMeasurementsWithDelete {}, - predicate.clone(), - expected_results.clone(), - ) - .await; - 
run_read_filter_test_case(TwoMeasurementsWithDeleteAll {}, predicate, expected_results).await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_match() { let predicate = Predicate::default() @@ -487,50 +335,6 @@ async fn test_read_filter_data_pred_using_regex_match_on_field() { run_read_filter_test_case(TwoMeasurementsManyFields {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_using_regex_match_with_delete() { - let predicate = Predicate::default() - .with_range(200, 300) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // the selected row was soft deleted - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate, - expected_results, - ) - .await; - - // Different predicate to have data returned - let predicate = Predicate::default() - .with_range(200, 400) - // will match CA state - .with_regex_match_expr("state", "C.*"); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // Try same predicate but on delete_all data - let expected_results = vec![]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_pred_using_regex_not_match() { let predicate = Predicate::default() @@ -600,45 +404,6 @@ async fn test_read_filter_data_pred_unsupported_in_scan() { run_read_filter_test_case(TwoMeasurementsMultiSeries {}, predicate, expected_results).await; } -#[tokio::test] -async fn test_read_filter_data_pred_unsupported_in_scan_with_delete() { - 
test_helpers::maybe_start_logging(); - - // These predicates can't be pushed down into chunks, but they can - // be evaluated by the general purpose DataFusion plan - - // (STATE = 'CA') OR (READING > 0) - let predicate = - Predicate::default().with_expr(col("state").eq(lit("CA")).or(col("reading").gt(lit(0)))); - let predicate = InfluxRpcPredicate::new(None, predicate); - - // Note these results include data from both o2 and h2o - let expected_results = vec![ - "Series tags={_field=temp, _measurement=h2o, city=LA, state=CA}\n FloatPoints timestamps: [350], values: [90.0]", - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDelete {}, - predicate.clone(), - expected_results, - ) - .await; - - // With delete all from h2o, no rows from h2p should be returned - let expected_results = vec![ - "Series tags={_field=reading, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.0, 51.0]", - "Series tags={_field=temp, _measurement=o2, city=Boston, state=MA}\n FloatPoints timestamps: [100, 250], values: [50.4, 53.4]", - ]; - run_read_filter_test_case( - TwoMeasurementsMultiSeriesWithDeleteAll {}, - predicate, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_data_plan_order() { test_helpers::maybe_start_logging(); @@ -659,25 +424,6 @@ async fn test_read_filter_data_plan_order() { .await; } -#[tokio::test] -async fn test_read_filter_data_plan_order_with_delete() { - test_helpers::maybe_start_logging(); - let expected_results = vec![ - "Series tags={_field=other, _measurement=h2o, city=Boston, state=MA}\n FloatPoints timestamps: [250], values: [5.0]", - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA}\n FloatPoints 
timestamps: [250], values: [70.5]", - "Series tags={_field=temp, _measurement=h2o, city=Boston, state=MA, zz_tag=A}\n FloatPoints timestamps: [1000], values: [70.4]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=A}\n FloatPoints timestamps: [800], values: [70.1]", - "Series tags={_field=temp, _measurement=h2o, city=Kingston, state=MA, zz_tag=B}\n FloatPoints timestamps: [100], values: [70.2]", - ]; - - run_read_filter_test_case( - MeasurementsSortableTagsWithDelete {}, - InfluxRpcPredicate::default(), - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_filter_filter_on_value() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/read_group.rs b/query_tests/src/influxrpc/read_group.rs index 25a0be0732..8867710b65 100644 --- a/query_tests/src/influxrpc/read_group.rs +++ b/query_tests/src/influxrpc/read_group.rs @@ -5,7 +5,6 @@ use crate::{ AnotherMeasurementForAggs, DbScenario, DbSetup, MeasurementForDefect2691, MeasurementForGroupByField, MeasurementForGroupKeys, MeasurementForMax, MeasurementForMin, MeasurementForSelectors, OneMeasurementForAggs, OneMeasurementNoTags2, - OneMeasurementNoTagsWithDelete, OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk, TwoMeasurementForAggs, TwoMeasurementsManyFields, TwoMeasurementsManyFieldsOneChunk, }, }; @@ -93,75 +92,6 @@ async fn test_read_group_data_no_tag_columns() { .await; } -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n IntegerPoints timestamps: [2], values: [1]", - ]; - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn 
test_read_group_data_no_tag_columns_min_with_delete() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![ - "Group tag_keys: _field, _measurement partition_key_vals: ", - "Series tags={_field=foo, _measurement=m0}\n FloatPoints timestamps: [2], values: [2.0]", - ]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDelete {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_count_with_delete_all() { - let agg = Aggregate::Count; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns.clone(), - expected_results, - ) - .await; -} - -#[tokio::test] -async fn test_read_group_data_no_tag_columns_min_with_delete_all() { - let agg = Aggregate::Min; - let group_columns = vec![]; - let expected_results = vec![]; - - run_read_group_test_case( - OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {}, - InfluxRpcPredicate::default(), - agg, - group_columns, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_group_data_pred() { let predicate = Predicate::default() diff --git a/query_tests/src/influxrpc/read_window_aggregate.rs b/query_tests/src/influxrpc/read_window_aggregate.rs index d1547dd6d2..1c3d1b44a0 100644 --- a/query_tests/src/influxrpc/read_window_aggregate.rs +++ b/query_tests/src/influxrpc/read_window_aggregate.rs @@ -170,47 +170,6 @@ async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_min_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let predicate = 
InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Min; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000011], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000001, 1609459201000000024], values: [1.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000009, 1609459201000000015], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000002], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - // See https://github.com/influxdata/influxdb_iox/issues/2697 #[tokio::test] async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697() { @@ -276,50 +235,6 @@ async fn test_grouped_series_set_plan_group_aggregate_filter_on_field() { .await; } -#[tokio::test] -async fn test_grouped_series_set_plan_group_aggregate_sum_defect_2697_with_delete() { - let predicate = Predicate::default() - // time >= '2021-01-01T00:00:01.000000001Z' AND time <= '2021-01-01T00:00:01.000000031Z' - .with_range(1609459201000000001, 1609459201000000031); - let predicate = InfluxRpcPredicate::new(None, predicate); - - let agg = Aggregate::Sum; - let every = WindowDuration::from_nanoseconds(10); - let offset = WindowDuration::from_nanoseconds(0); - - // one row deleted - - // The windowed aggregate is using a non-selector aggregate (SUM, COUNT, 
MEAD). - // For each distinct series the window defines the `time` column - let expected_results = vec![ - "Series tags={_field=bar, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000020], values: [5.0]", - "Series tags={_field=foo, _measurement=mm, section=1a}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000030], values: [4.0, 11.24]", - "Series tags={_field=bar, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010, 1609459201000000020], values: [4.0, 6.0]", - "Series tags={_field=foo, _measurement=mm, section=2b}\n FloatPoints timestamps: [1609459201000000010], values: [2.0]", - ]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDelete {}, - predicate.clone(), - agg, - every, - offset, - expected_results, - ) - .await; - - // all rows deleted - let expected_results = vec![]; - run_read_window_aggregate_test_case( - MeasurementForDefect2697WithDeleteAll {}, - predicate, - agg, - every, - offset, - expected_results, - ) - .await; -} - #[tokio::test] async fn test_read_window_aggregate_overflow() { let predicate = Predicate::default().with_range(1609459201000000001, 1609459201000000024); diff --git a/query_tests/src/influxrpc/table_names.rs b/query_tests/src/influxrpc/table_names.rs index e18710d099..c7f23c3cd1 100644 --- a/query_tests/src/influxrpc/table_names.rs +++ b/query_tests/src/influxrpc/table_names.rs @@ -100,106 +100,31 @@ async fn list_table_names_no_non_null_general_data_passes() { run_table_names_test_case(TwoMeasurementsManyFields {}, predicate, vec![]).await; } -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - InfluxRpcPredicate::default(), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_no_data_pred_with_delete_all() { - run_table_names_test_case( - TwoMeasurementsWithDeleteAll {}, - InfluxRpcPredicate::default(), - vec!["disk"], - ) - .await; -} - 
#[tokio::test] async fn list_table_names_data_pred_0_201() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 201), vec!["cpu", "disk"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete() { - run_table_names_test_case( - TwoMeasurementsWithDelete {}, - tsp(0, 201), - vec!["cpu", "disk"], - ) - .await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_201_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 201), vec!["disk"]).await; -} - #[tokio::test] async fn list_table_names_data_pred_0_200() { run_table_names_test_case(TwoMeasurements {}, tsp(0, 200), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(0, 200), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_0_200_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(0, 200), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_50_101() { run_table_names_test_case(TwoMeasurements {}, tsp(50, 101), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(50, 101), vec!["cpu"]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_50_101_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(50, 101), vec![]).await; -} - #[tokio::test] async fn list_table_names_data_pred_101_160() { run_table_names_test_case(TwoMeasurements {}, tsp(101, 160), vec!["cpu"]).await; } -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(101, 160), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_101_160_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(101, 160), vec![]).await; -} - 
#[tokio::test] async fn list_table_names_data_pred_250_300() { run_table_names_test_case(TwoMeasurements {}, tsp(250, 300), vec![]).await; } -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete() { - run_table_names_test_case(TwoMeasurementsWithDelete {}, tsp(250, 300), vec![]).await; -} - -#[tokio::test] -async fn list_table_names_data_pred_250_300_with_delete_all() { - run_table_names_test_case(TwoMeasurementsWithDeleteAll {}, tsp(250, 300), vec![]).await; -} - #[tokio::test] async fn list_table_names_max_time_included() { run_table_names_test_case( diff --git a/query_tests/src/influxrpc/tag_keys.rs b/query_tests/src/influxrpc/tag_keys.rs index da21ca52d7..a15672fde0 100644 --- a/query_tests/src/influxrpc/tag_keys.rs +++ b/query_tests/src/influxrpc/tag_keys.rs @@ -169,24 +169,6 @@ async fn list_tag_name_end_to_end() { run_tag_keys_test_case(EndToEndTest {}, predicate, expected_tag_keys).await; } -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete_and_pred() { - let predicate = Predicate::default() - .with_range(0, 10000) - .with_expr(col("host").eq(lit("server01"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "region"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - -#[tokio::test] -async fn list_tag_name_end_to_end_with_delete() { - let predicate = Predicate::default().with_expr(col("_measurement").eq(lit("swap"))); - let predicate = InfluxRpcPredicate::new(None, predicate); - let expected_tag_keys = vec!["host", "name"]; - run_tag_keys_test_case(EndToEndTestWithDelete {}, predicate, expected_tag_keys).await; -} - #[tokio::test] async fn list_tag_name_max_time() { test_helpers::maybe_start_logging(); diff --git a/query_tests/src/influxrpc/tag_values.rs b/query_tests/src/influxrpc/tag_values.rs index 7a99ab59e7..0e9e2c532a 100644 --- a/query_tests/src/influxrpc/tag_values.rs +++ b/query_tests/src/influxrpc/tag_values.rs @@ -80,32 
+80,6 @@ async fn list_tag_values_no_predicate_state_col() { .await; } -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete() { - let tag_name = "state"; - let expected_tag_keys = vec!["CA", "MA"]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDelete {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - -#[tokio::test] -async fn list_tag_values_no_predicate_state_col_with_delete_all() { - let tag_name = "state"; - let expected_tag_keys = vec![]; - run_tag_values_test_case( - OneMeasurementManyNullTagsWithDeleteAll {}, - tag_name, - InfluxRpcPredicate::default(), - expected_tag_keys, - ) - .await; -} - #[tokio::test] async fn list_tag_values_no_predicate_city_col() { let tag_name = "city"; diff --git a/query_tests/src/scenarios/library.rs b/query_tests/src/scenarios/library.rs index 5e52a1ba9c..f3fb22c756 100644 --- a/query_tests/src/scenarios/library.rs +++ b/query_tests/src/scenarios/library.rs @@ -6,7 +6,6 @@ use super::{ }; use crate::scenarios::util::{make_n_chunks_scenario, ChunkData}; use async_trait::async_trait; -use data_types::{DeleteExpr, DeletePredicate, Op, Scalar, TimestampRange}; use iox_query::frontend::sql::SqlQueryPlanner; #[derive(Debug)] @@ -83,82 +82,6 @@ impl DbSetup for OneMeasurementManyNullTags { } } -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 400 <= time <= 602 and state=NY - // 3 rows of h2o & NY state will be deleted - let 
delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(400, 602), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("NY").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct OneMeasurementManyNullTagsWithDeleteAll {} -#[async_trait] -impl DbSetup for OneMeasurementManyNullTagsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,state=CA,city=LA,county=LA temp=70.4 100", - "h2o,state=MA,city=Boston,county=Suffolk temp=72.4 250", - "h2o,state=MA,city=Boston temp=50.4 200", - "h2o,state=CA temp=79.0 300", - "h2o,state=NY temp=60.8 400", - "h2o,state=NY,city=NYC temp=61.0 500", - "h2o,state=NY,city=NYC,borough=Brooklyn temp=61.0 600", - ]; - - // pred: delete from h2o where 100 <= time <= 602 - // all rows of h2o will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 602), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// Two measurements data in different chunk scenarios #[derive(Debug)] pub struct TwoMeasurements {} @@ -177,85 +100,6 @@ impl DbSetup for TwoMeasurements { } } -/// Two measurements data in different chunk scenarios -/// with one delete applied at different stages of the chunk -#[derive(Debug)] -pub struct TwoMeasurementsWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // delete 1 row from cpu with timestamp 150 - let table_name = 
"cpu"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk(vec![&pred], vec![], lp_lines, table_name, partition_key).await - } -} - -/// Two measurements data in different chunk scenarios -/// with 2 deletes that remove all data from one table -#[derive(Debug)] -pub struct TwoMeasurementsWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "cpu,region=west user=23.2 100", - "cpu,region=west user=21.0 150", - "disk,region=east bytes=99i 200", - ]; - - // pred: delete from cpu where 120 <= time <= 160 and region="west" - // which will delete second row of the cpu - let table_name = "cpu"; - let pred1 = DeletePredicate { - range: TimestampRange::new(120, 160), - exprs: vec![DeleteExpr::new( - "region".to_string(), - Op::Eq, - Scalar::String("west".to_string()), - )], - }; - - // delete the first row of the cpu - let pred2 = DeletePredicate { - range: TimestampRange::new(0, 110), - exprs: vec![], - }; - - // return all possible combination scenarios of a chunk stage and when the delete - // predicates are applied - all_scenarios_for_one_chunk( - vec![&pred1], - vec![&pred2], - lp_lines, - table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsUnsignedType {} #[async_trait] @@ -710,44 +554,6 @@ impl DbSetup for OneMeasurementManyFields { all_scenarios_for_one_chunk(vec![], vec![], lp_lines, "h2o", partition_key).await } } - -#[derive(Debug)] -pub struct OneMeasurementManyFieldsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementManyFieldsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - 
// Order this so field3 comes before field2 - // (and thus the columns need to get reordered) - let lp_lines = vec![ - "h2o,tag1=foo,tag2=bar field1=70.6,field3=2 100", - "h2o,tag1=foo,tag2=bar field1=70.4,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.5,field2=\"ss\" 100", - "h2o,tag1=foo,tag2=bar field1=70.6,field4=true 1000", - "h2o,tag1=foo,tag2=bar field1=70.3,field5=false 3000", - ]; - - // pred: delete from h2o where 1000 <= time <= 1100 - // 1 rows of h2o with timestamp 1000 will be deleted which means - // field4 no longer available - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(1000, 1100), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - /// This data (from end to end test) #[derive(Debug)] pub struct EndToEndTest {} @@ -772,48 +578,6 @@ impl DbSetup for EndToEndTest { } } -#[derive(Debug)] -pub struct EndToEndTestWithDelete {} -#[async_trait] -impl DbSetup for EndToEndTestWithDelete { - async fn make(&self) -> Vec { - let lp_lines = vec![ - "cpu_load_short,host=server01,region=us-west value=0.64 0000", - "cpu_load_short,host=server01 value=27.99 1000", - "cpu_load_short,host=server02,region=us-west value=3.89 2000", - "cpu_load_short,host=server01,region=us-east value=1234567.891011 3000", - "cpu_load_short,host=server01,region=us-west value=0.000003 4000", - "system,host=server03 uptime=1303385 5000", - "swap,host=server01,name=disk0 in=3,out=4 6000", - "status active=t 7000", - "attributes color=\"blue\" 8000", - ]; - - let partition_key = "1970-01-01T00"; - - // pred: delete from swap where 6000 <= time <= 6000 and name=disk0 - // 1 rows of swap with name=disk0 will be deleted - let delete_table_name = "swap"; - let pred = DeletePredicate { - range: TimestampRange::new(6000, 6000), - exprs: vec![DeleteExpr::new( - "name".to_string(), - Op::Eq, - Scalar::String(("disk0").to_string()), - )], - }; - 
- all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - #[derive(Debug)] pub struct TwoMeasurementsMultiSeries {} #[async_trait] @@ -838,84 +602,6 @@ impl DbSetup for TwoMeasurementsMultiSeries { } } -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDelete {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - // Swap around data is not inserted in series order - lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // pred: delete from h2o where 120 <= time <= 250 - // 2 rows of h2o with timestamp 200 and 350 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 250), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -#[derive(Debug)] -pub struct TwoMeasurementsMultiSeriesWithDeleteAll {} -#[async_trait] -impl DbSetup for TwoMeasurementsMultiSeriesWithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let mut lp_lines = vec![ - "h2o,state=MA,city=Boston temp=70.4 100", // to row 2 - "h2o,state=MA,city=Boston temp=72.4 250", // to row 1 - "h2o,state=CA,city=LA temp=90.0 200", // to row 0 - "h2o,state=CA,city=LA temp=90.0 350", // to row 3 - "o2,state=MA,city=Boston temp=50.4,reading=50 100", // to row 5 - "o2,state=MA,city=Boston temp=53.4,reading=51 250", // to row 4 - ]; - - // Swap around data is not inserted in series order 
- lp_lines.swap(0, 2); - lp_lines.swap(4, 5); - - // Delete all data form h2o - // pred: delete from h20 where 100 <= time <= 360 - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(100, 360), - exprs: vec![], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct MeasurementStatusCode {} #[async_trait] impl DbSetup for MeasurementStatusCode { @@ -950,44 +636,6 @@ impl DbSetup for MeasurementsSortableTags { } } -#[derive(Debug)] -pub struct MeasurementsSortableTagsWithDelete {} -#[async_trait] -impl DbSetup for MeasurementsSortableTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - - let lp_lines = vec![ - "h2o,zz_tag=A,state=MA,city=Kingston temp=70.1 800", - "h2o,state=MA,city=Kingston,zz_tag=B temp=70.2 100", - "h2o,state=CA,city=Boston temp=70.3 250", // soft deleted - "h2o,state=MA,city=Boston,zz_tag=A temp=70.4 1000", - "h2o,state=MA,city=Boston temp=70.5,other=5.0 250", - ]; - - // pred: delete from h2o where 120 <= time <= 350 and state=CA - // 1 rows of h2o with timestamp 250 will be deleted - let delete_table_name = "h2o"; - let pred = DeletePredicate { - range: TimestampRange::new(120, 350), - exprs: vec![DeleteExpr::new( - "state".to_string(), - Op::Eq, - Scalar::String(("CA").to_string()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - // See issue: https://github.com/influxdata/influxdb_iox/issues/2845 #[derive(Debug)] pub struct MeasurementsForDefect2845 {} @@ -1019,65 +667,6 @@ impl DbSetup for OneMeasurementNoTags2 { } } -pub struct OneMeasurementNoTagsWithDelete {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDelete { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", "m0 foo=2.0 2"]; - - // pred: delete from m0 where 
1 <= time <= 1 and foo=1.0 - // 1 row of m0 with timestamp 1 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1), - exprs: vec![DeleteExpr::new( - "foo".to_string(), - Op::Eq, - Scalar::F64((1.0).into()), - )], - }; - - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - -/// This will create many scenarios: some have a chunk with soft deleted data, some have no chunks -/// because there is no point to create compacted chunks with all deleted data. -pub struct OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk {} -#[async_trait] -impl DbSetup for OneMeasurementNoTagsWithDeleteAllWithAndWithoutChunk { - async fn make(&self) -> Vec { - let partition_key = "1970-01-01T00"; - let lp_lines = vec!["m0 foo=1.0 1", "m0 foo=2.0 2"]; - - // pred: delete from m0 where 1 <= time <= 2 - let delete_table_name = "m0"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 2), - exprs: vec![], - }; - - // Apply predicate before the chunk is moved if any. 
There will be scenarios without chunks - // as a consequence of not-compacting-deleted-data - all_scenarios_for_one_chunk( - vec![&pred], - vec![], - lp_lines, - delete_table_name, - partition_key, - ) - .await - } -} - pub struct OneMeasurementForAggs {} #[async_trait] impl DbSetup for OneMeasurementForAggs { @@ -1310,65 +899,6 @@ impl DbSetup for MeasurementForDefect2697 { } } -pub struct MeasurementForDefect2697WithDelete {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDelete { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - "mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1609459201000000022 <= time <= 1609459201000000022 - // 1 row of m0 with timestamp 1609459201000000022 (section=2b bar=1.2) - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1609459201000000022, 1609459201000000022), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - -pub struct MeasurementForDefect2697WithDeleteAll {} -#[async_trait] -impl DbSetup for MeasurementForDefect2697WithDeleteAll { - async fn make(&self) -> Vec { - let partition_key = "2021-01-01T00"; - - let lp = vec![ - "mm,section=1a bar=5.0 1609459201000000011", - "mm,section=1a bar=0.28 1609459201000000031", - "mm,section=2b bar=4.0 1609459201000000009", - "mm,section=2b bar=6.0 1609459201000000015", - "mm,section=2b bar=1.2 1609459201000000022", - "mm,section=1a foo=1.0 1609459201000000001", - "mm,section=1a foo=3.0 1609459201000000005", - 
"mm,section=1a foo=11.24 1609459201000000024", - "mm,section=2b foo=2.0 1609459201000000002", - ]; - - // pred: delete from mm where 1 <= time <= 1609459201000000031 - let delete_table_name = "mm"; - let pred = DeletePredicate { - range: TimestampRange::new(1, 1609459201000000031), - exprs: vec![], - }; - - all_scenarios_for_one_chunk(vec![&pred], vec![], lp, delete_table_name, partition_key).await - } -} - // Test data to validate fix for: // https://github.com/influxdata/influxdb_iox/issues/2890 pub struct MeasurementForDefect2890 {} diff --git a/query_tests/src/scenarios/util.rs b/query_tests/src/scenarios/util.rs index f9a687f03a..477503504b 100644 --- a/query_tests/src/scenarios/util.rs +++ b/query_tests/src/scenarios/util.rs @@ -14,12 +14,9 @@ use generated_types::{ }; use influxdb_iox_client::flight::{low_level::LowLevelMessage, Error as FlightError}; use ingester::{ - data::{ - partition::resolver::CatalogPartitionResolver, FlatIngesterQueryResponse, IngesterData, - IngesterQueryResponse, Persister, - }, - lifecycle::mock_handle::NoopLifecycleHandle, - querier_handler::prepare_data_to_querier, + data::{partition::resolver::CatalogPartitionResolver, IngesterData, Persister}, + lifecycle::mock_handle::MockLifecycleHandle, + querier_handler::{prepare_data_to_querier, FlatIngesterQueryResponse, IngesterQueryResponse}, }; use iox_catalog::interface::get_schema_by_name; use iox_query::exec::{Executor, ExecutorConfig}; @@ -722,7 +719,7 @@ impl MockIngester { /// Takes `&self mut` because our partioning implementation does not work with concurrent /// access. 
async fn buffer_operation(&mut self, dml_operation: DmlOperation) { - let lifecycle_handle = NoopLifecycleHandle {}; + let lifecycle_handle = MockLifecycleHandle::default(); let should_pause = self .ingester_data @@ -752,7 +749,32 @@ impl MockIngester { .map(|f| f.id) .collect(); - self.ingester_data.persist(*partition_id).await; + let p = self + .catalog + .catalog + .repositories() + .await + .partitions() + .get_by_id(*partition_id) + .await + .unwrap() + .expect("partition not found"); + + let namespace_id = self + .catalog + .catalog + .repositories() + .await + .tables() + .get_by_id(p.table_id) + .await + .unwrap() + .expect("table does not exist") + .namespace_id; + + self.ingester_data + .persist(p.shard_id, namespace_id, p.table_id, *partition_id) + .await; result.extend( self.catalog @@ -1023,9 +1045,6 @@ impl QueryDataAdapter { parquet_max_sequence_number: status .parquet_max_sequence_number .map(|x| x.get()), - tombstone_max_sequence_number: status - .tombstone_max_sequence_number - .map(|x| x.get()), }), }, ), diff --git a/query_tests/src/table_schema.rs b/query_tests/src/table_schema.rs index f01a1b8b7d..359ba1ce49 100644 --- a/query_tests/src/table_schema.rs +++ b/query_tests/src/table_schema.rs @@ -38,7 +38,7 @@ async fn run_table_schema_test_case( let ctx = db.new_query_context(None); let chunks = db - .chunks(table_name, &Default::default(), ctx) + .chunks(table_name, &Default::default(), &None, ctx) .await .expect("error getting chunks"); for chunk in chunks { diff --git a/router/Cargo.toml b/router/Cargo.toml index 7b655f9f91..d19ecf8b4d 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -20,7 +20,7 @@ metric = { path = "../metric" } mutable_batch = { path = "../mutable_batch" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parking_lot = "0.12" predicate = { 
path = "../predicate" } @@ -47,7 +47,7 @@ pretty_assertions = "1.3.0" rand = "0.8.3" schema = { path = "../schema" } test_helpers = { version = "0.1.0", path = "../test_helpers", features = ["future_timeout"] } -tokio-stream = { version = "0.1.10", default_features = false, features = [] } +tokio-stream = { version = "0.1.11", default_features = false, features = [] } [lib] # Allow --save-baseline to work diff --git a/service_common/src/planner.rs b/service_common/src/planner.rs index 6431963aad..e1bc5adf71 100644 --- a/service_common/src/planner.rs +++ b/service_common/src/planner.rs @@ -60,7 +60,7 @@ impl Planner { planner .table_names(database, predicate) .await - .map_err(|e| Error::Plan(format!("table_names error: {}", e))) + .map_err(|e| e.to_df_error("table_names")) }) .await } @@ -82,7 +82,7 @@ impl Planner { planner .tag_keys(database, predicate) .await - .map_err(|e| Error::Plan(format!("tag_keys error: {}", e))) + .map_err(|e| e.to_df_error("tag_keys")) }) .await } @@ -106,7 +106,7 @@ impl Planner { planner .tag_values(database, &tag_name, predicate) .await - .map_err(|e| Error::Plan(format!("tag_values error: {}", e))) + .map_err(|e| e.to_df_error("tag_values")) }) .await } @@ -128,7 +128,7 @@ impl Planner { planner .field_columns(database, predicate) .await - .map_err(|e| Error::Plan(format!("field_columns error: {}", e))) + .map_err(|e| e.to_df_error("field_columns")) }) .await } @@ -150,7 +150,7 @@ impl Planner { planner .read_filter(database, predicate) .await - .map_err(|e| Error::Plan(format!("read_filter error: {}", e))) + .map_err(|e| e.to_df_error("read_filter")) }) .await } @@ -174,7 +174,7 @@ impl Planner { planner .read_group(database, predicate, agg, &group_columns) .await - .map_err(|e| Error::Plan(format!("read_group error: {}", e))) + .map_err(|e| e.to_df_error("read_group")) }) .await } @@ -199,7 +199,7 @@ impl Planner { planner .read_window_aggregate(database, predicate, agg, every, offset) .await - .map_err(|e| 
Error::Plan(format!("read_window_aggregate error: {}", e))) + .map_err(|e| e.to_df_error("read_window_aggregate")) }) .await } diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index 172e89b560..b9999514e1 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -26,7 +26,7 @@ futures = "0.3" pin-project = "1.0" prost = "0.11" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tonic = "0.8" diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index f88ce0d184..f4d84266e6 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -9,7 +9,7 @@ use arrow_flight::{ use arrow_util::optimize::{optimize_record_batch, optimize_schema}; use bytes::{Bytes, BytesMut}; use data_types::{DatabaseName, DatabaseNameError}; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan}; use futures::{SinkExt, Stream, StreamExt}; use generated_types::influxdata::iox::querier::v1 as proto; use iox_query::{ @@ -54,7 +54,7 @@ pub enum Error { ))] Query { database_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Invalid database name: {}", source))] @@ -91,29 +91,40 @@ impl From for tonic::Status { Error::Optimize { .. } | Error::Planning { .. } | Error::Serialization { .. } => warn!(?err, msg), } - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - use tonic::Status; - match &self { - Self::InvalidTicket { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidTicketLegacy { .. } => Status::invalid_argument(self.to_string()), - Self::InvalidQuery { .. 
} => Status::invalid_argument(self.to_string()), - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::Query { .. } => Status::internal(self.to_string()), - Self::InvalidDatabaseName { .. } => Status::invalid_argument(self.to_string()), - Self::Planning { - source: service_common::planner::Error::External(_), - } => Status::internal(self.to_string()), - Self::Planning { .. } => Status::invalid_argument(self.to_string()), - Self::Optimize { .. } => Status::internal(self.to_string()), - Self::Serialization { .. } => Status::internal(self.to_string()), - } + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::InvalidTicket { .. } + | Self::InvalidTicketLegacy { .. } + | Self::InvalidQuery { .. } + | Self::InvalidDatabaseName { .. } => tonic::Code::InvalidArgument, + Self::Planning { source, .. } | Self::Query { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } + } + Self::Optimize { .. } | Self::Serialization { .. 
} => tonic::Code::Internal, + }; + + tonic::Status::new(code, msg) } } @@ -334,7 +345,6 @@ impl GetStream { let mut stream_record_batches = ctx .execute_stream(Arc::clone(&physical_plan)) .await - .map_err(|e| Box::new(e) as _) .context(QuerySnafu { database_name: &database_name, })?; @@ -382,7 +392,7 @@ impl GetStream { // failure sending here is OK because we're cutting the stream anyways tx.send(Err(Error::Query { database_name: database_name.clone(), - source: Box::new(e), + source: DataFusionError::ArrowError(e), } .into())) .await diff --git a/service_grpc_influxrpc/Cargo.toml b/service_grpc_influxrpc/Cargo.toml index ea4169e05c..00c5df645f 100644 --- a/service_grpc_influxrpc/Cargo.toml +++ b/service_grpc_influxrpc/Cargo.toml @@ -26,7 +26,7 @@ pin-project = "1.0" prost = "0.11" regex = "1.6.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tokio-stream = { version = "0.1", features = ["net"] } diff --git a/service_grpc_influxrpc/src/expr.rs b/service_grpc_influxrpc/src/expr.rs index 58a5806b4e..8da9cebc67 100644 --- a/service_grpc_influxrpc/src/expr.rs +++ b/service_grpc_influxrpc/src/expr.rs @@ -906,6 +906,7 @@ mod tests { let schema = SchemaBuilder::new() .tag("t1") .tag("t2") + .tag("host") .field("foo", DataType::Int64) .field("bar", DataType::Int64) .build() diff --git a/service_grpc_influxrpc/src/service.rs b/service_grpc_influxrpc/src/service.rs index f8d54e2d05..734f856b88 100644 --- a/service_grpc_influxrpc/src/service.rs +++ b/service_grpc_influxrpc/src/service.rs @@ -12,6 +12,7 @@ use crate::{ StorageService, }; use data_types::{org_and_bucket_to_database, DatabaseName}; +use datafusion::error::DataFusionError; use futures::Stream; use generated_types::{ google::protobuf::Empty, literal_or_regex::Value as RegexOrLiteralValue, @@ -54,43 +55,43 @@ pub enum Error { 
#[snafu(display("Error listing tables in database '{}': {}", db_name, source))] ListingTables { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing columns in database '{}': {}", db_name, source))] ListingColumns { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error listing fields in database '{}': {}", db_name, source))] ListingFields { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating series plans for database '{}': {}", db_name, source))] PlanningFilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error creating group plans for database '{}': {}", db_name, source))] PlanningGroupSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running series plans for database '{}': {}", db_name, source))] FilteringSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error running grouping plans for database '{}': {}", db_name, source))] GroupingSeries { db_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display( @@ -102,7 +103,7 @@ pub enum Error { ListingTagValues { db_name: String, tag_name: String, - source: Box, + source: DataFusionError, }, #[snafu(display("Error converting Predicate '{}: {}", rpc_predicate_string, source))] @@ -177,44 +178,56 @@ impl From for tonic::Status { /// status fn from(err: Error) -> Self { error!("Error handling gRPC request: {}", err); - err.to_status() + err.into_status() } } impl Error { /// Converts a result from the business logic into the appropriate tonic /// status - fn to_status(&self) -> tonic::Status { - match &self { - Self::DatabaseNotFound { .. } => Status::not_found(self.to_string()), - Self::ListingTables { .. } => Status::internal(self.to_string()), - Self::ListingColumns { .. 
} => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + fn into_status(self) -> tonic::Status { + let msg = self.to_string(); + + let code = match self { + Self::DatabaseNotFound { .. } => tonic::Code::NotFound, + Self::ListingTables { source, .. } + | Self::ListingColumns { source, .. } + | Self::ListingFields { source, .. } + | Self::PlanningFilteringSeries { source, .. } + | Self::PlanningGroupSeries { source, .. } + | Self::FilteringSeries { source, .. } + | Self::GroupingSeries { source, .. } + | Self::ListingTagValues { source, .. } => { + // traverse context chain + let mut source = source; + while let DataFusionError::Context(_msg, inner) = source { + source = *inner; + } + + match source { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, + _ => tonic::Code::Internal, + } } - Self::ListingFields { .. } => { - // TODO: distinguish between input errors and internal errors - Status::invalid_argument(self.to_string()) + Self::ConvertingPredicate { .. } + | Self::ConvertingReadGroupAggregate { .. } + | Self::ConvertingReadGroupType { .. } + | Self::ConvertingWindowAggregate { .. } + | Self::ConvertingTagKeyInTagValues { .. } + | Self::ComputingGroupedSeriesSet { .. } + | Self::ConvertingFieldList { .. } + | Self::MeasurementLiteralOrRegex { .. } + | Self::MissingTagKeyPredicate {} + | Self::InvalidTagKeyRegex { .. } => tonic::Code::InvalidArgument, + Self::SendingResults { .. } | Self::InternalHintsFieldNotSupported { .. } => { + tonic::Code::Internal } - Self::PlanningFilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::PlanningGroupSeries { .. } => Status::invalid_argument(self.to_string()), - Self::FilteringSeries { .. } => Status::invalid_argument(self.to_string()), - Self::GroupingSeries { .. 
} => Status::invalid_argument(self.to_string()), - Self::ListingTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingPredicate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupAggregate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingReadGroupType { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingWindowAggregate { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingTagKeyInTagValues { .. } => Status::invalid_argument(self.to_string()), - Self::ComputingGroupedSeriesSet { .. } => Status::invalid_argument(self.to_string()), - Self::ConvertingFieldList { .. } => Status::invalid_argument(self.to_string()), - Self::SendingResults { .. } => Status::internal(self.to_string()), - Self::InternalHintsFieldNotSupported { .. } => Status::internal(self.to_string()), - Self::NotYetImplemented { .. } => Status::internal(self.to_string()), - Self::MeasurementLiteralOrRegex { .. } => Status::invalid_argument(self.to_string()), - Self::MissingTagKeyPredicate {} => Status::invalid_argument(self.to_string()), - Self::InvalidTagKeyRegex { .. } => Status::invalid_argument(self.to_string()), - } + Self::NotYetImplemented { .. } => tonic::Code::Unimplemented, + }; + + tonic::Status::new(code, msg) } } @@ -341,7 +354,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -423,7 +436,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? 
.into_iter() .map(Ok) .collect::>(); @@ -489,7 +502,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -560,7 +573,7 @@ where operation: "tag_value for a measurement, with general predicate" .to_string(), } - .to_status()); + .into_status()); } measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx).await @@ -593,7 +606,7 @@ where } }; - let response = response.map_err(|e| e.to_status()); + let response = response.map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -652,7 +665,7 @@ where let results = tag_values_grouped_by_measurement_and_tag_key_impl(Arc::clone(&db), db_name, req, &ctx) .await - .map_err(|e| e.to_status())? + .map_err(|e| e.into_status())? .into_iter() .map(Ok) .collect::>(); @@ -762,7 +775,7 @@ where let response = measurement_name_impl(Arc::clone(&db), db_name, range, predicate, &ctx) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -833,7 +846,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -907,7 +920,7 @@ where &ctx, ) .await - .map_err(|e| e.to_status()); + .map_err(|e| e.into_status()); if response.is_ok() { query_completed_token.set_success(); @@ -981,9 +994,9 @@ where .map(|fieldlist| { fieldlist_to_measurement_fields_response(fieldlist) .context(ConvertingFieldListSnafu) - .map_err(|e| e.to_status()) + .map_err(|e| e.into_status()) }) - .map_err(|e| e.to_status())?; + .map_err(|e| e.into_status())?; if response.is_ok() { query_completed_token.set_success(); @@ -1048,13 +1061,11 @@ where let plan = Planner::new(ctx) .table_names(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTablesSnafu { db_name })?; let table_names = ctx .to_string_set(plan) .await - .map_err(|e| Box::new(e) 
as _) .context(ListingTablesSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1095,13 +1106,11 @@ where let tag_key_plan = Planner::new(ctx) .tag_keys(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; let tag_keys = ctx .to_string_set(tag_key_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingColumnsSnafu { db_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1142,13 +1151,11 @@ where let tag_value_plan = Planner::new(ctx) .tag_values(db, tag_name, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, tag_name })?; let tag_values = ctx .to_string_set(tag_value_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingTagValuesSnafu { db_name, tag_name })?; // Map the resulting collection of Strings into a Vec>for return @@ -1266,14 +1273,12 @@ where let series_plan = Planner::new(ctx) .read_filter(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(PlanningFilteringSeriesSnafu { db_name })?; // Execute the plans. let series_or_groups = ctx .to_series_and_groups(series_plan) .await - .map_err(|e| Box::new(e) as _) .context(FilteringSeriesSnafu { db_name }) .log_if_error("Running series set plan")?; @@ -1319,9 +1324,8 @@ where .await } }; - let grouped_series_set_plan = grouped_series_set_plan - .map_err(|e| Box::new(e) as _) - .context(PlanningGroupSeriesSnafu { db_name })?; + let grouped_series_set_plan = + grouped_series_set_plan.context(PlanningGroupSeriesSnafu { db_name })?; // PERF - This used to send responses to the client before execution had // completed, but now it doesn't. 
We may need to revisit this in the future @@ -1331,7 +1335,6 @@ where let series_or_groups = ctx .to_series_and_groups(grouped_series_set_plan) .await - .map_err(|e| Box::new(e) as _) .context(GroupingSeriesSnafu { db_name }) .log_if_error("Running Grouped SeriesSet Plan")?; @@ -1370,13 +1373,11 @@ where let field_list_plan = Planner::new(ctx) .field_columns(db, predicate) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; let field_list = ctx .to_field_list(field_list_plan) .await - .map_err(|e| Box::new(e) as _) .context(ListingFieldsSnafu { db_name })?; trace!(field_names=?field_list, "Field names response"); @@ -1801,11 +1802,13 @@ mod tests { // Note multiple tables / measureemnts: let chunk0 = TestChunk::new("m1") .with_id(0) + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2"); let chunk1 = TestChunk::new("m2") .with_id(1) + .with_tag_column("state") .with_tag_column("k3") .with_tag_column("k4"); @@ -1825,7 +1828,7 @@ mod tests { }; let actual_tag_keys = fixture.storage_client.tag_keys(request).await.unwrap(); - let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!(actual_tag_keys, expected_tag_keys,); @@ -1878,7 +1881,7 @@ mod tests { let response = fixture.storage_client.tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "TagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for measurement_tag_keys-- @@ -1897,6 +1900,7 @@ mod tests { .with_tag_column("k0"); let chunk1 = TestChunk::new("m4") + .with_tag_column("state") .with_tag_column("k1") .with_tag_column("k2") .with_tag_column("k3") @@ -1926,7 +1930,7 @@ mod tests { .measurement_tag_keys(request) .await .unwrap(); - let expected_tag_keys = 
vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4"]; + let expected_tag_keys = vec!["_f(0xff)", "_m(0x00)", "k1", "k2", "k3", "k4", "state"]; assert_eq!( actual_tag_keys, expected_tag_keys, @@ -1984,7 +1988,7 @@ mod tests { let response = fixture.storage_client.measurement_tag_keys(request).await; assert_contains!(response.unwrap_err().to_string(), "This is an error"); - grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagKeys", "server_error", 1); } /// test the plumbing of the RPC layer for tag_values -- specifically that @@ -2173,7 +2177,8 @@ mod tests { "Error converting tag_key to UTF-8 in tag_values request" ); - grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 2); + grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1); } #[tokio::test] @@ -2524,7 +2529,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "client_error", 1); + grpc_request_metric_has_count(&fixture, "MeasurementTagValues", "server_error", 1); } #[tokio::test] @@ -2730,7 +2735,7 @@ mod tests { let response = fixture.storage_client.read_filter(request).await; assert_contains!(response.unwrap_err().to_string(), "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadFilter", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadFilter", "server_error", 1); } #[tokio::test] @@ -2822,7 +2827,7 @@ mod tests { .to_string(); assert_contains!(response_string, "Sugar we are going down"); - grpc_request_metric_has_count(&fixture, "ReadGroup", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadGroup", "server_error", 1); } #[tokio::test] @@ -2988,7 +2993,7 @@ mod tests { assert_contains!(response_string, "Sugar we are going down"); - 
grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "client_error", 1); + grpc_request_metric_has_count(&fixture, "ReadWindowAggregate", "server_error", 1); } #[tokio::test] diff --git a/service_grpc_object_store/Cargo.toml b/service_grpc_object_store/Cargo.toml index d25393c791..6a2bcac921 100644 --- a/service_grpc_object_store/Cargo.toml +++ b/service_grpc_object_store/Cargo.toml @@ -8,7 +8,7 @@ data_types = { path = "../data_types" } futures = "0.3" generated_types = { path = "../generated_types" } iox_catalog = { path = "../iox_catalog" } -object_store = "0.5.0" +object_store = "0.5.1" observability_deps = { path = "../observability_deps" } parquet_file = { path = "../parquet_file" } tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/test_fixtures/cpu.parquet b/test_fixtures/cpu.parquet new file mode 100644 index 0000000000..86cae861b6 Binary files /dev/null and b/test_fixtures/cpu.parquet differ diff --git a/test_helpers_end_to_end/src/client.rs b/test_helpers_end_to_end/src/client.rs index 0f4567a973..5017b0bbba 100644 --- a/test_helpers_end_to_end/src/client.rs +++ b/test_helpers_end_to_end/src/client.rs @@ -1,12 +1,12 @@ //! 
Client helpers for writing end to end ng tests use arrow::record_batch::RecordBatch; use futures::{stream::FuturesUnordered, StreamExt}; +use generated_types::influxdata::pbdata::v1::WriteResponse; use http::Response; use hyper::{Body, Client, Request}; use influxdb_iox_client::{ connection::Connection, flight::generated_types::ReadInfo, - write::generated_types::WriteResponse, write_info::generated_types::{merge_responses, GetWriteInfoResponse, ShardStatus}, }; use observability_deps::tracing::info; diff --git a/test_helpers_end_to_end/src/config.rs b/test_helpers_end_to_end/src/config.rs index b3dc091a93..d4597e8584 100644 --- a/test_helpers_end_to_end/src/config.rs +++ b/test_helpers_end_to_end/src/config.rs @@ -290,6 +290,11 @@ impl TestConfig { self.with_env("INFLUXDB_IOX_FLIGHT_DO_GET_PANIC", times.to_string()) } + /// Configure maximum per-table query bytes for the querier. + pub fn with_querier_max_table_query_bytes(self, bytes: usize) -> Self { + self.with_env("INFLUXDB_IOX_MAX_TABLE_QUERY_BYTES", bytes.to_string()) + } + /// Changes the log to JSON for easier parsing. pub fn with_json_logs(self) -> Self { self.with_env("LOG_FORMAT", "json") diff --git a/write_summary/Cargo.toml b/write_summary/Cargo.toml index d3313a19ee..d303ad5b50 100644 --- a/write_summary/Cargo.toml +++ b/write_summary/Cargo.toml @@ -9,7 +9,7 @@ data_types = { path = "../data_types" } dml = { path = "../dml" } generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } -serde_json = "1.0.83" +serde_json = "1.0.86" snafu = "0.7" workspace-hack = { path = "../workspace-hack"}