diff --git a/Cargo.lock b/Cargo.lock index 2394a12240..33b41a73c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1822,6 +1822,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "flume" +version = "0.10.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" +dependencies = [ + "futures-core", + "futures-sink", + "pin-project", + "spin 0.9.4", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2820,9 +2832,11 @@ dependencies = [ "mutable_batch", "mutable_batch_lp", "observability_deps", + "parking_lot 0.12.1", "paste", "pretty_assertions", "rand", + "serde", "snafu", "sqlx", "sqlx-hotswap-pool", @@ -3316,6 +3330,17 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +[[package]] +name = "libsqlite3-sys" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898745e570c7d0453cc1fbc4a701eb6c662ed54e8fec8b7d14be137ebeeb9d14" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "link-cplusplus" version = "1.0.8" @@ -5442,8 +5467,10 @@ dependencies = [ "dotenvy", "either", "event-listener", + "flume", "futures-channel", "futures-core", + "futures-executor", "futures-intrusive", "futures-util", "hashlink", @@ -5453,6 +5480,7 @@ dependencies = [ "indexmap", "itoa 1.0.5", "libc", + "libsqlite3-sys", "log", "md-5", "memchr", @@ -6372,6 +6400,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -6724,6 +6758,7 @@ dependencies = [ "flate2", "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", diff --git a/clap_blocks/src/catalog_dsn.rs b/clap_blocks/src/catalog_dsn.rs index c23882004e..40ef51c23d 100644 --- a/clap_blocks/src/catalog_dsn.rs +++ b/clap_blocks/src/catalog_dsn.rs @@ -1,4 +1,5 @@ //! Catalog-DSN-related configs. +use iox_catalog::sqlite::{SqliteCatalog, SqliteConnectionOptions}; use iox_catalog::{ create_or_get_default_records, interface::Catalog, @@ -15,6 +16,9 @@ pub enum Error { #[snafu(display("A Postgres connection string in --catalog-dsn is required."))] ConnectionStringRequired, + #[snafu(display("A SQLite connection string in --catalog-dsn is required."))] + ConnectionStringSqliteRequired, + #[snafu(display("A catalog error occurred: {}", source))] Catalog { source: iox_catalog::interface::Error, @@ -44,7 +48,7 @@ fn default_hotswap_poll_interval_timeout() -> &'static str { } /// CLI config for catalog DSN. -#[derive(Debug, Clone, clap::Parser)] +#[derive(Debug, Clone, Default, clap::Parser)] pub struct CatalogDsnConfig { /// The type of catalog to use. "memory" is only useful for testing purposes. #[clap( @@ -110,13 +114,17 @@ pub struct CatalogDsnConfig { } /// Catalog type. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, clap::ValueEnum)] +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, clap::ValueEnum)] pub enum CatalogType { /// PostgreSQL. + #[default] Postgres, /// In-memory. Memory, + + /// SQLite. 
+ Sqlite, } impl CatalogDsnConfig { @@ -127,12 +135,7 @@ impl CatalogDsnConfig { Self { catalog_type_: CatalogType::Memory, - dsn: None, - max_catalog_connections: PostgresConnectionOptions::DEFAULT_MAX_CONNS, - postgres_schema_name: PostgresConnectionOptions::DEFAULT_SCHEMA_NAME.to_string(), - connect_timeout: PostgresConnectionOptions::DEFAULT_CONNECT_TIMEOUT, - idle_timeout: PostgresConnectionOptions::DEFAULT_IDLE_TIMEOUT, - hotswap_poll_interval: PostgresConnectionOptions::DEFAULT_HOTSWAP_POLL_INTERVAL, + ..Self::default() } } @@ -151,6 +154,17 @@ impl CatalogDsnConfig { } } + /// Create a new Postgres instance for all-in-one mode if a catalog DSN is specified + pub fn new_sqlite(dsn: String) -> Self { + info!("Catalog: SQLite at `{}`", dsn); + + Self { + catalog_type_: CatalogType::Sqlite, + dsn: Some(dsn), + ..Self::default() + } + } + /// Get config-dependent catalog. pub async fn get_catalog( &self, @@ -189,6 +203,20 @@ impl CatalogDsnConfig { Arc::new(mem) as Arc } + CatalogType::Sqlite => { + let options = SqliteConnectionOptions { + dsn: self + .dsn + .as_ref() + .context(ConnectionStringSqliteRequiredSnafu)? + .clone(), + }; + Arc::new( + SqliteCatalog::connect(options, metrics) + .await + .context(CatalogSnafu)?, + ) as Arc + } }; Ok(catalog) diff --git a/compactor2/src/compactor_tests.rs b/compactor2/src/compactor_tests.rs index 6afa42729d..45aa2eb16a 100644 --- a/compactor2/src/compactor_tests.rs +++ b/compactor2/src/compactor_tests.rs @@ -3,7 +3,7 @@ mod tests { use std::{num::NonZeroUsize, sync::Arc, time::Duration}; use arrow_util::assert_batches_sorted_eq; - use data_types::CompactionLevel; + use data_types::{CompactionLevel, ParquetFile}; use iox_query::exec::ExecutorType; use tracker::AsyncSemaphoreMetrics; @@ -46,16 +46,10 @@ mod tests { setup.set_compact_version(AlgoVersion::AllAtOnce); // verify 6 files - let files = setup.list_by_table_not_to_delete().await; - assert_eq!(files.len(), 6); - // // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - assert_eq!( - files_and_levels, + let files = setup.list_by_table_not_to_delete().await; + assert_levels( + &files, vec![ (1, CompactionLevel::FileNonOverlapped), (2, CompactionLevel::Initial), @@ -63,30 +57,21 @@ mod tests { (4, CompactionLevel::FileNonOverlapped), (5, CompactionLevel::Initial), (6, CompactionLevel::Initial), - ] + ], ); + // verify ID and max_l0_created_at - let time_provider = Arc::clone(&setup.config.time_provider); - - let time_1_minute_future = time_provider.minutes_into_future(1).timestamp_nanos(); - let time_2_minutes_future = time_provider.minutes_into_future(2).timestamp_nanos(); - let time_3_minutes_future = time_provider.minutes_into_future(3).timestamp_nanos(); - let time_5_minutes_future = time_provider.minutes_into_future(5).timestamp_nanos(); - - let files_and_max_l0_created_ats: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.max_l0_created_at.get())) - .collect(); - assert_eq!( - files_and_max_l0_created_ats, + let times = setup.test_times(); + assert_max_l0_created_at( + &files, vec![ - (1, time_1_minute_future), - (2, time_2_minutes_future), - (3, time_5_minutes_future), - (4, time_3_minutes_future), - (5, time_5_minutes_future), - (6, time_2_minutes_future), - ] + (1, times.time_1_minute_future), + (2, times.time_2_minutes_future), + (3, times.time_5_minutes_future), + (4, times.time_3_minutes_future), + (5, times.time_5_minutes_future), + (6, times.time_2_minutes_future), + ], ); 
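
[Editorial note on the `clap_blocks/src/catalog_dsn.rs` hunk above: a minimal, hypothetical usage sketch of the new SQLite catalog type, shown here for illustration only and not part of the patch. The DSN value, the `open_sqlite_catalog` helper, the `"example"` app name, the registry construction, and the exact `get_catalog(app_name, metrics)` parameter list are assumptions inferred from the surrounding code.]

    use std::sync::Arc;
    use clap_blocks::catalog_dsn::CatalogDsnConfig;

    async fn open_sqlite_catalog() -> Result<(), Box<dyn std::error::Error>> {
        // `new_sqlite` sets `CatalogType::Sqlite` plus the DSN, and fills the
        // remaining fields from the new `Default` impl introduced above.
        // The file path here is purely illustrative.
        let dsn_config = CatalogDsnConfig::new_sqlite("iox_catalog.sqlite3".to_string());

        // `get_catalog` dispatches on the catalog type; for `Sqlite` it builds
        // `SqliteConnectionOptions { dsn }` and calls `SqliteCatalog::connect`.
        let metrics = Arc::new(metric::Registry::default());
        let catalog = dsn_config.get_catalog("example", Arc::clone(&metrics)).await?;

        // Run the bundled `sqlite/migrations` to create the schema.
        catalog.setup().await?;

        Ok(())
    }
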
// compact @@ -94,30 +79,21 @@ mod tests { // verify number of files: 6 files are compacted into 2 files let files = setup.list_by_table_not_to_delete().await; - assert_eq!(files.len(), 2); - // - // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - println!("{files_and_levels:?}"); - assert_eq!( - files_and_levels, + assert_levels( + &files, vec![ (7, CompactionLevel::FileNonOverlapped), (8, CompactionLevel::FileNonOverlapped), - ] + ], ); - // verify ID and max_l0_created_at - let files_and_max_l0_created_ats: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.max_l0_created_at.get())) - .collect(); - // both files have max_l0_created time_5_minutes_future which is the max of all L0 input's max_l0_created_at - assert_eq!( - files_and_max_l0_created_ats, - vec![(7, time_5_minutes_future), (8, time_5_minutes_future),] + assert_max_l0_created_at( + &files, + // both files have max_l0_created time_5_minutes_future + // which is the max of all L0 input's max_l0_created_at + vec![ + (7, times.time_5_minutes_future), + (8, times.time_5_minutes_future), + ], ); // verify the content of files @@ -170,15 +146,8 @@ mod tests { // verify 6 files let files = setup.list_by_table_not_to_delete().await; - assert_eq!(files.len(), 6); - // - // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - assert_eq!( - files_and_levels, + assert_levels( + &files, vec![ (1, CompactionLevel::FileNonOverlapped), (2, CompactionLevel::Initial), @@ -186,30 +155,21 @@ mod tests { (4, CompactionLevel::FileNonOverlapped), (5, CompactionLevel::Initial), (6, CompactionLevel::Initial), - ] + ], ); + // verify ID and max_l0_created_at - let time_provider = Arc::clone(&setup.config.time_provider); - - let time_1_minute_future = time_provider.minutes_into_future(1).timestamp_nanos(); - let time_2_minutes_future = time_provider.minutes_into_future(2).timestamp_nanos(); - let time_3_minutes_future = time_provider.minutes_into_future(3).timestamp_nanos(); - let time_5_minutes_future = time_provider.minutes_into_future(5).timestamp_nanos(); - - let files_and_max_l0_created_ats: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.max_l0_created_at.get())) - .collect(); - assert_eq!( - files_and_max_l0_created_ats, + let times = setup.test_times(); + assert_max_l0_created_at( + &files, vec![ - (1, time_1_minute_future), - (2, time_2_minutes_future), - (3, time_5_minutes_future), - (4, time_3_minutes_future), - (5, time_5_minutes_future), - (6, time_2_minutes_future), - ] + (1, times.time_1_minute_future), + (2, times.time_2_minutes_future), + (3, times.time_5_minutes_future), + (4, times.time_3_minutes_future), + (5, times.time_5_minutes_future), + (6, times.time_2_minutes_future), + ], ); // compact @@ -218,29 +178,23 @@ mod tests { // verify number of files: 6 files are compacted into 2 files let files = setup.list_by_table_not_to_delete().await; assert_eq!(files.len(), 2); - // - // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - println!("{files_and_levels:?}"); - // This is the result of 2-round compaction fomr L0s -> L1s and then L1s -> L2s - // The first round will create two L1 files IDs 7 and 8 - // The second round will create tow L2 file IDs 9 and 10 - assert_eq!( - files_and_levels, - vec![(9, CompactionLevel::Final), 
(10, CompactionLevel::Final),] + + assert_levels( + &files, + // This is the result of 2-round compaction fomr L0s -> L1s and then L1s -> L2s + // The first round will create two L1 files IDs 7 and 8 + // The second round will create tow L2 file IDs 9 and 10 + vec![(9, CompactionLevel::Final), (10, CompactionLevel::Final)], ); - // verify ID and max_l0_created_at - let files_and_max_l0_created_ats: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.max_l0_created_at.get())) - .collect(); - // both files have max_l0_created time_5_minutes_future which is the max of all L0 input's max_l0_created_at - assert_eq!( - files_and_max_l0_created_ats, - vec![(9, time_5_minutes_future), (10, time_5_minutes_future),] + + assert_max_l0_created_at( + &files, + // both files have max_l0_created time_5_minutes_future + // which is the max of all L0 input's max_l0_created_at + vec![ + (9, times.time_5_minutes_future), + (10, times.time_5_minutes_future), + ], ); // verify the content of files @@ -289,26 +243,18 @@ mod tests { // Create a test setup with 6 files let setup = TestSetup::builder().with_files().build().await; + let expected_files_and_levels = vec![ + (1, CompactionLevel::FileNonOverlapped), + (2, CompactionLevel::Initial), + (3, CompactionLevel::Initial), + (4, CompactionLevel::FileNonOverlapped), + (5, CompactionLevel::Initial), + (6, CompactionLevel::Initial), + ]; + // verify 6 files let files = setup.list_by_table_not_to_delete().await; - assert_eq!(files.len(), 6); - // - // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - assert_eq!( - files_and_levels, - vec![ - (1, CompactionLevel::FileNonOverlapped), - (2, CompactionLevel::Initial), - (3, CompactionLevel::Initial), - (4, CompactionLevel::FileNonOverlapped), - (5, CompactionLevel::Initial), - (6, CompactionLevel::Initial), - ] - ); + assert_levels(&files, expected_files_and_levels.clone()); // add the partition into skipped compaction setup @@ -321,24 +267,7 @@ mod tests { // verify still 6 files let files = setup.list_by_table_not_to_delete().await; - assert_eq!(files.len(), 6); - // - // verify ID and compaction level of the files - let files_and_levels: Vec<_> = files - .iter() - .map(|f| (f.id.get(), f.compaction_level)) - .collect(); - assert_eq!( - files_and_levels, - vec![ - (1, CompactionLevel::FileNonOverlapped), - (2, CompactionLevel::Initial), - (3, CompactionLevel::Initial), - (4, CompactionLevel::FileNonOverlapped), - (5, CompactionLevel::Initial), - (6, CompactionLevel::Initial), - ] - ); + assert_levels(&files, expected_files_and_levels.clone()); } #[tokio::test] @@ -478,4 +407,39 @@ mod tests { ) .await; } + + #[track_caller] + fn assert_levels<'a>( + files: impl IntoIterator, + expected_files_and_levels: impl IntoIterator, + ) { + let files_and_levels: Vec<_> = files + .into_iter() + .map(|f| (f.id.get(), f.compaction_level)) + .collect(); + + let expected_files_and_levels: Vec<_> = expected_files_and_levels.into_iter().collect(); + + assert_eq!(files_and_levels, expected_files_and_levels); + } + + #[track_caller] + /// Asserts each parquet file has (id, max_l0_created_at) + fn assert_max_l0_created_at<'a>( + files: impl IntoIterator, + expected_files_and_max_l0_created_ats: impl IntoIterator, + ) { + let files_and_max_l0_created_ats: Vec<_> = files + .into_iter() + .map(|f| (f.id.get(), f.max_l0_created_at.get())) + .collect(); + + let expected_files_and_max_l0_created_ats: Vec<_> = + 
expected_files_and_max_l0_created_ats.into_iter().collect(); + + assert_eq!( + files_and_max_l0_created_ats, + expected_files_and_max_l0_created_ats + ); + } } diff --git a/compactor2/src/components/divide_initial/mod.rs b/compactor2/src/components/divide_initial/mod.rs index c0c0be7018..237c000dab 100644 --- a/compactor2/src/components/divide_initial/mod.rs +++ b/compactor2/src/components/divide_initial/mod.rs @@ -5,5 +5,10 @@ use data_types::ParquetFile; pub mod single_branch; pub trait DivideInitial: Debug + Display + Send + Sync { + /// Divides a group of files that should be compacted into + /// potentially smaller groups called "branches", + /// + /// Each branch is compacted together in a single plan, and each + /// compact plan may produce one or more parquet files. fn divide(&self, files: Vec) -> Vec>; } diff --git a/compactor2/src/components/files_split/mod.rs b/compactor2/src/components/files_split/mod.rs index 964ae6e436..bb75fa61cb 100644 --- a/compactor2/src/components/files_split/mod.rs +++ b/compactor2/src/components/files_split/mod.rs @@ -10,7 +10,12 @@ pub mod target_level_target_level_split; pub mod target_level_upgrade_split; pub trait FilesSplit: Debug + Display + Send + Sync { - /// Split provided files into 2 groups of files. There will be different split needs: + /// Split provided files into 2 groups of files: + /// (files_to_compact, files_to_keep) + /// + /// Only files in files_to_compact are considered for compaction this round + /// + /// There will be different split needs: /// . `[files <= target_level]` and `[files > target_level]` /// . `[overlapping_files]` and `[non_overlapping_files]` /// . `[files_to_upgrade]` and `[files_to_compact]` diff --git a/compactor2/src/components/partition_filter/mod.rs b/compactor2/src/components/partition_filter/mod.rs index e7b02523bc..e9afa9a25d 100644 --- a/compactor2/src/components/partition_filter/mod.rs +++ b/compactor2/src/components/partition_filter/mod.rs @@ -20,11 +20,16 @@ pub mod or; /// /// May return an error. In this case, the partition will be marked as "skipped". /// -/// If you only plan to inspect the ID but not the files and not perform any IO, check -/// [`IdOnlyPartitionFilter`](crate::components::id_only_partition_filter::IdOnlyPartitionFilter) which usually runs -/// earlier in the pipeline and hence is more efficient. +/// If you only plan to inspect the ID but not the files and not +/// perform any IO, check +/// [`IdOnlyPartitionFilter`](crate::components::id_only_partition_filter::IdOnlyPartitionFilter) +/// which usually runs earlier in the pipeline and hence is more +/// efficient. #[async_trait] pub trait PartitionFilter: Debug + Display + Send + Sync { + /// Return `true` if the if the compactor should run a + /// compaction on this partition. Return `false` if this partition + /// does not need any more compaction. 
async fn apply( &self, partition_id: PartitionId, diff --git a/compactor2/src/driver.rs b/compactor2/src/driver.rs index cd07ba8554..7f7253802c 100644 --- a/compactor2/src/driver.rs +++ b/compactor2/src/driver.rs @@ -4,6 +4,7 @@ use data_types::{CompactionLevel, ParquetFile, ParquetFileParams, PartitionId}; use datafusion::physical_plan::SendableRecordBatchStream; use futures::{stream::FuturesOrdered, StreamExt, TryFutureExt, TryStreamExt}; use iox_time::Time; +use observability_deps::tracing::info; use parquet_file::ParquetFilePath; use tracker::InstrumentedAsyncSemaphore; @@ -100,8 +101,8 @@ async fn compact_partition( /// The files are split into non-time-overlaped branches, each is compacted in parallel. /// The output of each branch is then combined and re-branch in next round until /// they should not be compacted based on defined stop conditions. -// -// Example: Partition has 7 files: f1, f2, f3, f4, f5, f6, f7 +/// +/// Example: Partition has 7 files: f1, f2, f3, f4, f5, f6, f7 /// Input: shown by their time range /// |--f1--| |----f3----| |-f4-||-f5-||-f7-| /// |------f2----------| |--f6--| @@ -193,6 +194,7 @@ async fn try_compact_partition( // fetch partition info only if we need it let mut lazy_partition_info = None; + // loop for each "Round", consider each file in the partition loop { files = components.files_filter.apply(files); @@ -218,15 +220,18 @@ async fn try_compact_partition( let mut branches = components.divide_initial.divide(files_now); let mut files_next = files_later; + // loop for each "Branch" while let Some(branch) = branches.pop() { - let input_paths: Vec = branch.iter().map(|f| f.into()).collect(); + let input_paths: Vec = + branch.iter().map(ParquetFilePath::from).collect(); - // Identify the target level and files that should be compacted, upgraded, and - // kept for next round of compaction - let compaction_plan = buil_compaction_plan(branch, Arc::clone(&components))?; + // Identify the target level and files that should be + // compacted together, upgraded, and kept for next round of + // compaction + let compaction_plan = build_compaction_plan(branch, Arc::clone(&components))?; // Compact - let created_file_params = compact_files( + let created_file_params = run_compaction_plan( &compaction_plan.files_to_compact, partition_info, &components, @@ -265,24 +270,30 @@ async fn try_compact_partition( } } -/// Each CompactionPlan specifies the target level and files that should be compacted, upgraded, and -/// kept for next round of compaction +/// A CompactionPlan specifies the parameters for a single, which may +/// generate one or more new parquet files. It includes the target +/// [`CompactionLevel`], the specific files that should be compacted +/// together to form new file(s), files that should be upgraded +/// without chainging, files that should be left unmodified. struct CompactionPlan { - /// Target level to compact to + /// The target level of file resulting from compaction target_level: CompactionLevel, - /// Small and/or overlapped files to compact + /// Files which should be compacted into a new single parquet + /// file, often the small and/or overlapped files files_to_compact: Vec, - /// Non-overlapped and large enough files to upgrade + /// Non-overlapped files that should be upgraded to the target + /// level without rewriting (for example they are of sufficient + /// size) files_to_upgrade: Vec, - /// Non-overlapped or higher-target-level files to keep for next round of compaction + /// files which should not be modified. 
For example, + /// non-overlapped or higher-target-level files files_to_keep: Vec, } -/// Build compaction plan for a given set of files -/// This function will determine the target level to compact to and split the files into -/// files_to_compact, files_to_upgrade, and files_to_keep +/// Build [`CompactionPlan`] for a for a given set of files. +/// +/// # Example: /// -/// Example: /// . Input: /// |--L0.1--| |--L0.2--| |--L0.3--| |--L0.4--| --L0.5--| /// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--| @@ -294,7 +305,7 @@ struct CompactionPlan { /// . files_to_upgrade = [L0.1, L0.5] /// . files_to_compact = [L0.2, L0.3, L0.4, L1.2, L1.3] /// -fn buil_compaction_plan( +fn build_compaction_plan( files: Vec, components: Arc, ) -> Result { @@ -308,7 +319,7 @@ fn buil_compaction_plan( // Since output of one compaction is used as input of next compaction, all files that are not // compacted or upgraded are still kept to consider in next round of compaction - // Split atctual files to compact from its higher-target-level files + // Split actual files to compact from its higher-target-level files // The higher-target-level files are kept for next round of compaction let (files_to_compact, mut files_to_keep) = components .target_level_split @@ -326,6 +337,14 @@ fn buil_compaction_plan( .upgrade_split .apply(files_to_compact, target_level); + info!( + target_level = target_level.to_string(), + files_to_compacts = files_to_compact.len(), + files_to_upgrade = files_to_upgrade.len(), + files_to_keep = files_to_keep.len(), + "Compaction Plan" + ); + Ok(CompactionPlan { target_level, files_to_compact, @@ -334,10 +353,8 @@ fn buil_compaction_plan( }) } -/// Compact into the given target_level -/// This function assumes the input files only include overlapped files of `target_level - 1` -/// and files of target_level. 
-async fn compact_files( +/// Compact `files` into a new parquet file of the the given target_level +async fn run_compaction_plan( files: &[ParquetFile], partition_info: &Arc, components: &Arc, diff --git a/compactor2/src/test_util.rs b/compactor2/src/test_util.rs index 7b675b9178..1be928c6ff 100644 --- a/compactor2/src/test_util.rs +++ b/compactor2/src/test_util.rs @@ -550,6 +550,34 @@ impl TestSetup { let mut config = Arc::get_mut(&mut self.config).unwrap(); config.min_num_l1_files_to_compact = min_num_l1_files_to_compact; } + + /// return a set of times relative to config.time_provider.now() + pub fn test_times(&self) -> TestTimes { + TestTimes::new(self.config.time_provider.as_ref()) + } +} + +/// A collection of nanosecond timestamps relative to now +pub struct TestTimes { + pub time_1_minute_future: i64, + pub time_2_minutes_future: i64, + pub time_3_minutes_future: i64, + pub time_5_minutes_future: i64, +} + +impl TestTimes { + fn new(time_provider: &dyn TimeProvider) -> Self { + let time_1_minute_future = time_provider.minutes_into_future(1).timestamp_nanos(); + let time_2_minutes_future = time_provider.minutes_into_future(2).timestamp_nanos(); + let time_3_minutes_future = time_provider.minutes_into_future(3).timestamp_nanos(); + let time_5_minutes_future = time_provider.minutes_into_future(5).timestamp_nanos(); + Self { + time_1_minute_future, + time_2_minutes_future, + time_3_minutes_future, + time_5_minutes_future, + } + } } pub async fn list_object_store(store: &Arc) -> HashSet { diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index d5ff391953..402ab6e125 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -858,11 +858,8 @@ impl From<&str> for PartitionKey { } } -impl sqlx::Type for PartitionKey -where - DB: sqlx::Database, -{ - fn type_info() -> DB::TypeInfo { +impl sqlx::Type for PartitionKey { + fn type_info() -> sqlx::postgres::PgTypeInfo { // Store this type as VARCHAR sqlx::postgres::PgTypeInfo::with_name("VARCHAR") } @@ -887,6 +884,31 @@ impl sqlx::Decode<'_, sqlx::Postgres> for PartitionKey { } } +impl sqlx::Type for PartitionKey { + fn type_info() -> sqlx::sqlite::SqliteTypeInfo { + >::type_info() + } +} + +impl sqlx::Encode<'_, sqlx::Sqlite> for PartitionKey { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + >::encode(self.0.to_string(), buf) + } +} + +impl sqlx::Decode<'_, sqlx::Sqlite> for PartitionKey { + fn decode( + value: >::ValueRef, + ) -> Result> { + Ok(Self( + >::decode(value)?.into(), + )) + } +} + /// Data object for a partition. The combination of shard, table and key are unique (i.e. only /// one record can exist for each combo) #[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] diff --git a/influxdb_iox/src/commands/compactor.rs b/influxdb_iox/src/commands/compactor.rs index c8a3d3f25e..9a28a7d9e4 100644 --- a/influxdb_iox/src/commands/compactor.rs +++ b/influxdb_iox/src/commands/compactor.rs @@ -14,8 +14,6 @@ use std::{collections::HashMap, sync::Arc}; use crate::process_info::{setup_metric_registry, USIZE_MAX}; -mod generate; - #[derive(Debug, clap::Parser)] pub struct Config { #[clap(subcommand)] @@ -53,21 +51,6 @@ pub enum Command { )] exec_mem_pool_bytes: usize, }, - - /// Generate Parquet files and catalog entries with different characteristics for the purposes - /// of investigating how the compactor handles them. - /// - /// Only works with `--object-store file` because this is for generating local development - /// data. 
- /// - /// Within the directory specified by `--data-dir`, will generate a - /// `compactor_data/line_protocol` subdirectory to avoid interfering with other existing IOx - /// files that may be in the `--data-dir`. - /// - /// WARNING: On every run of this tool, the `compactor_data/line_protocol` subdirectory will be - /// removed. If you want to keep any previously generated files, move or copy them before - /// running this tool again. - Generate(generate::Config), } pub async fn command(config: Config) -> Result<()> { @@ -121,9 +104,6 @@ pub async fn command(config: Config) -> Result<()> { compactor::handler::run_compactor_once(compactor).await; } - Command::Generate(config) => { - generate::run(config).await?; - } } Ok(()) @@ -143,9 +123,6 @@ pub enum Error { #[snafu(context(false))] Compacting { source: ioxd_compactor::Error }, - - #[snafu(context(false))] - Generating { source: generate::Error }, } pub type Result = std::result::Result; diff --git a/influxdb_iox/src/commands/compactor/generate.rs b/influxdb_iox/src/commands/compactor/generate.rs deleted file mode 100644 index de6107790d..0000000000 --- a/influxdb_iox/src/commands/compactor/generate.rs +++ /dev/null @@ -1,685 +0,0 @@ -//! Implements the `compactor generate` command. - -use bytes::Bytes; -use clap::ValueEnum; -use clap_blocks::{ - catalog_dsn::CatalogDsnConfig, - object_store::{make_object_store, ObjectStoreConfig, ObjectStoreType}, -}; -use object_store::DynObjectStore; -use snafu::prelude::*; -use std::{ - ffi::OsStr, fmt::Write, fs, num::NonZeroUsize, path::PathBuf, process::Command, sync::Arc, -}; - -#[derive(Debug, clap::Parser)] -pub struct Config { - #[clap(flatten)] - object_store_config: ObjectStoreConfig, - - #[clap(flatten)] - catalog_dsn: CatalogDsnConfig, - - /// The type of compaction to be done on the files. If `hot` is specified, the generated - /// files will have compaction level 0, will overlap with each other slightly, and will be - /// marked that they were created within the last (approximately) 30 minutes. If `cold` is - /// specified, the generated files will have compaction level 1, won't overlap with each other, - /// and will be marked that they were created between 8 and 24 hours ago. - #[clap( - value_enum, - value_parser, - long = "compaction-type", - env = "INFLUXDB_IOX_COMPACTOR_GENERATE_TYPE", - default_value = "hot", - action - )] - compaction_type: CompactionType, - - /// The number of IOx partitions to generate files for. Each partition will have the number - /// of files specified by `--num-files` generated. - #[clap( - long = "num-partitions", - env = "INFLUXDB_IOX_COMPACTOR_GENERATE_NUM_PARTITIONS", - default_value = "1", - action - )] - num_partitions: NonZeroUsize, - - /// The number of parquet files to generate per partition. - #[clap( - long = "num-files", - env = "INFLUXDB_IOX_COMPACTOR_GENERATE_NUM_FILES", - default_value = "1", - action - )] - num_files: NonZeroUsize, - - /// The number of columns to generate in each file. One column will always be the - /// timestamp. Additional columns will be given a type in I64, F64, String, Bool, and - /// Tag in equal proportion. - #[clap( - long = "num-cols", - env = "INFLUXDB_IOX_COMPACTOR_GENERATE_NUM_COLS", - default_value = "6", - action - )] - num_columns: NonZeroUsize, - - /// The number of rows to generate in each file. 
- #[clap( - long = "num-rows", - env = "INFLUXDB_IOX_COMPACTOR_GENERATE_NUM_ROWS", - default_value = "1", - action - )] - num_rows: NonZeroUsize, -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] -pub enum CompactionType { - Hot, - Cold, -} - -pub async fn run(config: Config) -> Result<()> { - if !matches!( - &config.object_store_config.object_store, - Some(ObjectStoreType::File) - ) { - panic!("Sorry, this tool only works with 'file' object stores."); - } - - let object_store = make_object_store(&config.object_store_config)?; - - let root_dir: PathBuf = config - .object_store_config - .database_directory - .as_ref() - .expect("--data-dir is required and has already been checked") - .into(); - - let compactor_data_dir = root_dir.join("compactor_data"); - let parquet_dir = compactor_data_dir.join("parquet"); - - if compactor_data_dir - .try_exists() - .context(FileExistenceSnafu { - path: &compactor_data_dir, - })? - { - fs::remove_dir_all(&compactor_data_dir).context(RemoveSnafu { - path: &compactor_data_dir, - })?; - } - - let spec_location = "compactor_data/spec.toml"; - let spec_in_root = compactor_data_dir.join("spec.toml"); - - let Config { - compaction_type, - num_rows, - num_files, - .. - } = config; - - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(compaction_type, num_rows.get(), num_files.get()); - - for (file_id, &start_end) in start_end_args - .iter() - .enumerate() - .take(config.num_files.get()) - { - write_data_generation_spec( - file_id, - Arc::clone(&object_store), - config.num_columns.get(), - sampling_interval_ns, - spec_location, - ) - .await?; - - let StartEndMinutesAgo { start, end } = start_end; - - generate_data(&spec_in_root, &parquet_dir, num_rows.get(), start, end)?; - } - - Ok(()) -} - -#[derive(Debug, Snafu)] -pub enum Error { - #[snafu(display("Could not parse the object store configuration"))] - #[snafu(context(false))] - ObjectStoreConfigParsing { - source: clap_blocks::object_store::ParseError, - }, - - #[snafu(display("Could not write file to object storage"))] - ObjectStoreWriting { source: object_store::Error }, - - #[snafu(display("Could not parse object store path"))] - ObjectStorePathParsing { source: object_store::path::Error }, - - #[snafu(display("Subcommand failed: {status}"))] - Subcommand { status: String }, - - #[snafu(display("Could not check for existence of path {}", path.display()))] - FileExistence { - path: PathBuf, - source: std::io::Error, - }, - - #[snafu(display("Could not remove directory {}", path.display()))] - Remove { - path: PathBuf, - source: std::io::Error, - }, -} - -pub type Result = std::result::Result; - -async fn write_data_generation_spec( - file_id: usize, - object_store: Arc, - num_columns: usize, - sampling_interval_ns: usize, - spec_location: &str, -) -> Result<()> { - let object_store_spec_path = - object_store::path::Path::parse(spec_location).context(ObjectStorePathParsingSnafu)?; - - let contents = data_generation_spec_contents(file_id, sampling_interval_ns, num_columns); - let data = Bytes::from(contents); - - object_store - .put(&object_store_spec_path, data) - .await - .context(ObjectStoreWritingSnafu)?; - - Ok(()) -} - -fn generate_data( - spec_in_root: impl AsRef, - parquet_dir: impl AsRef, - num_rows: usize, - start: usize, - end: usize, -) -> Result<()> { - let status = Command::new("cargo") - .arg("run") - .arg("-p") - .arg("iox_data_generator") - .arg("--") - .arg("--specification") - .arg(&spec_in_root) - .arg("--parquet") - 
.arg(&parquet_dir) - .arg("--start") - .arg(&format!("{start} minutes ago")) - .arg("--end") - .arg(&format!("{end} minutes ago")) - .arg("--batch-size") - .arg(num_rows.to_string()) - .status() - .expect("Running the data generator should have worked"); - - ensure!( - status.success(), - SubcommandSnafu { - status: status.to_string() - } - ); - - Ok(()) -} - -fn data_generation_spec_contents( - file_id: usize, - sampling_interval_ns: usize, - num_columns: usize, -) -> String { - let mut spec = format!( - r#" -name = "for_compaction" - -[[database_writers]] -database_ratio = 1.0 -agents = [{{name = "data_{file_id}", sampling_interval = "{sampling_interval_ns}ns"}}] - -[[agents]] -name = "data_{file_id}" - -[[agents.measurements]] -name = "measure" -"# - ); - - // The 1st column is always time, and the data generator always generates a timestamp without - // any configuration needed, so the number of columns that need configuration is one less. - let num_columns = num_columns - 1; - - // Every 5th column will be a tag. - let num_tags = num_columns / 5; - // The remaining columns will be fields of various types. - let num_fields = num_columns - num_tags; - - // Tags go with the measurement, so they have to be specified in the config first. - if num_tags > 0 { - spec.push_str("tag_pairs = [\n"); - for tag_id in 1..=num_tags { - let _ = write!( - spec, - r#" {{key = "tag_{tag_id}", template = "{{{{random 1}}}}", regenerate_after_lines = 1}},"# - ); - spec.push('\n'); - } - spec.push_str("]\n") - } - - for field_id in 0..num_fields { - spec.push_str(&field_spec(field_id)); - spec.push('\n'); - } - - spec -} - -fn field_spec(field_id: usize) -> String { - match field_id % 4 { - 0 => format!( - r#" -[[agents.measurements.fields]] -name = "i64_{field_id}" -i64_range = [0, 100]"# - ), - 1 => format!( - r#" -[[agents.measurements.fields]] -name = "f64_{field_id}" -f64_range = [0.0, 100.0]"# - ), - 2 => format!( - r#" -[[agents.measurements.fields]] -name = "string_{field_id}" -template = "{{{{random 4}}}}""# - ), - 3 => format!( - r#" -[[agents.measurements.fields]] -name = "bool_{field_id}" -bool = true"# - ), - _ => unreachable!("% 4 can only result in 0 - 3"), - } -} - -#[derive(Debug, PartialEq, Clone)] -struct TimeValues { - sampling_interval_ns: usize, - start_end_args: Vec, -} - -#[derive(Debug, PartialEq, Copy, Clone)] -struct StartEndMinutesAgo { - start: usize, - end: usize, -} - -impl TimeValues { - fn new(compaction_type: CompactionType, num_rows: usize, num_files: usize) -> Self { - match compaction_type { - CompactionType::Hot => { - // Make the range approximately 30 min ago to now. - let full_range_start_minutes = 30; - let full_range_end_minutes = 0; - - // Overlap each file by this many minutes on the start and end with other files to - // create realistic level 0 files for hot compaction. - let overlap_minutes = 1; - - Self::inner( - full_range_start_minutes, - full_range_end_minutes, - overlap_minutes, - num_rows, - num_files, - ) - } - CompactionType::Cold => { - // Make the range approximately 24 hours ago to 8 hours ago. 
- let full_range_start_minutes = 24 * 60; - let full_range_end_minutes = 8 * 60; - - // Don't overlap level 1 files - let overlap_minutes = 0; - - Self::inner( - full_range_start_minutes, - full_range_end_minutes, - overlap_minutes, - num_rows, - num_files, - ) - } - } - } - - // Clippy suggests changing `if overlap_minutes == 0 { 1 } else { 0 }` to - // `usize::from(overlap_minutes == 0)`, but I think the original is clearer - #[allow(clippy::bool_to_int_with_if)] - fn inner( - full_range_start_minutes: usize, - full_range_end_minutes: usize, - overlap_minutes: usize, - num_rows: usize, - num_files: usize, - ) -> Self { - // Divide the full range evenly across all files, plus the overlap on each end. - let full_range_length_minutes = full_range_start_minutes - full_range_end_minutes; - let minutes_per_file = full_range_length_minutes / num_files + overlap_minutes * 2; - - // Tell the generator to create one point every this many nanoseconds to create the - // specified number of rows in each file. - let fencepost_num_rows = if num_rows != 1 { - num_rows - 1 - } else { - num_rows - }; - let sampling_interval_ns = (minutes_per_file * 60 * 1_000_000_000) / fencepost_num_rows; - - let start_end_args = (0..num_files) - .rev() - .map(|file_id| StartEndMinutesAgo { - start: minutes_per_file * (file_id + 1) - overlap_minutes * file_id - + full_range_end_minutes, - end: minutes_per_file * file_id - overlap_minutes * file_id - + full_range_end_minutes - // When the overlap is 0, subtract 1 because the data generator is inclusive - - (if overlap_minutes == 0 { 1 } else { 0 }), - }) - .collect(); - - Self { - sampling_interval_ns, - start_end_args, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - mod hot { - use super::*; - - const COMPACTION_TYPE: CompactionType = CompactionType::Hot; - - #[test] - fn one_row_one_file() { - let num_rows = 1; - let num_files = 1; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 1_920_000_000_000); - assert_eq!( - start_end_args, - vec![StartEndMinutesAgo { start: 32, end: 0 }] - ); - } - - #[test] - fn one_thousand_rows_one_file() { - let num_rows = 1_000; - let num_files = 1; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 1_921_921_921); - assert_eq!( - start_end_args, - vec![StartEndMinutesAgo { start: 32, end: 0 }] - ); - } - - #[test] - fn one_row_three_files() { - let num_rows = 1; - let num_files = 3; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 720_000_000_000); - assert_eq!( - start_end_args, - vec![ - StartEndMinutesAgo { start: 34, end: 22 }, - StartEndMinutesAgo { start: 23, end: 11 }, - StartEndMinutesAgo { start: 12, end: 0 }, - ] - ); - } - - #[test] - fn one_thousand_rows_three_files() { - let num_rows = 1_000; - let num_files = 3; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 720_720_720); - assert_eq!( - start_end_args, - vec![ - StartEndMinutesAgo { start: 34, end: 22 }, - StartEndMinutesAgo { start: 23, end: 11 }, - StartEndMinutesAgo { start: 12, end: 0 }, - ] - ); - } - } - - mod cold { - use super::*; - - const COMPACTION_TYPE: CompactionType = CompactionType::Cold; - - #[test] - fn 
one_row_one_file() { - let num_rows = 1; - let num_files = 1; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 57_600_000_000_000); - assert_eq!( - start_end_args, - vec![StartEndMinutesAgo { - start: 24 * 60, - end: 8 * 60 - 1, - }] - ); - } - - #[test] - fn one_thousand_rows_one_file() { - let num_rows = 1_000; - let num_files = 1; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 57_657_657_657); - assert_eq!( - start_end_args, - vec![StartEndMinutesAgo { - start: 24 * 60, - end: 8 * 60 - 1, - }] - ); - } - - #[test] - fn one_row_three_files() { - let num_rows = 1; - let num_files = 3; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 19_200_000_000_000); - assert_eq!( - start_end_args, - vec![ - StartEndMinutesAgo { - start: 1440, - end: 1119, - }, - StartEndMinutesAgo { - start: 1120, - end: 799, - }, - StartEndMinutesAgo { - start: 800, - end: 479, - }, - ] - ); - } - - #[test] - fn one_thousand_rows_three_files() { - let num_rows = 1_000; - let num_files = 3; - let TimeValues { - sampling_interval_ns, - start_end_args, - } = TimeValues::new(COMPACTION_TYPE, num_rows, num_files); - - assert_eq!(sampling_interval_ns, 19_219_219_219); - assert_eq!( - start_end_args, - vec![ - StartEndMinutesAgo { - start: 1440, - end: 1119, - }, - StartEndMinutesAgo { - start: 1120, - end: 799, - }, - StartEndMinutesAgo { - start: 800, - end: 479, - }, - ] - ); - } - } - - #[test] - fn minimal_spec_contents() { - let spec = data_generation_spec_contents(1, 1, 2); - - assert_eq!( - spec, - r#" -name = "for_compaction" - -[[database_writers]] -database_ratio = 1.0 -agents = [{name = "data_1", sampling_interval = "1ns"}] - -[[agents]] -name = "data_1" - -[[agents.measurements]] -name = "measure" - -[[agents.measurements.fields]] -name = "i64_0" -i64_range = [0, 100] -"# - ); - } - - #[test] - fn many_columns_spec_contents() { - let spec = data_generation_spec_contents(3, 100, 12); - - assert_eq!( - spec, - r#" -name = "for_compaction" - -[[database_writers]] -database_ratio = 1.0 -agents = [{name = "data_3", sampling_interval = "100ns"}] - -[[agents]] -name = "data_3" - -[[agents.measurements]] -name = "measure" -tag_pairs = [ - {key = "tag_1", template = "{{random 1}}", regenerate_after_lines = 1}, - {key = "tag_2", template = "{{random 1}}", regenerate_after_lines = 1}, -] - -[[agents.measurements.fields]] -name = "i64_0" -i64_range = [0, 100] - -[[agents.measurements.fields]] -name = "f64_1" -f64_range = [0.0, 100.0] - -[[agents.measurements.fields]] -name = "string_2" -template = "{{random 4}}" - -[[agents.measurements.fields]] -name = "bool_3" -bool = true - -[[agents.measurements.fields]] -name = "i64_4" -i64_range = [0, 100] - -[[agents.measurements.fields]] -name = "f64_5" -f64_range = [0.0, 100.0] - -[[agents.measurements.fields]] -name = "string_6" -template = "{{random 4}}" - -[[agents.measurements.fields]] -name = "bool_7" -bool = true - -[[agents.measurements.fields]] -name = "i64_8" -i64_range = [0, 100] -"# - ); - } -} diff --git a/influxdb_iox/tests/end_to_end_cases/compactor.rs b/influxdb_iox/tests/end_to_end_cases/compactor.rs deleted file mode 100644 index f1796ddafc..0000000000 --- a/influxdb_iox/tests/end_to_end_cases/compactor.rs +++ /dev/null @@ -1,153 +0,0 @@ 
-use arrow::record_batch::RecordBatch; -use assert_cmd::Command; -use datafusion::datasource::object_store::ObjectStoreUrl; -use futures::TryStreamExt; -use object_store::{local::LocalFileSystem, path::Path as ObjectStorePath, ObjectStore}; -use parquet_to_line_protocol::ParquetFileReader; -use predicates::prelude::*; -use std::sync::Arc; -use test_helpers_end_to_end::maybe_skip_integration; - -#[tokio::test] -async fn compactor_generate_has_defaults() { - let database_url = maybe_skip_integration!(); - let dir = tempfile::tempdir() - .expect("could not get temporary directory") - .into_path(); - - Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("compactor") - .arg("generate") - .arg("--catalog-dsn") - .arg(&database_url) - .arg("--object-store") - .arg("file") - .arg("--data-dir") - .arg(&dir) - .assert() - .success(); - let data_generation_spec = dir.join("compactor_data/spec.toml"); - assert!(data_generation_spec.exists()); -} - -#[tokio::test] -async fn compactor_generate_zeroes_are_invalid() { - let database_url = maybe_skip_integration!(); - let dir = tempfile::tempdir().expect("could not get temporary directory"); - - Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("compactor") - .arg("generate") - .arg("--catalog-dsn") - .arg(&database_url) - .arg("--object-store") - .arg("file") - .arg("--data-dir") - .arg(dir.path()) - .arg("--num-partitions") - .arg("0") - .arg("--num-files") - .arg("0") - .arg("--num-cols") - .arg("0") - .arg("--num-rows") - .arg("0") - .assert() - .failure() - .stderr(predicate::str::contains( - "number would be zero for non-zero type", - )); -} - -#[tokio::test] -async fn compactor_generate_creates_files_and_catalog_entries() { - let database_url = maybe_skip_integration!(); - let dir = tempfile::tempdir().expect("could not get temporary directory"); - - Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("compactor") - .arg("generate") - .arg("--catalog-dsn") - .arg(&database_url) - .arg("--object-store") - .arg("file") - .arg("--data-dir") - .arg(dir.path()) - .assert() - .success(); - - let data_generation_spec = dir.path().join("compactor_data/spec.toml"); - assert!(data_generation_spec.exists()); -} - -#[tokio::test] -async fn running_compactor_generate_twice_overwrites_existing_files() { - let database_url = maybe_skip_integration!(); - let dir = tempfile::tempdir().expect("could not get temporary directory"); - - Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("compactor") - .arg("generate") - .arg("--catalog-dsn") - .arg(&database_url) - .arg("--object-store") - .arg("file") - .arg("--data-dir") - .arg(dir.path()) - .assert() - .success(); - - let first_run_data_path = dir - .path() - .join("compactor_data/parquet/data_0_measure.parquet"); - let first_run_record_batches = read_record_batches(&first_run_data_path).await; - assert_eq!(first_run_record_batches.len(), 1); - - let first_run_record_batch = &first_run_record_batches[0]; - let first_run_num_lines = first_run_record_batch.num_rows(); - - Command::cargo_bin("influxdb_iox") - .unwrap() - .arg("compactor") - .arg("generate") - .arg("--catalog-dsn") - .arg(&database_url) - .arg("--object-store") - .arg("file") - .arg("--data-dir") - .arg(dir.path()) - .assert() - .success(); - - let second_run_data_path = dir - .path() - .join("compactor_data/parquet/data_0_measure.parquet"); - let second_run_record_batches = read_record_batches(&second_run_data_path).await; - assert_eq!(second_run_record_batches.len(), 1); - - let second_run_record_batch = &second_run_record_batches[0]; - let 
second_run_num_lines = second_run_record_batch.num_rows(); - - // If generation is appending instead of overwriting, this will fail. - assert_eq!(first_run_num_lines, second_run_num_lines); - - // If generation isn't creating different data every time it's invoked, this will fail. - assert_ne!(first_run_record_batch, second_run_record_batch); -} - -async fn read_record_batches(path: impl AsRef) -> Vec { - let object_store_path = ObjectStorePath::from_filesystem_path(path).unwrap(); - let object_store = Arc::new(LocalFileSystem::new()) as Arc; - let object_store_url = ObjectStoreUrl::local_filesystem(); - let object_meta = object_store.head(&object_store_path).await.unwrap(); - - let reader = ParquetFileReader::try_new(object_store, object_store_url, object_meta) - .await - .unwrap(); - - reader.read().await.unwrap().try_collect().await.unwrap() -} diff --git a/influxdb_iox/tests/end_to_end_cases/mod.rs b/influxdb_iox/tests/end_to_end_cases/mod.rs index 26bb23ccfd..1b3a11d2a2 100644 --- a/influxdb_iox/tests/end_to_end_cases/mod.rs +++ b/influxdb_iox/tests/end_to_end_cases/mod.rs @@ -3,7 +3,6 @@ mod all_in_one; // loading shared libraries: libjemalloc.so.2: cannot open shared object file: No such file or directory" #[cfg(not(feature = "heappy"))] mod cli; -mod compactor; mod debug; mod error; mod flightsql; diff --git a/iox_catalog/.gitignore b/iox_catalog/.gitignore new file mode 100644 index 0000000000..1dc091a735 --- /dev/null +++ b/iox_catalog/.gitignore @@ -0,0 +1 @@ +iox_catalog.sqlite3 \ No newline at end of file diff --git a/iox_catalog/Cargo.toml b/iox_catalog/Cargo.toml index d31c19ab24..8d28999b89 100644 --- a/iox_catalog/Cargo.toml +++ b/iox_catalog/Cargo.toml @@ -14,8 +14,10 @@ log = "0.4" metric = { version = "0.1.0", path = "../metric" } mutable_batch = { path = "../mutable_batch" } observability_deps = { path = "../observability_deps" } +parking_lot = { version = "0.12" } +serde = { version = "1.0", features = ["derive"] } snafu = "0.7" -sqlx = { version = "0.6", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] } +sqlx = { version = "0.6", features = [ "runtime-tokio-rustls" , "postgres", "uuid", "sqlite" ] } sqlx-hotswap-pool = { path = "../sqlx-hotswap-pool" } thiserror = "1.0.38" tokio = { version = "1.25", features = ["io-util", "macros", "parking_lot", "rt-multi-thread", "time"] } diff --git a/iox_catalog/sqlite/migrations/20230203080000_initial_schema.sql b/iox_catalog/sqlite/migrations/20230203080000_initial_schema.sql new file mode 100644 index 0000000000..3196f80cce --- /dev/null +++ b/iox_catalog/sqlite/migrations/20230203080000_initial_schema.sql @@ -0,0 +1,233 @@ +create table if not exists topic +( + id INTEGER not null + constraint kafka_topic_pkey + primary key autoincrement, + name VARCHAR not null + constraint topic_name_unique unique +); + +create table if not exists query_pool +( + id INTEGER NOT NULL + constraint query_pool_pkey + primary key autoincrement, + name varchar not null + constraint query_pool_name_unique + unique +); + +create table if not exists namespace +( + id INTEGER + constraint namespace_pkey + primary key autoincrement, + name varchar not null + constraint namespace_name_unique + unique, + topic_id numeric not null + constraint namespace_kafka_topic_id_fkey + references topic, + query_pool_id numeric not null + references query_pool, + max_tables integer default 10000 not null, + max_columns_per_table integer default 200 not null, + retention_period_ns numeric +); + +create table if not exists table_name +( + id INTEGER + 
constraint table_name_pkey + primary key autoincrement, + namespace_id numeric not null + references namespace + on delete cascade, + name varchar not null, + constraint table_name_unique + unique (namespace_id, name) +); + + +create index if not exists table_name_namespace_idx + on table_name (namespace_id); + +create table if not exists column_name +( + id INTEGER + constraint column_name_pkey + primary key autoincrement, + table_id numeric not null + references table_name + on delete cascade, + name varchar not null, + column_type smallint not null, + constraint column_name_unique + unique (table_id, name) +); + + +create index if not exists column_name_table_idx + on column_name (table_id); + +create table if not exists shard +( + id INTEGER + constraint sequencer_pkey + primary key autoincrement, + topic_id numeric not null + constraint sequencer_kafka_topic_id_fkey + references topic, + shard_index integer not null, + min_unpersisted_sequence_number numeric, + constraint shard_unique + unique (topic_id, shard_index) +); + + +create table if not exists sharding_rule_override +( + id INTEGER + constraint sharding_rule_override_pkey + primary key autoincrement, + namespace_id numeric not null + references namespace, + table_id numeric not null + references table_name, + column_id numeric not null + references column_name +); + + +create table if not exists partition +( + id INTEGER + constraint partition_pkey + primary key autoincrement, + shard_id numeric not null + constraint partition_sequencer_id_fkey + references shard, + table_id numeric not null + references table_name + on delete cascade, + partition_key varchar not null, + sort_key text [] not null, + persisted_sequence_number numeric, + to_delete numeric, + new_file_at numeric, + constraint partition_key_unique + unique (table_id, partition_key) +); + + +create table if not exists parquet_file +( + id INTEGER + constraint parquet_file_pkey + primary key autoincrement, + shard_id numeric not null + constraint parquet_file_sequencer_id_fkey + references shard, + table_id numeric not null + references table_name, + partition_id numeric not null + references partition, + object_store_id uuid not null + constraint parquet_location_unique + unique, + max_sequence_number numeric, + min_time numeric, + max_time numeric, + to_delete numeric, + row_count numeric default 0 not null, + file_size_bytes numeric default 0 not null, + compaction_level smallint default 0 not null, + created_at numeric, + namespace_id numeric not null + references namespace + on delete cascade, + column_set numeric[] not null, + max_l0_created_at numeric default 0 not null +); + + +create index if not exists parquet_file_deleted_at_idx + on parquet_file (to_delete); + +create index if not exists parquet_file_partition_idx + on parquet_file (partition_id); + +create index if not exists parquet_file_table_idx + on parquet_file (table_id); + +create index if not exists parquet_file_shard_compaction_delete_idx + on parquet_file (shard_id, compaction_level, to_delete); + +create index if not exists parquet_file_shard_compaction_delete_created_idx + on parquet_file (shard_id, compaction_level, to_delete, created_at); + +create index if not exists parquet_file_partition_created_idx + on parquet_file (partition_id, created_at); + +create table if not exists tombstone +( + id INTEGER + constraint tombstone_pkey + primary key autoincrement, + table_id numeric not null + references table_name + on delete cascade, + shard_id numeric not null + constraint 
tombstone_sequencer_id_fkey + references shard, + sequence_number numeric not null, + min_time numeric not null, + max_time numeric not null, + serialized_predicate text not null, + constraint tombstone_unique + unique (table_id, shard_id, sequence_number) +); + + +create table if not exists processed_tombstone +( + tombstone_id INTEGER not null + references tombstone, + parquet_file_id numeric not null + references parquet_file + on delete cascade, + primary key (tombstone_id, parquet_file_id) +); + + +create table if not exists skipped_compactions +( + partition_id INTEGER not null + constraint skipped_compactions_pkey + primary key + references partition + on delete cascade, + reason text not null, + skipped_at numeric not null, + num_files numeric, + limit_num_files numeric, + estimated_bytes numeric, + limit_bytes numeric, + limit_num_files_first_in_partition numeric +); + + +create table if not exists billing_summary +( + namespace_id integer not null + constraint billing_summary_pkey + primary key + references namespace + on delete cascade, + total_file_size_bytes numeric not null +); + + +create index if not exists billing_summary_namespace_idx + on billing_summary (namespace_id); + diff --git a/iox_catalog/sqlite/migrations/20230204082400_parquet_file_triggers.sql b/iox_catalog/sqlite/migrations/20230204082400_parquet_file_triggers.sql new file mode 100644 index 0000000000..dfea3ace46 --- /dev/null +++ b/iox_catalog/sqlite/migrations/20230204082400_parquet_file_triggers.sql @@ -0,0 +1,31 @@ +create trigger if not exists update_partition + after insert + on parquet_file + for each row + when NEW.compaction_level < 2 +begin + UPDATE partition set new_file_at = NEW.created_at WHERE id = NEW.partition_id; +end; + +create trigger if not exists update_billing + after insert + on parquet_file + for each row +begin + INSERT INTO billing_summary (namespace_id, total_file_size_bytes) + VALUES (NEW.namespace_id, NEW.file_size_bytes) + ON CONFLICT (namespace_id) DO UPDATE + SET total_file_size_bytes = billing_summary.total_file_size_bytes + NEW.file_size_bytes + WHERE billing_summary.namespace_id = NEW.namespace_id; +end; + +create trigger if not exists decrement_summary + after update + on parquet_file + for each row + when OLD.to_delete IS NULL AND NEW.to_delete IS NOT NULL +begin + UPDATE billing_summary + SET total_file_size_bytes = billing_summary.total_file_size_bytes - OLD.file_size_bytes + WHERE billing_summary.namespace_id = OLD.namespace_id; +end; \ No newline at end of file diff --git a/iox_catalog/src/lib.rs b/iox_catalog/src/lib.rs index 21cc2030ab..19d54ddf63 100644 --- a/iox_catalog/src/lib.rs +++ b/iox_catalog/src/lib.rs @@ -42,6 +42,7 @@ pub mod interface; pub mod mem; pub mod metrics; pub mod postgres; +pub mod sqlite; /// An [`crate::interface::Error`] scoped to a single table for schema validation errors. #[derive(Debug, Error)] diff --git a/iox_catalog/src/sqlite.rs b/iox_catalog/src/sqlite.rs new file mode 100644 index 0000000000..3380c810c4 --- /dev/null +++ b/iox_catalog/src/sqlite.rs @@ -0,0 +1,2920 @@ +//! 
A SQLite backed implementation of the Catalog + +use crate::{ + interface::{ + self, sealed::TransactionFinalize, CasFailure, Catalog, ColumnRepo, + ColumnTypeMismatchSnafu, Error, NamespaceRepo, ParquetFileRepo, PartitionRepo, + ProcessedTombstoneRepo, QueryPoolRepo, RepoCollection, Result, ShardRepo, TableRepo, + TombstoneRepo, TopicMetadataRepo, Transaction, + }, + metrics::MetricDecorator, + DEFAULT_MAX_COLUMNS_PER_TABLE, DEFAULT_MAX_TABLES, SHARED_TOPIC_ID, SHARED_TOPIC_NAME, +}; +use async_trait::async_trait; +use data_types::{ + Column, ColumnId, ColumnSet, ColumnType, ColumnTypeCount, CompactionLevel, Namespace, + NamespaceId, ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionId, + PartitionKey, PartitionParam, ProcessedTombstone, QueryPool, QueryPoolId, SequenceNumber, + Shard, ShardId, ShardIndex, SkippedCompaction, Table, TableId, TablePartition, Timestamp, + Tombstone, TombstoneId, TopicId, TopicMetadata, TRANSITION_SHARD_ID, TRANSITION_SHARD_INDEX, +}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::ops::Deref; + +use iox_time::{SystemProvider, TimeProvider}; +use metric::Registry; +use observability_deps::tracing::{debug, warn}; +use parking_lot::Mutex; +use snafu::prelude::*; +use sqlx::types::Json; +use sqlx::{ + migrate::Migrator, sqlite::SqliteConnectOptions, types::Uuid, Executor, Pool, Row, Sqlite, + SqlitePool, +}; +use std::str::FromStr; +use std::sync::Arc; + +static MIGRATOR: Migrator = sqlx::migrate!("sqlite/migrations"); + +/// Maximum number of files deleted by [`ParquetFileRepo::delete_old_ids_only]. +const MAX_PARQUET_FILES_DELETED_ONCE: i64 = 1_000; + +/// SQLite connection options. +#[derive(Debug, Clone)] +pub struct SqliteConnectionOptions { + /// DSN. + pub dsn: String, +} + +/// SQLite catalog. +#[derive(Debug)] +pub struct SqliteCatalog { + metrics: Arc, + pool: Pool, + time_provider: Arc, +} + +// struct to get return value from "select count(id) ..." query +#[derive(sqlx::FromRow)] +struct Count { + count: i64, +} + +/// transaction for [`SqliteCatalog`]. 
+#[derive(Debug)] +pub struct SqliteTxn { + inner: Mutex, + time_provider: Arc, +} + +#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +enum SqliteTxnInner { + Txn(Option>), + Oneshot(Pool), +} + +impl<'c> Executor<'c> for &'c mut SqliteTxnInner { + type Database = Sqlite; + + #[allow(clippy::type_complexity)] + fn fetch_many<'e, 'q: 'e, E: 'q>( + self, + query: E, + ) -> futures::stream::BoxStream< + 'e, + Result< + sqlx::Either< + ::QueryResult, + ::Row, + >, + sqlx::Error, + >, + > + where + 'c: 'e, + E: sqlx::Execute<'q, Self::Database>, + { + match self { + SqliteTxnInner::Txn(txn) => txn.as_mut().expect("Not yet finalized").fetch_many(query), + SqliteTxnInner::Oneshot(pool) => pool.fetch_many(query), + } + } + + fn fetch_optional<'e, 'q: 'e, E: 'q>( + self, + query: E, + ) -> futures::future::BoxFuture< + 'e, + Result::Row>, sqlx::Error>, + > + where + 'c: 'e, + E: sqlx::Execute<'q, Self::Database>, + { + match self { + SqliteTxnInner::Txn(txn) => txn + .as_mut() + .expect("Not yet finalized") + .fetch_optional(query), + SqliteTxnInner::Oneshot(pool) => pool.fetch_optional(query), + } + } + + fn prepare_with<'e, 'q: 'e>( + self, + sql: &'q str, + parameters: &'e [::TypeInfo], + ) -> futures::future::BoxFuture< + 'e, + Result<>::Statement, sqlx::Error>, + > + where + 'c: 'e, + { + match self { + SqliteTxnInner::Txn(txn) => txn + .as_mut() + .expect("Not yet finalized") + .prepare_with(sql, parameters), + SqliteTxnInner::Oneshot(pool) => pool.prepare_with(sql, parameters), + } + } + + fn describe<'e, 'q: 'e>( + self, + sql: &'q str, + ) -> futures::future::BoxFuture<'e, Result, sqlx::Error>> + where + 'c: 'e, + { + match self { + SqliteTxnInner::Txn(txn) => txn.as_mut().expect("Not yet finalized").describe(sql), + SqliteTxnInner::Oneshot(pool) => pool.describe(sql), + } + } +} + +impl Drop for SqliteTxn { + fn drop(&mut self) { + if let SqliteTxnInner::Txn(Some(_)) = self.inner.lock().deref() { + warn!("Dropping SqliteTxn w/o finalizing (commit or abort)"); + + // SQLx ensures that the inner transaction enqueues a rollback when it is dropped, so + // we don't need to spawn a task here to call `rollback` manually. + } + } +} + +#[async_trait] +impl TransactionFinalize for SqliteTxn { + async fn commit_inplace(&mut self) -> Result<(), Error> { + match self.inner.get_mut() { + SqliteTxnInner::Txn(txn) => txn + .take() + .expect("Not yet finalized") + .commit() + .await + .map_err(|e| Error::SqlxError { source: e }), + SqliteTxnInner::Oneshot(_) => { + panic!("cannot commit oneshot"); + } + } + } + + async fn abort_inplace(&mut self) -> Result<(), Error> { + match self.inner.get_mut() { + SqliteTxnInner::Txn(txn) => txn + .take() + .expect("Not yet finalized") + .rollback() + .await + .map_err(|e| Error::SqlxError { source: e }), + SqliteTxnInner::Oneshot(_) => { + panic!("cannot abort oneshot"); + } + } + } +} + +impl SqliteCatalog { + /// Connect to the catalog store. + pub async fn connect(options: SqliteConnectionOptions, metrics: Arc) -> Result { + let opts = SqliteConnectOptions::from_str(&options.dsn) + .map_err(|e| Error::SqlxError { source: e })? 
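+            // `create_if_missing(true)` lets a file-backed DSN create the database file on
+            // first connect instead of erroring; the schema itself comes from the migrations
+            // run in `setup()`.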
+ .create_if_missing(true); + + let pool = SqlitePool::connect_with(opts) + .await + .map_err(|e| Error::SqlxError { source: e })?; + Ok(Self { + metrics, + pool, + time_provider: Arc::new(SystemProvider::new()), + }) + } +} + +#[async_trait] +impl Catalog for SqliteCatalog { + async fn setup(&self) -> Result<()> { + MIGRATOR + .run(&self.pool) + .await + .map_err(|e| Error::Setup { source: e.into() })?; + + if std::env::var("INFLUXDB_IOX_RPC_MODE").is_ok() { + // We need to manually insert the topic here so that we can create the transition shard below. + sqlx::query( + r#" +INSERT INTO topic (name) +VALUES ($1) +ON CONFLICT (name) +DO NOTHING; + "#, + ) + .bind(SHARED_TOPIC_NAME) + .execute(&self.pool) + .await + .map_err(|e| Error::Setup { source: e })?; + + // The transition shard must exist and must have magic ID and INDEX. + sqlx::query( + r#" +INSERT INTO shard (id, topic_id, shard_index, min_unpersisted_sequence_number) +VALUES ($1, $2, $3, 0) +ON CONFLICT (topic_id, shard_index) +DO NOTHING; + "#, + ) + .bind(TRANSITION_SHARD_ID) + .bind(SHARED_TOPIC_ID) + .bind(TRANSITION_SHARD_INDEX) + .execute(&self.pool) + .await + .map_err(|e| Error::Setup { source: e })?; + } + + Ok(()) + } + + async fn start_transaction(&self) -> Result> { + let transaction = self + .pool + .begin() + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(Box::new(MetricDecorator::new( + SqliteTxn { + inner: Mutex::new(SqliteTxnInner::Txn(Some(transaction))), + time_provider: Arc::clone(&self.time_provider), + }, + Arc::clone(&self.metrics), + ))) + } + + async fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + SqliteTxn { + inner: Mutex::new(SqliteTxnInner::Oneshot(self.pool.clone())), + time_provider: Arc::clone(&self.time_provider), + }, + Arc::clone(&self.metrics), + )) + } + + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +#[async_trait] +impl RepoCollection for SqliteTxn { + fn topics(&mut self) -> &mut dyn TopicMetadataRepo { + self + } + + fn query_pools(&mut self) -> &mut dyn QueryPoolRepo { + self + } + + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn shards(&mut self) -> &mut dyn ShardRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn tombstones(&mut self) -> &mut dyn TombstoneRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } + + fn processed_tombstones(&mut self) -> &mut dyn ProcessedTombstoneRepo { + self + } +} + +#[async_trait] +impl TopicMetadataRepo for SqliteTxn { + async fn create_or_get(&mut self, name: &str) -> Result { + let rec = sqlx::query_as::<_, TopicMetadata>( + r#" +INSERT INTO topic ( name ) +VALUES ( $1 ) +ON CONFLICT (name) +DO UPDATE SET name = topic.name +RETURNING *; + "#, + ) + .bind(name) // $1 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn get_by_name(&mut self, name: &str) -> Result> { + let rec = sqlx::query_as::<_, TopicMetadata>( + r#" +SELECT * +FROM topic +WHERE name = $1; + "#, + ) + .bind(name) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let topic = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(topic)) + } +} + +#[async_trait] +impl QueryPoolRepo for 
SqliteTxn { + async fn create_or_get(&mut self, name: &str) -> Result { + let rec = sqlx::query_as::<_, QueryPool>( + r#" +INSERT INTO query_pool ( name ) +VALUES ( $1 ) +ON CONFLICT (name) +DO UPDATE SET name = query_pool.name +RETURNING *; + "#, + ) + .bind(name) // $1 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } +} + +#[async_trait] +impl NamespaceRepo for SqliteTxn { + async fn create( + &mut self, + name: &str, + retention_period_ns: Option, + topic_id: TopicId, + query_pool_id: QueryPoolId, + ) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" + INSERT INTO namespace ( name, topic_id, query_pool_id, retention_period_ns, max_tables ) + VALUES ( $1, $2, $3, $4, $5 ) + RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(topic_id) // $2 + .bind(query_pool_id) // $3 + .bind(retention_period_ns) // $4 + .bind(DEFAULT_MAX_TABLES); // $5 + + let rec = rec.fetch_one(self.inner.get_mut()).await.map_err(|e| { + if is_unique_violation(&e) { + Error::NameExists { + name: name.to_string(), + } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + // Ensure the column default values match the code values. + debug_assert_eq!(rec.max_tables, DEFAULT_MAX_TABLES); + debug_assert_eq!(rec.max_columns_per_table, DEFAULT_MAX_COLUMNS_PER_TABLE); + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + r#" +SELECT * +FROM namespace; + "#, + ) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn get_by_id(&mut self, id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + r#" +SELECT * +FROM namespace +WHERE id = $1; + "#, + ) + .bind(id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(namespace)) + } + + async fn get_by_name(&mut self, name: &str) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + r#" +SELECT * +FROM namespace +WHERE name = $1; + "#, + ) + .bind(name) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(namespace)) + } + + async fn delete(&mut self, name: &str) -> Result<()> { + // note that there is a uniqueness constraint on the name column in the DB + sqlx::query( + r#" +DELETE FROM namespace +WHERE name = $1; + "#, + ) + .bind(name) + .execute(self.inner.get_mut()) + .await + .context(interface::CouldNotDeleteNamespaceSnafu) + .map(|_| ()) + } + + async fn update_table_limit(&mut self, name: &str, new_max: i32) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" +UPDATE namespace +SET max_tables = $1 +WHERE name = $2 +RETURNING *; + "#, + ) + .bind(new_max) + .bind(name) + .fetch_one(self.inner.get_mut()) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { + name: name.to_string(), + }, + _ => Error::SqlxError { source: e }, + })?; + + Ok(namespace) + } + + async fn update_column_limit(&mut self, name: &str, new_max: i32) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" +UPDATE namespace +SET max_columns_per_table = $1 +WHERE name = $2 +RETURNING *; + "#, + ) + .bind(new_max) + .bind(name) + 
.fetch_one(self.inner.get_mut()) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { + name: name.to_string(), + }, + _ => Error::SqlxError { source: e }, + })?; + + Ok(namespace) + } + + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#"UPDATE namespace SET retention_period_ns = $1 WHERE name = $2 RETURNING *;"#, + ) + .bind(retention_period_ns) // $1 + .bind(name) // $2 + .fetch_one(self.inner.get_mut()) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { + name: name.to_string(), + }, + _ => Error::SqlxError { source: e }, + })?; + + Ok(namespace) + } +} + +#[async_trait] +impl TableRepo for SqliteTxn { + async fn create_or_get(&mut self, name: &str, namespace_id: NamespaceId) -> Result { + // A simple insert statement becomes quite complicated in order to avoid checking the table + // limits in a select and then conditionally inserting (which would be racey). + // + // from https://www.postgresql.org/docs/current/sql-insert.html + // "INSERT inserts new rows into a table. One can insert one or more rows specified by + // value expressions, or zero or more rows resulting from a query." + // By using SELECT rather than VALUES it will insert zero rows if it finds a null in the + // subquery, i.e. if count >= max_tables. fetch_one() will return a RowNotFound error if + // nothing was inserted. Not pretty! + let rec = sqlx::query_as::<_, Table>( + r#" +INSERT INTO table_name ( name, namespace_id ) +SELECT $1, id FROM ( + SELECT namespace.id AS id, max_tables, COUNT(table_name.id) AS count + FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id + WHERE namespace.id = $2 + GROUP BY namespace.max_tables, table_name.namespace_id, namespace.id +) AS get_count WHERE count < max_tables +ON CONFLICT (namespace_id, name) +DO UPDATE SET name = table_name.name +RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(namespace_id) // $2 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| match e { + sqlx::Error::RowNotFound => Error::TableCreateLimitError { + table_name: name.to_string(), + namespace_id, + }, + _ => { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + } + })?; + + Ok(rec) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(table)) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1 AND name = $2; + "#, + ) + .bind(namespace_id) // $1 + .bind(name) // $2 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(table)) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1; + "#, + ) + 
.bind(namespace_id) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } +} + +#[async_trait] +impl ColumnRepo for SqliteTxn { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + let rec = sqlx::query_as::<_, Column>( + r#" +INSERT INTO column_name ( name, table_id, column_type ) +SELECT $1, table_id, $3 FROM ( + SELECT max_columns_per_table, namespace.id, table_name.id as table_id, COUNT(column_name.id) AS count + FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id + LEFT JOIN column_name ON table_name.id = column_name.table_id + WHERE table_name.id = $2 + GROUP BY namespace.max_columns_per_table, namespace.id, table_name.id +) AS get_count WHERE count < max_columns_per_table +ON CONFLICT (table_id, name) +DO UPDATE SET name = column_name.name +RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(table_id) // $2 + .bind(column_type) // $3 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| match e { + sqlx::Error::RowNotFound => Error::ColumnCreateLimitError { + column_name: name.to_string(), + table_id, + }, + _ => { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + }})?; + + ensure!( + rec.column_type == column_type, + ColumnTypeMismatchSnafu { + name, + existing: rec.column_type, + new: column_type, + } + ); + + Ok(rec) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT column_name.* FROM table_name +INNER JOIN column_name on column_name.table_id = table_name.id +WHERE table_name.namespace_id = $1; + "#, + ) + .bind(namespace_id) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT * FROM column_name +WHERE table_id = $1; + "#, + ) + .bind(table_id) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(rec) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let num_columns = columns.len(); + #[derive(Deserialize, Serialize)] + struct NameType<'a> { + name: &'a str, + column_type: i8, + } + impl<'a> NameType<'a> { + fn from(value: (&&'a str, &ColumnType)) -> Self { + Self { + name: value.0, + column_type: *value.1 as i8, + } + } + } + let cols = columns.iter().map(NameType::<'_>::from).collect::>(); + + // The `ORDER BY` in this statement is important to avoid deadlocks during concurrent + // writes to the same IOx table that each add many new columns. 
See: + // + // - + // - + // - + let out = sqlx::query_as::<_, Column>( + r#" +INSERT INTO column_name ( name, table_id, column_type ) +SELECT a.value ->> 'name' AS name, $1, a.value ->> 'column_type' AS column_type +FROM json_each($2) as a +ORDER BY name +ON CONFLICT (table_id, name) +DO UPDATE SET name = column_name.name +RETURNING *; + "#, + ) + .bind(table_id) // $1 + .bind(&Json(cols)) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + assert_eq!(num_columns, out.len()); + + for existing in &out { + let want = columns.get(existing.name.as_str()).unwrap(); + ensure!( + existing.column_type == *want, + ColumnTypeMismatchSnafu { + name: &existing.name, + existing: existing.column_type, + new: *want, + } + ); + } + + Ok(out) + } + + async fn list_type_count_by_table_id( + &mut self, + table_id: TableId, + ) -> Result> { + sqlx::query_as::<_, ColumnTypeCount>( + r#" +select column_type as col_type, count(1) AS count from column_name where table_id = $1 group by 1; + "#, + ) + .bind(table_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } +} + +#[async_trait] +impl ShardRepo for SqliteTxn { + async fn create_or_get( + &mut self, + topic: &TopicMetadata, + shard_index: ShardIndex, + ) -> Result { + sqlx::query_as::<_, Shard>( + r#" +INSERT INTO shard + ( topic_id, shard_index, min_unpersisted_sequence_number ) +VALUES + ( $1, $2, 0 ) +ON CONFLICT (topic_id, shard_index) +DO UPDATE SET topic_id = shard.topic_id +RETURNING *; + "#, + ) + .bind(topic.id) // $1 + .bind(shard_index) // $2 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + }) + } + + async fn get_by_topic_id_and_shard_index( + &mut self, + topic_id: TopicId, + shard_index: ShardIndex, + ) -> Result> { + let rec = sqlx::query_as::<_, Shard>( + r#" +SELECT * +FROM shard +WHERE topic_id = $1 + AND shard_index = $2; + "#, + ) + .bind(topic_id) // $1 + .bind(shard_index) // $2 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let shard = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(shard)) + } + + async fn list(&mut self) -> Result> { + sqlx::query_as::<_, Shard>(r#"SELECT * FROM shard;"#) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn list_by_topic(&mut self, topic: &TopicMetadata) -> Result> { + sqlx::query_as::<_, Shard>(r#"SELECT * FROM shard WHERE topic_id = $1;"#) + .bind(topic.id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn update_min_unpersisted_sequence_number( + &mut self, + shard_id: ShardId, + sequence_number: SequenceNumber, + ) -> Result<()> { + let _ = sqlx::query( + r#" +UPDATE shard +SET min_unpersisted_sequence_number = $1 +WHERE id = $2; + "#, + ) + .bind(sequence_number.get()) // $1 + .bind(shard_id) // $2 + .execute(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(()) + } +} + +// We can't use [`Partition`], as uses Vec which the Sqlite +// driver cannot serialise + +#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] +struct PartitionPod { + id: PartitionId, + shard_id: ShardId, + table_id: TableId, + partition_key: PartitionKey, + 
sort_key: Json>, + persisted_sequence_number: Option, + new_file_at: Option, +} + +impl From for Partition { + fn from(value: PartitionPod) -> Self { + Self { + id: value.id, + shard_id: value.shard_id, + table_id: value.table_id, + partition_key: value.partition_key, + sort_key: value.sort_key.0, + persisted_sequence_number: value.persisted_sequence_number, + new_file_at: value.new_file_at, + } + } +} + +#[async_trait] +impl PartitionRepo for SqliteTxn { + async fn create_or_get( + &mut self, + key: PartitionKey, + shard_id: ShardId, + table_id: TableId, + ) -> Result { + // Note: since sort_key is now an array, we must explicitly insert '{}' which is an empty + // array rather than NULL which sqlx will throw `UnexpectedNullError` while is is doing + // `ColumnDecode` + + let v = sqlx::query_as::<_, PartitionPod>( + r#" +INSERT INTO partition + ( partition_key, shard_id, table_id, sort_key) +VALUES + ( $1, $2, $3, '[]') +ON CONFLICT (table_id, partition_key) +DO UPDATE SET partition_key = partition.partition_key +RETURNING *; + "#, + ) + .bind(key) // $1 + .bind(shard_id) // $2 + .bind(table_id) // $3 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + // If the partition_key_unique constraint was hit because there was an + // existing record for (table_id, partition_key) ensure the partition + // key in the DB is mapped to the same shard_id the caller + // requested. + assert_eq!( + v.shard_id, shard_id, + "attempted to overwrite partition with different shard ID" + ); + + Ok(v.into()) + } + + async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { + let rec = sqlx::query_as::<_, PartitionPod>(r#"SELECT * FROM partition WHERE id = $1;"#) + .bind(partition_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let partition = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(partition.into())) + } + + async fn list_by_shard(&mut self, shard_id: ShardId) -> Result> { + Ok( + sqlx::query_as::<_, PartitionPod>(r#"SELECT * FROM partition WHERE shard_id = $1;"#) + .bind(shard_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect(), + ) + } + + async fn list_by_namespace(&mut self, namespace_id: NamespaceId) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#" +SELECT partition.* +FROM table_name +INNER JOIN partition on partition.table_id = table_name.id +WHERE table_name.namespace_id = $1; + "#, + ) + .bind(namespace_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#" +SELECT * +FROM partition +WHERE table_id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + /// Update the sort key for `partition_id` if and only if `old_sort_key` + /// matches the current value in the database. + /// + /// This compare-and-swap operation is allowed to spuriously return + /// [`CasFailure::ValueMismatch`] for performance reasons (avoiding multiple + /// round trips to service a transaction in the happy path). 
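+    ///
+    /// A usage sketch (hypothetical caller, not part of this change):
+    ///
+    /// ```ignore
+    /// match repos.partitions().cas_sort_key(partition_id, None, &["host", "time"]).await {
+    ///     Ok(p) => println!("sort key set to {:?}", p.sort_key),
+    ///     Err(CasFailure::ValueMismatch(observed)) => {
+    ///         // Another writer got there first; `observed` is the sort key now in the catalog.
+    ///     }
+    ///     Err(CasFailure::QueryError(e)) => return Err(e),
+    /// }
+    /// ```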
+ async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key: Option>, + new_sort_key: &[&str], + ) -> Result>> { + let old_sort_key = old_sort_key.unwrap_or_default(); + let res = sqlx::query_as::<_, PartitionPod>( + r#" +UPDATE partition +SET sort_key = $1 +WHERE id = $2 AND sort_key = $3 +RETURNING *; + "#, + ) + .bind(Json(new_sort_key)) // $1 + .bind(partition_id) // $2 + .bind(Json(&old_sort_key)) // $3 + .fetch_one(self.inner.get_mut()) + .await; + + let partition = match res { + Ok(v) => v, + Err(sqlx::Error::RowNotFound) => { + // This update may have failed either because: + // + // * A row with the specified ID did not exist at query time + // (but may exist now!) + // * The sort key does not match. + // + // To differentiate, we submit a get partition query, returning + // the actual sort key if successful. + // + // NOTE: this is racy, but documented - this might return "Sort + // key differs! Old key: " + return Err(CasFailure::ValueMismatch( + PartitionRepo::get_by_id(self, partition_id) + .await + .map_err(CasFailure::QueryError)? + .ok_or(CasFailure::QueryError(Error::PartitionNotFound { + id: partition_id, + }))? + .sort_key, + )); + } + Err(e) => return Err(CasFailure::QueryError(Error::SqlxError { source: e })), + }; + + debug!( + ?partition_id, + ?old_sort_key, + ?new_sort_key, + "partition sort key cas successful" + ); + + Ok(partition.into()) + } + + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + sqlx::query( + r#" +INSERT INTO skipped_compactions + ( partition_id, reason, num_files, limit_num_files, limit_num_files_first_in_partition, estimated_bytes, limit_bytes, skipped_at ) +VALUES + ( $1, $2, $3, $4, $5, $6, $7, $8 ) +ON CONFLICT ( partition_id ) +DO UPDATE +SET +reason = EXCLUDED.reason, +num_files = EXCLUDED.num_files, +limit_num_files = EXCLUDED.limit_num_files, +limit_num_files_first_in_partition = EXCLUDED.limit_num_files_first_in_partition, +estimated_bytes = EXCLUDED.estimated_bytes, +limit_bytes = EXCLUDED.limit_bytes, +skipped_at = EXCLUDED.skipped_at; + "#, + ) + .bind(partition_id) // $1 + .bind(reason) + .bind(num_files as i64) + .bind(limit_num_files as i64) + .bind(limit_num_files_first_in_partition as i64) + .bind(estimated_bytes as i64) + .bind(limit_bytes as i64) + .bind(std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs() as i64) + .execute(self.inner.get_mut()) + .await + .context(interface::CouldNotRecordSkippedCompactionSnafu { partition_id })?; + Ok(()) + } + + async fn get_in_skipped_compaction( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let rec = sqlx::query_as::<_, SkippedCompaction>( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, + ) + .bind(partition_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let skipped_partition_record = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(skipped_partition_record)) + } + + async fn list_skipped_compactions(&mut self) -> Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +SELECT * FROM skipped_compactions + "#, + ) + .fetch_all(self.inner.get_mut()) + .await + .context(interface::CouldNotListSkippedCompactionsSnafu) + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> 
Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +DELETE FROM skipped_compactions +WHERE partition_id = $1 +RETURNING * + "#, + ) + .bind(partition_id) + .fetch_optional(self.inner.get_mut()) + .await + .context(interface::CouldNotDeleteSkippedCompactionsSnafu) + } + + async fn update_persisted_sequence_number( + &mut self, + partition_id: PartitionId, + sequence_number: SequenceNumber, + ) -> Result<()> { + let _ = sqlx::query( + r#" +UPDATE partition +SET persisted_sequence_number = $1 +WHERE id = $2; + "#, + ) + .bind(sequence_number.get()) // $1 + .bind(partition_id) // $2 + .execute(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(()) + } + + async fn most_recent_n(&mut self, n: usize, shards: &[ShardId]) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#"SELECT * FROM partition WHERE shard_id IN (SELECT value FROM json_each($1)) ORDER BY id DESC LIMIT $2;"#, + ) + .bind(&Json(shards.iter().map(|v| v.get()).collect::>())) + .bind(n as i64) + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn partitions_with_recent_created_files( + &mut self, + time_in_the_past: Timestamp, + max_num_partitions: usize, + ) -> Result> { + sqlx::query_as( + r#" + SELECT p.id as partition_id, p.table_id, t.namespace_id, p.shard_id + FROM partition p, table_name t + WHERE p.new_file_at > $1 + AND p.table_id = t.id + LIMIT $2; + "#, + ) + .bind(time_in_the_past) // $1 + .bind(max_num_partitions as i64) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn partitions_to_compact(&mut self, recent_time: Timestamp) -> Result> { + sqlx::query_as( + r#" + SELECT p.id as partition_id + FROM partition p + WHERE p.new_file_at > $1 + "#, + ) + .bind(recent_time) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } +} + +#[async_trait] +impl TombstoneRepo for SqliteTxn { + async fn create_or_get( + &mut self, + table_id: TableId, + shard_id: ShardId, + sequence_number: SequenceNumber, + min_time: Timestamp, + max_time: Timestamp, + predicate: &str, + ) -> Result { + let v = sqlx::query_as::<_, Tombstone>( + r#" +INSERT INTO tombstone + ( table_id, shard_id, sequence_number, min_time, max_time, serialized_predicate ) +VALUES + ( $1, $2, $3, $4, $5, $6 ) +ON CONFLICT (table_id, shard_id, sequence_number) +DO UPDATE SET table_id = tombstone.table_id +RETURNING *; + "#, + ) + .bind(table_id) // $1 + .bind(shard_id) // $2 + .bind(sequence_number) // $3 + .bind(min_time) // $4 + .bind(max_time) // $5 + .bind(predicate) // $6 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + // If tombstone_unique is hit, a record with (table_id, shard_id, + // sequence_number) already exists. + // + // Ensure the caller does not falsely believe they have created the + // record with the provided values if the DB row contains different + // values. 
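+        //
+        // (The predicate assertion is exercised by `test_tombstone_create_or_get_no_overwrite`
+        // in the tests below.)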
+ assert_eq!( + v.min_time, min_time, + "attempted to overwrite min_time in tombstone record" + ); + assert_eq!( + v.max_time, max_time, + "attempted to overwrite max_time in tombstone record" + ); + assert_eq!( + v.serialized_predicate, predicate, + "attempted to overwrite predicate in tombstone record" + ); + + Ok(v) + } + + async fn list_by_namespace(&mut self, namespace_id: NamespaceId) -> Result> { + sqlx::query_as::<_, Tombstone>( + r#" +SELECT + tombstone.id as id, + tombstone.table_id as table_id, + tombstone.shard_id as shard_id, + tombstone.sequence_number as sequence_number, + tombstone.min_time as min_time, + tombstone.max_time as max_time, + tombstone.serialized_predicate as serialized_predicate +FROM table_name +INNER JOIN tombstone on tombstone.table_id = table_name.id +WHERE table_name.namespace_id = $1; + "#, + ) + .bind(namespace_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn list_by_table(&mut self, table_id: TableId) -> Result> { + sqlx::query_as::<_, Tombstone>( + r#" +SELECT * +FROM tombstone +WHERE table_id = $1 +ORDER BY id; + "#, + ) + .bind(table_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn get_by_id(&mut self, id: TombstoneId) -> Result> { + let rec = sqlx::query_as::<_, Tombstone>( + r#" +SELECT * +FROM tombstone +WHERE id = $1; + "#, + ) + .bind(id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let tombstone = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(tombstone)) + } + + async fn list_tombstones_by_shard_greater_than( + &mut self, + shard_id: ShardId, + sequence_number: SequenceNumber, + ) -> Result> { + sqlx::query_as::<_, Tombstone>( + r#" +SELECT * +FROM tombstone +WHERE shard_id = $1 + AND sequence_number > $2 +ORDER BY id; + "#, + ) + .bind(shard_id) // $1 + .bind(sequence_number) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn remove(&mut self, tombstone_ids: &[TombstoneId]) -> Result<()> { + let ids: Vec<_> = tombstone_ids.iter().map(|t| t.get()).collect(); + + // Remove processed tombstones first + sqlx::query( + r#" +DELETE +FROM processed_tombstone +WHERE tombstone_id IN (SELECT value FROM json_each($1)); + "#, + ) + .bind(Json(&ids[..])) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + // Remove tombstones + sqlx::query( + r#" +DELETE +FROM tombstone +WHERE id IN (SELECT value FROM json_each($1)); + "#, + ) + .bind(Json(&ids[..])) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(()) + } + + async fn list_tombstones_for_time_range( + &mut self, + shard_id: ShardId, + table_id: TableId, + sequence_number: SequenceNumber, + min_time: Timestamp, + max_time: Timestamp, + ) -> Result> { + sqlx::query_as::<_, Tombstone>( + r#" +SELECT * +FROM tombstone +WHERE shard_id = $1 + AND table_id = $2 + AND sequence_number > $3 + AND ((min_time <= $4 AND max_time >= $4) + OR (min_time > $4 AND min_time <= $5)) +ORDER BY id; + "#, + ) + .bind(shard_id) // $1 + .bind(table_id) // $2 + .bind(sequence_number) // $3 + .bind(min_time) // $4 + .bind(max_time) // $5 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } +} + +fn from_column_set(v: &ColumnSet) -> Json> { + Json((*v).iter().map(ColumnId::get).collect()) +} 
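+
+// `ColumnSet` values are persisted as a JSON array of i64 column IDs, since the SQLite driver
+// has no native array type: `from_column_set` serialises on write and `to_column_set` rebuilds
+// the set when rows are read back, e.g. `ColumnSet::new([ColumnId::new(1), ColumnId::new(2)])`
+// round-trips through the text `[1,2]`.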
+ +fn to_column_set(v: &Json>) -> ColumnSet { + ColumnSet::new(v.0.iter().map(|v| ColumnId::new(*v))) +} + +#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] +struct ParquetFilePod { + id: ParquetFileId, + shard_id: ShardId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + object_store_id: Uuid, + max_sequence_number: SequenceNumber, + min_time: Timestamp, + max_time: Timestamp, + to_delete: Option, + file_size_bytes: i64, + row_count: i64, + compaction_level: CompactionLevel, + created_at: Timestamp, + column_set: Json>, + max_l0_created_at: Timestamp, +} + +impl From for ParquetFile { + fn from(value: ParquetFilePod) -> Self { + Self { + id: value.id, + shard_id: value.shard_id, + namespace_id: value.namespace_id, + table_id: value.table_id, + partition_id: value.partition_id, + object_store_id: value.object_store_id, + max_sequence_number: value.max_sequence_number, + min_time: value.min_time, + max_time: value.max_time, + to_delete: value.to_delete, + file_size_bytes: value.file_size_bytes, + row_count: value.row_count, + compaction_level: value.compaction_level, + created_at: value.created_at, + column_set: to_column_set(&value.column_set), + max_l0_created_at: value.max_l0_created_at, + } + } +} + +#[async_trait] +impl ParquetFileRepo for SqliteTxn { + async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { + let ParquetFileParams { + shard_id, + namespace_id, + table_id, + partition_id, + object_store_id, + max_sequence_number, + min_time, + max_time, + file_size_bytes, + row_count, + compaction_level, + created_at, + column_set, + max_l0_created_at, + } = parquet_file_params; + + let rec = sqlx::query_as::<_, ParquetFilePod>( + r#" +INSERT INTO parquet_file ( + shard_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, file_size_bytes, + row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14 ) +RETURNING *; + "#, + ) + .bind(shard_id) // $1 + .bind(table_id) // $2 + .bind(partition_id) // $3 + .bind(object_store_id) // $4 + .bind(max_sequence_number) // $5 + .bind(min_time) // $6 + .bind(max_time) // $7 + .bind(file_size_bytes) // $8 + .bind(row_count) // $9 + .bind(compaction_level) // $10 + .bind(created_at) // $11 + .bind(namespace_id) // $12 + .bind(from_column_set(&column_set)) // $13 + .bind(max_l0_created_at) // $14 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_unique_violation(&e) { + Error::FileExists { object_store_id } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + })?; + + Ok(rec.into()) + } + + async fn flag_for_delete(&mut self, id: ParquetFileId) -> Result<()> { + let marked_at = Timestamp::from(self.time_provider.now()); + + let _ = sqlx::query(r#"UPDATE parquet_file SET to_delete = $1 WHERE id = $2;"#) + .bind(marked_at) // $1 + .bind(id) // $2 + .execute(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(()) + } + + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let flagged_at = Timestamp::from(self.time_provider.now()); + // TODO - include check of table retention period once implemented + let flagged = sqlx::query( + r#" + UPDATE parquet_file + SET to_delete = $1 + FROM namespace + WHERE namespace.retention_period_ns IS NOT NULL + AND parquet_file.to_delete IS NULL + AND parquet_file.max_time < $1 - 
namespace.retention_period_ns + AND namespace.id = parquet_file.namespace_id + RETURNING parquet_file.id; + "#, + ) + .bind(flagged_at) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + let flagged = flagged.into_iter().map(|row| row.get("id")).collect(); + Ok(flagged) + } + + async fn list_by_shard_greater_than( + &mut self, + shard_id: ShardId, + sequence_number: SequenceNumber, + ) -> Result> { + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE shard_id = $1 + AND max_sequence_number > $2 +ORDER BY id; + "#, + ) + .bind(shard_id) // $1 + .bind(sequence_number) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn list_by_namespace_not_to_delete( + &mut self, + namespace_id: NamespaceId, + ) -> Result> { + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT parquet_file.id, parquet_file.shard_id, parquet_file.namespace_id, + parquet_file.table_id, parquet_file.partition_id, parquet_file.object_store_id, + parquet_file.max_sequence_number, parquet_file.min_time, + parquet_file.max_time, parquet_file.to_delete, parquet_file.file_size_bytes, + parquet_file.row_count, parquet_file.compaction_level, parquet_file.created_at, parquet_file.column_set, + parquet_file.max_l0_created_at +FROM parquet_file +INNER JOIN table_name on table_name.id = parquet_file.table_id +WHERE table_name.namespace_id = $1 + AND parquet_file.to_delete IS NULL; + "#, + ) + .bind(namespace_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE table_id = $1 AND to_delete IS NULL; + "#, + ) + .bind(table_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn delete_old(&mut self, older_than: Timestamp) -> Result> { + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +DELETE FROM parquet_file +WHERE to_delete < $1 +RETURNING *; + "#, + ) + .bind(older_than) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? 
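+        // Rows come back as `ParquetFilePod` and are converted into `ParquetFile`, rebuilding
+        // the column set from its JSON representation.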
+ .into_iter() + .map(Into::into) + .collect()) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-sqlite-ctes-to-the-rescue + let deleted = sqlx::query( + r#" +WITH parquet_file_ids as ( + SELECT id + FROM parquet_file + WHERE to_delete < $1 + LIMIT $2 +) +DELETE FROM parquet_file +WHERE id IN (SELECT id FROM parquet_file_ids) +RETURNING id; + "#, + ) + .bind(older_than) // $1 + .bind(MAX_PARQUET_FILES_DELETED_ONCE) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + let deleted = deleted.into_iter().map(|row| row.get("id")).collect(); + Ok(deleted) + } + + async fn level_0(&mut self, shard_id: ShardId) -> Result> { + // this intentionally limits the returned files to 10,000 as it is used to make + // a decision on the highest priority partitions. If compaction has never been + // run this could end up returning millions of results and taking too long to run. + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE parquet_file.shard_id = $1 + AND parquet_file.compaction_level = $2 + AND parquet_file.to_delete IS NULL + LIMIT 1000; + "#, + ) + .bind(shard_id) // $1 + .bind(CompactionLevel::Initial) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn level_1( + &mut self, + table_partition: TablePartition, + min_time: Timestamp, + max_time: Timestamp, + ) -> Result> { + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE parquet_file.shard_id = $1 + AND parquet_file.table_id = $2 + AND parquet_file.partition_id = $3 + AND parquet_file.compaction_level = $4 + AND parquet_file.to_delete IS NULL + AND ((parquet_file.min_time <= $5 AND parquet_file.max_time >= $5) + OR (parquet_file.min_time > $5 AND parquet_file.min_time <= $6)); + "#, + ) + .bind(table_partition.shard_id) // $1 + .bind(table_partition.table_id) // $2 + .bind(table_partition.partition_id) // $3 + .bind(CompactionLevel::FileNonOverlapped) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? 
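+        // The two OR-ed range conditions above select files whose [min_time, max_time]
+        // interval overlaps the requested [$5, $6] window.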
+ .into_iter() + .map(Into::into) + .collect()) + } + + async fn recent_highest_throughput_partitions( + &mut self, + shard_id: Option, + time_in_the_past: Timestamp, + min_num_files: usize, + num_partitions: usize, + ) -> Result> { + let min_num_files = min_num_files as i32; + let num_partitions = num_partitions as i32; + + match shard_id { + Some(shard_id) => { + sqlx::query_as::<_, PartitionParam>( + r#" +SELECT parquet_file.partition_id, parquet_file.table_id, parquet_file.shard_id, + parquet_file.namespace_id, count(parquet_file.id) +FROM parquet_file +LEFT OUTER JOIN skipped_compactions ON parquet_file.partition_id = skipped_compactions.partition_id +WHERE compaction_level = $5 +AND to_delete is null +AND shard_id = $1 +AND created_at > $2 +AND skipped_compactions.partition_id IS NULL +GROUP BY 1, 2, 3, 4 +HAVING count(id) >= $3 +ORDER BY 5 DESC +LIMIT $4; + "#, + ) + .bind(shard_id) // $1 + .bind(time_in_the_past) //$2 + .bind(min_num_files) // $3 + .bind(num_partitions) // $4 + .bind(CompactionLevel::Initial) // $5 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + None => { + sqlx::query_as::<_, PartitionParam>( + r#" +SELECT parquet_file.partition_id, parquet_file.table_id, parquet_file.shard_id, + parquet_file.namespace_id, count(parquet_file.id) +FROM parquet_file +LEFT OUTER JOIN skipped_compactions ON parquet_file.partition_id = skipped_compactions.partition_id +WHERE compaction_level = $4 +AND to_delete is null +AND created_at > $1 +AND skipped_compactions.partition_id IS NULL +GROUP BY 1, 2, 3, 4 +HAVING count(id) >= $2 +ORDER BY 5 DESC +LIMIT $3; + "#, + ) + .bind(time_in_the_past) //$1 + .bind(min_num_files) // $2 + .bind(num_partitions) // $3 + .bind(CompactionLevel::Initial) // $4 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + } + } + + async fn partitions_with_small_l1_file_count( + &mut self, + shard_id: Option, + small_size_threshold_bytes: i64, + min_small_file_count: usize, + num_partitions: usize, + ) -> Result> { + // This query returns partitions with at least `min_small_file_count` small L1 files, + // where "small" means no bigger than `small_size_threshold_bytes`, limited to the top `num_partitions`. 
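+        //
+        // Note: `shard_id` is bound directly rather than branched on as in
+        // `recent_highest_throughput_partitions`; when it is `None`, SQL NULL is bound and
+        // `shard_id = $1` matches no rows.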
+ sqlx::query_as::<_, PartitionParam>( + r#" +SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, + parquet_file.table_id, + COUNT(1) AS l1_file_count +FROM parquet_file +LEFT OUTER JOIN skipped_compactions ON parquet_file.partition_id = skipped_compactions.partition_id +WHERE compaction_level = $5 +AND to_delete IS NULL +AND shard_id = $1 +AND skipped_compactions.partition_id IS NULL +AND file_size_bytes < $3 +GROUP BY 1, 2, 3, 4 +HAVING COUNT(1) >= $2 +ORDER BY l1_file_count DESC +LIMIT $4; + "#, + ) + .bind(shard_id) // $1 + .bind(min_small_file_count as i32) // $2 + .bind(small_size_threshold_bytes) // $3 + .bind(num_partitions as i32) // $4 + .bind(CompactionLevel::FileNonOverlapped) // $5 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + + async fn most_cold_files_partitions( + &mut self, + shard_id: Option, + time_in_the_past: Timestamp, + num_partitions: usize, + ) -> Result> { + let num_partitions = num_partitions as i32; + + // This query returns partitions with most L0+L1 files and all L0 files (both deleted and + // non deleted) are either created before the given time ($2) or not available (removed by + // garbage collector) + match shard_id { + Some(shard_id) => { + sqlx::query_as::<_, PartitionParam>( + r#" +SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, + parquet_file.table_id, + count(case when to_delete is null then 1 end) total_count, + max(case when compaction_level= $4 then parquet_file.created_at end) +FROM parquet_file +LEFT OUTER JOIN skipped_compactions ON parquet_file.partition_id = skipped_compactions.partition_id +WHERE (compaction_level = $4 OR compaction_level = $5) +AND shard_id = $1 +AND skipped_compactions.partition_id IS NULL +GROUP BY 1, 2, 3, 4 +HAVING count(case when to_delete is null then 1 end) > 0 + AND ( max(case when compaction_level= $4 then parquet_file.created_at end) < $2 OR + max(case when compaction_level= $4 then parquet_file.created_at end) is null) +ORDER BY total_count DESC +LIMIT $3; + "#, + ) + .bind(shard_id) // $1 + .bind(time_in_the_past) // $2 + .bind(num_partitions) // $3 + .bind(CompactionLevel::Initial) // $4 + .bind(CompactionLevel::FileNonOverlapped) // $5 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + None => { + sqlx::query_as::<_, PartitionParam>( + r#" +SELECT parquet_file.partition_id, parquet_file.shard_id, parquet_file.namespace_id, + parquet_file.table_id, + count(case when to_delete is null then 1 end) total_count, + max(case when compaction_level= $4 then parquet_file.created_at end) +FROM parquet_file +LEFT OUTER JOIN skipped_compactions ON parquet_file.partition_id = skipped_compactions.partition_id +WHERE (compaction_level = $3 OR compaction_level = $4) +AND skipped_compactions.partition_id IS NULL +GROUP BY 1, 2, 3, 4 +HAVING count(case when to_delete is null then 1 end) > 0 + AND ( max(case when compaction_level= $3 then parquet_file.created_at end) < $1 OR + max(case when compaction_level= $3 then parquet_file.created_at end) is null) +ORDER BY total_count DESC +LIMIT $2; + "#, + ) + .bind(time_in_the_past) // $1 + .bind(num_partitions) // $2 + .bind(CompactionLevel::Initial) // $3 + .bind(CompactionLevel::FileNonOverlapped) // $4 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e }) + } + } + } + + async fn list_by_partition_not_to_delete( + &mut self, + partition_id: PartitionId, + ) -> Result> { + // 
Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + Ok(sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE parquet_file.partition_id = $1 + AND parquet_file.to_delete IS NULL; + "#, + ) + .bind(partition_id) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn update_compaction_level( + &mut self, + parquet_file_ids: &[ParquetFileId], + compaction_level: CompactionLevel, + ) -> Result> { + // If I try to do `.bind(parquet_file_ids)` directly, I get a compile error from sqlx. + // See https://github.com/launchbadge/sqlx/issues/1744 + let ids: Vec<_> = parquet_file_ids.iter().map(|p| p.get()).collect(); + let updated = sqlx::query( + r#" +UPDATE parquet_file +SET compaction_level = $1 +WHERE id IN (SELECT value FROM json_each($2)) +RETURNING id; + "#, + ) + .bind(compaction_level) // $1 + .bind(Json(&ids[..])) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + let updated = updated.into_iter().map(|row| row.get("id")).collect(); + Ok(updated) + } + + async fn exist(&mut self, id: ParquetFileId) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#"SELECT count(1) as count FROM parquet_file WHERE id = $1;"#, + ) + .bind(id) // $1 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count > 0) + } + + async fn count(&mut self) -> Result { + let read_result = + sqlx::query_as::<_, Count>(r#"SELECT count(1) as count FROM parquet_file;"#) + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } + + async fn count_by_overlaps_with_level_0( + &mut self, + table_id: TableId, + shard_id: ShardId, + min_time: Timestamp, + max_time: Timestamp, + sequence_number: SequenceNumber, + ) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#" +SELECT count(1) as count +FROM parquet_file +WHERE table_id = $1 + AND shard_id = $2 + AND max_sequence_number < $3 + AND parquet_file.to_delete IS NULL + AND compaction_level = $6 + AND ((parquet_file.min_time <= $4 AND parquet_file.max_time >= $4) + OR (parquet_file.min_time > $4 AND parquet_file.min_time <= $5)); + "#, + ) + .bind(table_id) // $1 + .bind(shard_id) // $2 + .bind(sequence_number) // $3 + .bind(min_time) // $4 + .bind(max_time) // $5 + .bind(CompactionLevel::Initial) // $6 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } + + async fn count_by_overlaps_with_level_1( + &mut self, + table_id: TableId, + shard_id: ShardId, + min_time: Timestamp, + max_time: Timestamp, + ) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#" +SELECT count(1) as count +FROM parquet_file +WHERE table_id = $1 + AND shard_id = $2 + AND parquet_file.to_delete IS NULL + AND compaction_level = $5 + AND ((parquet_file.min_time <= $3 AND parquet_file.max_time >= $3) + OR (parquet_file.min_time > $3 AND parquet_file.min_time <= $4)); + "#, + ) + .bind(table_id) // $1 + .bind(shard_id) // $2 + .bind(min_time) // $3 + .bind(max_time) // $4 + .bind(CompactionLevel::FileNonOverlapped) // $5 + 
.fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: Uuid, + ) -> Result> { + // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large + // `parquet_metadata` column!! + let rec = sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, shard_id, namespace_id, table_id, partition_id, object_store_id, + max_sequence_number, min_time, max_time, to_delete, file_size_bytes, + row_count, compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE object_store_id = $1; + "#, + ) + .bind(object_store_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let parquet_file = rec.map_err(|e| Error::SqlxError { source: e })?; + + Ok(Some(parquet_file.into())) + } +} + +#[async_trait] +impl ProcessedTombstoneRepo for SqliteTxn { + async fn create( + &mut self, + parquet_file_id: ParquetFileId, + tombstone_id: TombstoneId, + ) -> Result { + sqlx::query_as::<_, ProcessedTombstone>( + r#" +INSERT INTO processed_tombstone ( tombstone_id, parquet_file_id ) +VALUES ( $1, $2 ) +RETURNING *; + "#, + ) + .bind(tombstone_id) // $1 + .bind(parquet_file_id) // $2 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_unique_violation(&e) { + Error::ProcessTombstoneExists { + tombstone_id: tombstone_id.get(), + parquet_file_id: parquet_file_id.get(), + } + } else if is_fk_violation(&e) { + Error::ForeignKeyViolation { source: e } + } else { + Error::SqlxError { source: e } + } + }) + } + + async fn exist( + &mut self, + parquet_file_id: ParquetFileId, + tombstone_id: TombstoneId, + ) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#" +SELECT count(1) as count +FROM processed_tombstone +WHERE parquet_file_id = $1 + AND tombstone_id = $2; + "#, + ) + .bind(parquet_file_id) // $1 + .bind(tombstone_id) // $2 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count > 0) + } + + async fn count(&mut self) -> Result { + let read_result = + sqlx::query_as::<_, Count>(r#"SELECT count(1) as count FROM processed_tombstone;"#) + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } + + async fn count_by_tombstone_id(&mut self, tombstone_id: TombstoneId) -> Result { + let read_result = sqlx::query_as::<_, Count>( + r#"SELECT count(1) as count FROM processed_tombstone WHERE tombstone_id = $1;"#, + ) + .bind(tombstone_id) // $1 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| Error::SqlxError { source: e })?; + + Ok(read_result.count) + } +} + +/// The error code returned by SQLite for a unique constraint violation. +/// +/// See +const SQLITE_UNIQUE_VIOLATION: &str = "2067"; + +/// Error code returned by SQLite for a foreign key constraint violation. +/// See +const SQLITE_FK_VIOLATION: &str = "787"; + +fn is_fk_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == SQLITE_FK_VIOLATION { + return true; + } + } + } + + false +} + +/// Returns true if `e` is a unique constraint violation error. 
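+///
+/// The SQLite driver surfaces extended result codes as strings, so the comparison is against
+/// the textual form of `SQLITE_CONSTRAINT_UNIQUE` (2067); `is_fk_violation` does the same for
+/// `SQLITE_CONSTRAINT_FOREIGNKEY` (787).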
+fn is_unique_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == SQLITE_UNIQUE_VIOLATION { + return true; + } + } + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::create_or_get_default_records; + use assert_matches::assert_matches; + use metric::{Attributes, DurationHistogram, Metric}; + use std::{ops::DerefMut, sync::Arc}; + + fn assert_metric_hit(metrics: &Registry, name: &'static str) { + let histogram = metrics + .get_instrument::>("catalog_op_duration") + .expect("failed to read metric") + .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count > 0, "metric did not record any calls"); + } + + async fn setup_db() -> SqliteCatalog { + let dsn = + std::env::var("TEST_INFLUXDB_SQLITE_DSN").unwrap_or("sqlite::memory:".to_string()); + let options = SqliteConnectionOptions { dsn }; + let metrics = Arc::new(Registry::default()); + let cat = SqliteCatalog::connect(options, metrics) + .await + .expect("failed to connect to catalog"); + cat.setup().await.expect("failed to initialise database"); + cat + } + + #[tokio::test] + async fn test_catalog() { + let sqlite = setup_db().await; + let sqlite: Arc = Arc::new(sqlite); + interface::test_helpers::test_catalog(sqlite).await; + } + + #[tokio::test] + async fn test_tombstone_create_or_get_idempotent() { + let sqlite = setup_db().await; + let sqlite: Arc = Arc::new(sqlite); + + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, shards) = create_or_get_default_records(1, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table", namespace_id) + .await + .expect("create table failed") + .id; + + let shard_id = *shards.keys().next().expect("no shard"); + let sequence_number = SequenceNumber::new(3); + let min_timestamp = Timestamp::new(10); + let max_timestamp = Timestamp::new(100); + let predicate = "bananas"; + + let a = sqlite + .repositories() + .await + .tombstones() + .create_or_get( + table_id, + shard_id, + sequence_number, + min_timestamp, + max_timestamp, + predicate, + ) + .await + .expect("should create OK"); + + // Call create_or_get for the same (table_id, shard_id, + // sequence_number) triplet, setting the same metadata to ensure the + // write is idempotent. 
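+        //
+        // The `ON CONFLICT ... DO UPDATE` clause in `TombstoneRepo::create_or_get` means the
+        // second call returns the existing row instead of inserting a duplicate.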
+ let b = sqlite + .repositories() + .await + .tombstones() + .create_or_get( + table_id, + shard_id, + sequence_number, + min_timestamp, + max_timestamp, + predicate, + ) + .await + .expect("idempotent write should succeed"); + + assert_eq!(a, b); + } + + #[tokio::test] + #[should_panic = "attempted to overwrite predicate"] + async fn test_tombstone_create_or_get_no_overwrite() { + let sqlite = setup_db().await; + let sqlite: Arc = Arc::new(sqlite); + + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, shards) = create_or_get_default_records(1, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns2", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table2", namespace_id) + .await + .expect("create table failed") + .id; + + let shard_id = *shards.keys().next().expect("no shard"); + let sequence_number = SequenceNumber::new(3); + let min_timestamp = Timestamp::new(10); + let max_timestamp = Timestamp::new(100); + + let a = sqlite + .repositories() + .await + .tombstones() + .create_or_get( + table_id, + shard_id, + sequence_number, + min_timestamp, + max_timestamp, + "bananas", + ) + .await + .expect("should create OK"); + + // Call create_or_get for the same (table_id, shard_id, + // sequence_number) triplet with different metadata. + // + // The caller should not falsely believe it has persisted the incorrect + // predicate. + let b = sqlite + .repositories() + .await + .tombstones() + .create_or_get( + table_id, + shard_id, + sequence_number, + min_timestamp, + max_timestamp, + "some other serialized predicate which is different", + ) + .await + .expect("should panic before result evaluated"); + + assert_eq!(a, b); + } + + #[tokio::test] + async fn test_partition_create_or_get_idempotent() { + let sqlite = setup_db().await; + + let sqlite: Arc = Arc::new(sqlite); + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, shards) = create_or_get_default_records(1, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns4", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table", namespace_id) + .await + .expect("create table failed") + .id; + + let key = "bananas"; + let shard_id = *shards.keys().next().expect("no shard"); + + let a = sqlite + .repositories() + .await + .partitions() + .create_or_get(key.into(), shard_id, table_id) + .await + .expect("should create OK"); + + // Call create_or_get for the same (key, table_id, shard_id) + // triplet, setting the same shard ID to ensure the write is + // idempotent. 
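The idempotency asserted by the second `create_or_get` call below (and by the tombstone test above) boils down to the classic SQLite upsert shape: insert, and on a key conflict hand back the existing row unchanged. A hedged, self-contained sketch with a hypothetical table — not the statement used by `sqlite.rs`, which is outside this hunk — relying on `ON CONFLICT ... DO UPDATE ... RETURNING`, which the bundled SQLite supports:

```rust
use sqlx::{Connection, Row, SqliteConnection};

#[tokio::main]
async fn main() -> Result<(), sqlx::Error> {
    let mut conn = SqliteConnection::connect("sqlite::memory:").await?;
    sqlx::query(
        "CREATE TABLE partition_like (key TEXT NOT NULL, table_id INTEGER NOT NULL, \
         shard_id INTEGER NOT NULL, UNIQUE (key, table_id))",
    )
    .execute(&mut conn)
    .await?;

    // On conflict, perform a no-op update so RETURNING yields the existing row.
    let upsert = r#"
INSERT INTO partition_like ( key, table_id, shard_id )
VALUES ( $1, $2, $3 )
ON CONFLICT ( key, table_id )
DO UPDATE SET key = key
RETURNING *;
    "#;

    let a = sqlx::query(upsert)
        .bind("bananas")
        .bind(42_i64)
        .bind(1_i64)
        .fetch_one(&mut conn)
        .await?;
    let b = sqlx::query(upsert)
        .bind("bananas")
        .bind(42_i64)
        .bind(1_i64)
        .fetch_one(&mut conn)
        .await?;

    // The second call inserted nothing; it just fetched what was already there.
    assert_eq!(a.get::<i64, _>("shard_id"), b.get::<i64, _>("shard_id"));
    Ok(())
}
```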
+ let b = sqlite + .repositories() + .await + .partitions() + .create_or_get(key.into(), shard_id, table_id) + .await + .expect("idempotent write should succeed"); + + assert_eq!(a, b); + } + + #[tokio::test] + #[should_panic = "attempted to overwrite partition"] + async fn test_partition_create_or_get_no_overwrite() { + let sqlite = setup_db().await; + + let sqlite: Arc = Arc::new(sqlite); + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, _) = create_or_get_default_records(2, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns3", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table", namespace_id) + .await + .expect("create table failed") + .id; + + let key = "bananas"; + + let shards = sqlite + .repositories() + .await + .shards() + .list() + .await + .expect("failed to list shards"); + assert!( + shards.len() > 1, + "expected more shards to be created, got {}", + shards.len() + ); + + let a = sqlite + .repositories() + .await + .partitions() + .create_or_get(key.into(), shards[0].id, table_id) + .await + .expect("should create OK"); + + // Call create_or_get for the same (key, table_id) tuple, setting a + // different shard ID + let b = sqlite + .repositories() + .await + .partitions() + .create_or_get(key.into(), shards[1].id, table_id) + .await + .expect("result should not be evaluated"); + + assert_eq!(a, b); + } + + macro_rules! test_column_create_or_get_many_unchecked { + ( + $name:ident, + calls = {$([$($col_name:literal => $col_type:expr),+ $(,)?]),+}, + want = $($want:tt)+ + ) => { + paste::paste! { + #[tokio::test] + async fn []() { + let sqlite = setup_db().await; + let metrics = Arc::clone(&sqlite.metrics); + + let sqlite: Arc = Arc::new(sqlite); + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, _shards) = create_or_get_default_records(1, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns4", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table", namespace_id) + .await + .expect("create table failed") + .id; + + $( + let mut insert = HashMap::new(); + $( + insert.insert($col_name, $col_type); + )+ + + let got = sqlite + .repositories() + .await + .columns() + .create_or_get_many_unchecked(table_id, insert.clone()) + .await; + + // The returned columns MUST always match the requested + // column values if successful. + if let Ok(got) = &got { + assert_eq!(insert.len(), got.len()); + + for got in got { + assert_eq!(table_id, got.table_id); + let requested_column_type = insert + .get(got.name.as_str()) + .expect("Should have gotten back a column that was inserted"); + assert_eq!( + *requested_column_type, + ColumnType::try_from(got.column_type) + .expect("invalid column type") + ); + } + + assert_metric_hit(&metrics, "column_create_or_get_many_unchecked"); + } + )+ + + assert_matches!(got, $($want)+); + } + } + } + } + + // Issue a few calls to create_or_get_many that contain distinct columns and + // covers the full set of column types. 
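The invocations that follow get their test names from `paste::paste!` inside the macro above, which splices identifier fragments into one function name. A stripped-down version of the same trick (hypothetical macro, assuming only the `paste` crate the macro already uses):

```rust
macro_rules! make_named_test {
    ($name:ident) => {
        paste::paste! {
            // `[<...>]` concatenates its pieces into a single identifier.
            #[test]
            fn [<test_generated_ $name>]() {
                assert_eq!(2 + 2, 4);
            }
        }
    };
}

// Expands to `fn test_generated_insert()` and `fn test_generated_upsert()`.
make_named_test!(insert);
make_named_test!(upsert);
```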
+ test_column_create_or_get_many_unchecked!( + insert, + calls = { + [ + "test1" => ColumnType::I64, + "test2" => ColumnType::U64, + "test3" => ColumnType::F64, + "test4" => ColumnType::Bool, + "test5" => ColumnType::String, + "test6" => ColumnType::Time, + "test7" => ColumnType::Tag, + ], + [ + "test8" => ColumnType::String, + "test9" => ColumnType::Bool, + ] + }, + want = Ok(_) + ); + + // Issue two calls with overlapping columns - request should succeed (upsert + // semantics). + test_column_create_or_get_many_unchecked!( + partial_upsert, + calls = { + [ + "test1" => ColumnType::I64, + "test2" => ColumnType::U64, + "test3" => ColumnType::F64, + "test4" => ColumnType::Bool, + ], + [ + "test1" => ColumnType::I64, + "test2" => ColumnType::U64, + "test3" => ColumnType::F64, + "test4" => ColumnType::Bool, + "test5" => ColumnType::String, + "test6" => ColumnType::Time, + "test7" => ColumnType::Tag, + "test8" => ColumnType::String, + ] + }, + want = Ok(_) + ); + + // Issue two calls with the same columns and types. + test_column_create_or_get_many_unchecked!( + full_upsert, + calls = { + [ + "test1" => ColumnType::I64, + "test2" => ColumnType::U64, + "test3" => ColumnType::F64, + "test4" => ColumnType::Bool, + ], + [ + "test1" => ColumnType::I64, + "test2" => ColumnType::U64, + "test3" => ColumnType::F64, + "test4" => ColumnType::Bool, + ] + }, + want = Ok(_) + ); + + // Issue two calls with overlapping columns with conflicting types and + // observe a correctly populated ColumnTypeMismatch error. + test_column_create_or_get_many_unchecked!( + partial_type_conflict, + calls = { + [ + "test1" => ColumnType::String, + "test2" => ColumnType::String, + "test3" => ColumnType::String, + "test4" => ColumnType::String, + ], + [ + "test1" => ColumnType::String, + "test2" => ColumnType::Bool, // This one differs + "test3" => ColumnType::String, + // 4 is missing. 
+ "test5" => ColumnType::String, + "test6" => ColumnType::Time, + "test7" => ColumnType::Tag, + "test8" => ColumnType::String, + ] + }, + want = Err(e) => { + assert_matches!(e, Error::ColumnTypeMismatch { name, existing, new } => { + assert_eq!(name, "test2"); + assert_eq!(existing, ColumnType::String); + assert_eq!(new, ColumnType::Bool); + }) + } + ); + + #[tokio::test] + async fn test_billing_summary_on_parqet_file_creation() { + let sqlite = setup_db().await; + let pool = sqlite.pool.clone(); + + let sqlite: Arc = Arc::new(sqlite); + let mut txn = sqlite.start_transaction().await.expect("txn start"); + let (kafka, query, shards) = create_or_get_default_records(1, txn.deref_mut()) + .await + .expect("db init failed"); + txn.commit().await.expect("txn commit"); + + let namespace_id = sqlite + .repositories() + .await + .namespaces() + .create("ns4", None, kafka.id, query.id) + .await + .expect("namespace create failed") + .id; + let table_id = sqlite + .repositories() + .await + .tables() + .create_or_get("table", namespace_id) + .await + .expect("create table failed") + .id; + + let key = "bananas"; + let shard_id = *shards.keys().next().expect("no shard"); + + let partition_id = sqlite + .repositories() + .await + .partitions() + .create_or_get(key.into(), shard_id, table_id) + .await + .expect("should create OK") + .id; + + // parquet file to create- all we care about here is the size, the rest is to satisfy DB + // constraints + let time_provider = Arc::new(SystemProvider::new()); + let time_now = Timestamp::from(time_provider.now()); + let mut p1 = ParquetFileParams { + shard_id, + namespace_id, + table_id, + partition_id, + object_store_id: Uuid::new_v4(), + max_sequence_number: SequenceNumber::new(100), + min_time: Timestamp::new(1), + max_time: Timestamp::new(5), + file_size_bytes: 1337, + row_count: 0, + compaction_level: CompactionLevel::Initial, // level of file of new writes + created_at: time_now, + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: time_now, + }; + let f1 = sqlite + .repositories() + .await + .parquet_files() + .create(p1.clone()) + .await + .expect("create parquet file should succeed"); + // insert the same again with a different size; we should then have 3x1337 as total file size + p1.object_store_id = Uuid::new_v4(); + p1.file_size_bytes *= 2; + let _f2 = sqlite + .repositories() + .await + .parquet_files() + .create(p1.clone()) + .await + .expect("create parquet file should succeed"); + + // after adding two files we should have 3x1337 in the summary + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 3); + + // flag f1 for deletion and assert that the total file size is reduced accordingly. 
+ sqlite + .repositories() + .await + .parquet_files() + .flag_for_delete(f1.id) + .await + .expect("flag parquet file for deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + // we marked the first file of size 1337 for deletion leaving only the second that was 2x that + assert_eq!(total_file_size_bytes, 1337 * 2); + + // actually deleting shouldn't change the total + let now = Timestamp::from(time_provider.now()); + sqlite + .repositories() + .await + .parquet_files() + .delete_old(now) + .await + .expect("parquet file deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 2); + } +} diff --git a/iox_query/src/exec/gapfill.rs b/iox_query/src/exec/gapfill.rs index a192d3822f..38c1af9da4 100644 --- a/iox_query/src/exec/gapfill.rs +++ b/iox_query/src/exec/gapfill.rs @@ -1,6 +1,8 @@ //! This module contains code that implements //! a gap-filling extension to DataFusion +mod algo; + use std::{ fmt::{self, Debug}, ops::{Bound, Range}, @@ -15,11 +17,15 @@ use datafusion::{ logical_expr::{LogicalPlan, UserDefinedLogicalNode}, physical_expr::{create_physical_expr, execution_props::ExecutionProps, PhysicalSortExpr}, physical_plan::{ - expressions::Column, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PhysicalExpr, SendableRecordBatchStream, Statistics, + expressions::Column, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, + SendableRecordBatchStream, Statistics, }, prelude::Expr, }; +use datafusion_util::{watch::WatchedTask, AdapterStream}; +use tokio::sync::mpsc; /// A logical node that represents the gap filling operation. #[derive(Clone, Debug)] @@ -31,17 +37,60 @@ pub struct GapFill { } /// Parameters to the GapFill operation -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub(crate) struct GapFillParams { /// The stride argument from the call to DATE_BIN_GAPFILL pub stride: Expr, /// The source time column pub time_column: Expr, + /// The origin argument from the call to DATE_BIN_GAPFILL + pub origin: Expr, /// The time range of the time column inferred from predicates - /// in overall the query + /// in the overall query pub time_range: Range>, } +impl GapFillParams { + // Extract the expressions so they can be optimized. 
+ fn expressions(&self) -> Vec { + vec![ + self.stride.clone(), + self.time_column.clone(), + self.origin.clone(), + bound_extract(&self.time_range.start) + .unwrap_or_else(|| panic!("lower time bound is required")) + .clone(), + bound_extract(&self.time_range.end) + .unwrap_or_else(|| panic!("upper time bound is required")) + .clone(), + ] + } + + #[allow(clippy::wrong_self_convention)] // follows convention of UserDefinedLogicalNode + fn from_template(&self, exprs: &[Expr]) -> Self { + assert!( + exprs.len() >= 3, + "should be a at least stride, source and origin in params" + ); + let mut iter = exprs.iter().cloned(); + let stride = iter.next().unwrap(); + let time_column = iter.next().unwrap(); + let origin = iter.next().unwrap(); + let time_range = try_map_range(&self.time_range, |b| { + try_map_bound(b.as_ref(), |_| { + Ok(iter.next().expect("expr count should match template")) + }) + }) + .unwrap(); + Self { + stride, + time_column, + origin, + time_range, + } + } +} + impl GapFill { pub(crate) fn try_new( input: Arc, @@ -74,7 +123,8 @@ impl UserDefinedLogicalNode for GapFill { fn expressions(&self) -> Vec { self.group_expr .iter() - .chain(self.aggr_expr.iter()) + .chain(&self.aggr_expr) + .chain(&self.params.expressions()) .cloned() .collect() } @@ -97,14 +147,11 @@ impl UserDefinedLogicalNode for GapFill { inputs: &[LogicalPlan], ) -> Arc { let mut group_expr: Vec<_> = exprs.to_vec(); - let aggr_expr = group_expr.split_off(self.group_expr.len()); - let gapfill = Self::try_new( - Arc::new(inputs[0].clone()), - group_expr, - aggr_expr, - self.params.clone(), - ) - .expect("should not fail"); + let mut aggr_expr = group_expr.split_off(self.group_expr.len()); + let param_expr = aggr_expr.split_off(self.aggr_expr.len()); + let params = self.params.from_template(¶m_expr); + let gapfill = Self::try_new(Arc::new(inputs[0].clone()), group_expr, aggr_expr, params) + .expect("should not fail"); Arc::new(gapfill) } } @@ -162,9 +209,17 @@ pub(crate) fn plan_gap_fill( }) })?; + let origin = create_physical_expr( + &gap_fill.params.origin, + input_dfschema, + input_schema, + execution_props, + )?; + let params = GapFillExecParams { stride, time_column, + origin, time_range, }; GapFillExec::try_new( @@ -175,9 +230,9 @@ pub(crate) fn plan_gap_fill( ) } -fn try_map_range(tr: &Range, f: F) -> Result> +fn try_map_range(tr: &Range, mut f: F) -> Result> where - F: Fn(&T) -> Result, + F: FnMut(&T) -> Result, { Ok(Range { start: f(&tr.start)?, @@ -185,9 +240,9 @@ where }) } -fn try_map_bound(bt: Bound, f: F) -> Result> +fn try_map_bound(bt: Bound, mut f: F) -> Result> where - F: FnOnce(T) -> Result, + F: FnMut(T) -> Result, { Ok(match bt { Bound::Excluded(t) => Bound::Excluded(f(t)?), @@ -196,6 +251,12 @@ where }) } +fn bound_extract(b: &Bound) -> Option<&T> { + match b { + Bound::Included(t) | Bound::Excluded(t) => Some(t), + Bound::Unbounded => None, + } +} /// A physical node for the gap-fill operation. pub struct GapFillExec { input: Arc, @@ -208,6 +269,8 @@ pub struct GapFillExec { sort_expr: Vec, // Parameters (besides streaming data) to gap filling params: GapFillExecParams, + /// Metrics reporting behavior during execution. + metrics: ExecutionPlanMetricsSet, } #[derive(Clone, Debug)] @@ -216,7 +279,10 @@ struct GapFillExecParams { stride: Arc, /// The timestamp column produced by date_bin time_column: Column, - /// The time range of timestamps in the time column + /// The origin argument from the all to DATE_BIN_GAPFILL + origin: Arc, + /// The time range of source input to DATE_BIN_GAPFILL. 
+ /// Inferred from predicates in the overall query. time_range: Range>>, } @@ -242,11 +308,9 @@ impl GapFillExec { .iter() .enumerate() .find(|(_i, e)| { - if let Some(col) = e.as_any().downcast_ref::() { - col.index() == params.time_column.index() - } else { - false - } + e.as_any() + .downcast_ref::() + .map_or(false, |c| c.index() == params.time_column.index()) }) .map(|(i, _)| i); @@ -268,6 +332,7 @@ impl GapFillExec { aggr_expr, sort_expr, params, + metrics: ExecutionPlanMetricsSet::new(), }) } } @@ -333,14 +398,29 @@ impl ExecutionPlan for GapFillExec { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> Result { if self.output_partitioning().partition_count() <= partition { return Err(DataFusionError::Internal(format!( "GapFillExec invalid partition {partition}" ))); } - Err(DataFusionError::NotImplemented("gap filling".to_string())) + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let output_batch_size = context.session_config().batch_size(); + let input_stream = self.input.execute(partition, context)?; + let (tx, rx) = mpsc::channel(1); + let fut = algo::fill_gaps( + output_batch_size, + input_stream, + self.sort_expr.clone(), + self.aggr_expr.clone(), + self.params.clone(), + tx.clone(), + baseline_metrics, + ); + let handle = WatchedTask::new(fut, vec![tx], "gapfill batches"); + Ok(AdapterStream::adapt(self.schema(), rx, handle)) } fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -404,6 +484,36 @@ mod test { logical_plan::table_scan(Some("temps"), &schema, None)?.build() } + #[test] + fn test_from_template() -> Result<()> { + let scan = table_scan()?; + let gapfill = GapFill::try_new( + Arc::new(scan.clone()), + vec![col("loc"), col("time")], + vec![col("temp")], + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: lit_timestamp_nano(0), + time_range: Range { + start: Bound::Included(lit_timestamp_nano(1000)), + end: Bound::Excluded(lit_timestamp_nano(2000)), + }, + }, + )?; + let exprs = gapfill.expressions(); + assert_eq!(8, exprs.len()); + let gapfill_ft = gapfill.from_template(&exprs, &[scan]); + let gapfill_ft = gapfill_ft + .as_any() + .downcast_ref::() + .expect("should be a GapFill"); + assert_eq!(gapfill.group_expr, gapfill_ft.group_expr); + assert_eq!(gapfill.aggr_expr, gapfill_ft.aggr_expr); + assert_eq!(gapfill.params, gapfill_ft.params); + Ok(()) + } + #[test] fn fmt_logical_plan() -> Result<()> { // This test case does not make much sense but @@ -417,6 +527,7 @@ mod test { GapFillParams { stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), time_column: col("time"), + origin: lit_timestamp_nano(0), time_range: Range { start: Bound::Included(lit_timestamp_nano(1000)), end: Bound::Excluded(lit_timestamp_nano(2000)), diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs new file mode 100644 index 0000000000..916e722833 --- /dev/null +++ b/iox_query/src/exec/gapfill/algo.rs @@ -0,0 +1,289 @@ +use std::{ops::Bound, sync::Arc}; + +use arrow::{datatypes::IntervalDayTimeType, record_batch::RecordBatch}; +use chrono::Duration; +use datafusion::{ + error::DataFusionError, + error::Result, + physical_expr::{datetime_expressions::date_bin, PhysicalSortExpr}, + physical_plan::{ + metrics::BaselineMetrics, ColumnarValue, PhysicalExpr, SendableRecordBatchStream, + }, + scalar::ScalarValue, +}; +use tokio::sync::mpsc; +use tokio_stream::StreamExt; + +use super::{try_map_bound, try_map_range, 
GapFillExecParams}; + +/// Fill in the gaps in a stream of records that represent +/// one or more time series. +/// +/// # Arguments +/// +/// * `output_batch_size` +/// * `input_stream` +/// * `_sort_expr` - The incoming records will be sorted by these +/// expressions. They will all be simple column references, +/// with the last one being the timestamp value for each row. +/// The last column will already have been normalized by a previous +/// call to DATE_BIN. +/// * `_aggr_expr` - A set of column expressions that are the aggregate values +/// computed by an upstream Aggregate node. +/// * `params` - The parameters for gap filling, including the stride and the +/// start and end of the time range for this operation. +/// * `_tx` - The transmit end of the channel for output. +/// * `_baseline_metrics` +pub(super) async fn fill_gaps( + _output_batch_size: usize, + mut input_stream: SendableRecordBatchStream, + _sort_expr: Vec, + _aggr_expr: Vec>, + params: GapFillExecParams, + _tx: mpsc::Sender>, + _baseline_metrics: BaselineMetrics, +) -> Result<()> { + while let Some(batch) = input_stream.next().await { + let batch = batch?; + let _params = evaluate_params(&batch, ¶ms); + } + Err(DataFusionError::NotImplemented("gap_filling".to_string())) +} + +#[derive(Debug, PartialEq)] +struct GapFillParams { + #[allow(unused)] + pub stride: i64, + #[allow(unused)] + pub first_ts: i64, + #[allow(unused)] + pub last_ts: i64, +} + +/// Figure out the actual values (as native i64) for the stride, +/// first and last timestamp for gap filling. +fn evaluate_params( + batch: &RecordBatch, + params: &super::GapFillExecParams, +) -> Result { + let stride = params.stride.evaluate(batch)?; + let origin = params.origin.evaluate(batch)?; + + // Evaluate the upper and lower bounds of the time range + let range = try_map_range(¶ms.time_range, |b| { + try_map_bound(b.as_ref(), |pe| { + extract_timestamp_nanos(&pe.evaluate(batch)?) 
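While reading `evaluate_params` (the function this hunk is in the middle of), it helps to see what `date_bin` does to the two range endpoints: for a fixed day-time stride it is plain integer truncation relative to the origin. The hypothetical helper below reproduces the values expected by the tests added later in this file; it is a worked example only, not the code path used at runtime.

```rust
/// Start of the stride-aligned bin containing `ts`, all in nanoseconds.
/// Roughly mirrors what DataFusion's `date_bin` computes for a scalar
/// timestamp and a fixed (day-time) stride.
fn bin_start(ts: i64, stride: i64, origin: i64) -> i64 {
    origin + (ts - origin).div_euclid(stride) * stride
}

fn main() {
    let minute = 60_000_000_000_i64;
    // 1984-01-01T15:55:00Z in nanoseconds since the epoch.
    let ts = 441_820_500_000_000_000_i64;

    // With the epoch as origin the timestamp is already bin-aligned.
    assert_eq!(bin_start(ts, minute, 0), 441_820_500_000_000_000);

    // With an origin 9s past the epoch every bin shifts by 9s, so the same
    // timestamp falls into the bin starting at 15:54:09.
    assert_eq!(bin_start(ts, minute, 9_000_000_000), 441_820_449_000_000_000);
}
```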
+ }) + })?; + + // Find the smallest timestamp that might appear in the + // range + let first_ts = match range.start { + Bound::Included(v) => v, + Bound::Excluded(v) => v + 1, + Bound::Unbounded => { + return Err(DataFusionError::Execution( + "missing lower time bound for gap filling".to_string(), + )) + } + }; + + // Find the largest timestamp that might appear in the + // range + let last_ts = match range.end { + Bound::Included(v) => v, + Bound::Excluded(v) => v - 1, + Bound::Unbounded => { + return Err(DataFusionError::Execution( + "missing upper time bound for gap filling".to_string(), + )) + } + }; + + // Call date_bin on the timestamps to find the first and last time bins + // for each series + let mut args = vec![stride, i64_to_columnar_ts(first_ts), origin]; + let first_ts = extract_timestamp_nanos(&date_bin(&args)?)?; + args[1] = i64_to_columnar_ts(last_ts); + let last_ts = extract_timestamp_nanos(&date_bin(&args)?)?; + + Ok(GapFillParams { + stride: extract_interval_nanos(&args[0])?, + first_ts, + last_ts, + }) +} + +fn i64_to_columnar_ts(i: i64) -> ColumnarValue { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(i), None)) +} + +fn extract_timestamp_nanos(cv: &ColumnarValue) -> Result { + Ok(match cv { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => *v, + _ => { + return Err(DataFusionError::Execution( + "gap filling argument must be a scalar timestamp".to_string(), + )) + } + }) +} + +fn extract_interval_nanos(cv: &ColumnarValue) -> Result { + match cv { + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => { + let (days, ms) = IntervalDayTimeType::to_parts(*v); + let nanos = + (Duration::days(days as i64) + Duration::milliseconds(ms as i64)).num_nanoseconds(); + nanos.ok_or_else(|| { + DataFusionError::Execution("gap filling argument is too large".to_string()) + }) + } + _ => Err(DataFusionError::Execution( + "gap filling expects a stride parameter to be a scalar interval".to_string(), + )), + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{ + array::{ArrayRef, Float64Array, StringArray, TimestampNanosecondArray}, + datatypes::{DataType, Field, Schema, TimeUnit}, + error::Result as ArrowResult, + record_batch::RecordBatch, + }; + use datafusion::{ + datasource::empty::EmptyTable, error::Result, from_slice::FromSlice, sql::TableReference, + }; + + use crate::exec::{gapfill::GapFillExec, Executor, ExecutorType}; + + use super::GapFillParams; + + fn schema() -> Schema { + Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new( + "other_time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("loc", DataType::Utf8, false), + Field::new("temp", DataType::Float64, false), + ]) + } + + fn record_batch() -> ArrowResult { + let columns: Vec = vec![ + Arc::new(TimestampNanosecondArray::from_slice([1000])), + Arc::new(TimestampNanosecondArray::from_slice([2000])), + Arc::new(StringArray::from_slice(["kitchen"])), + Arc::new(Float64Array::from_slice([27.1])), + ]; + RecordBatch::try_new(Arc::new(schema()), columns) + } + + async fn plan_statement_and_get_params(sql: &str) -> Result { + let executor = Executor::new_testing(); + let context = executor.new_context(ExecutorType::Query); + context.inner().register_table( + TableReference::Bare { table: "t" }, + Arc::new(EmptyTable::new(Arc::new(schema()))), + )?; + let physical_plan = context.prepare_sql(sql).await?; + let gapfill_node = &physical_plan.children()[0]; + let 
gapfill_node = gapfill_node.as_any().downcast_ref::().unwrap(); + let exec_params = &gapfill_node.params; + super::evaluate_params(&record_batch()?, exec_params) + } + + #[tokio::test] + async fn test_evaluate_params() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: 441_820_500_000_000_000, // Sunday, January 1, 1984 3:55:00 PM + last_ts: 441_820_800_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_exclude_end() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time < timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: 441_820_500_000_000_000, // Sunday, January 1, 1984 3:55:00 PM + // Last bin at 16:00 is excluded + last_ts: 441_820_740_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_exclude_start() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') minute\ + \nfrom t\ + \nwhere time > timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + // First bin not exluded since it truncates to 15:55:00 + first_ts: 441_820_500_000_000_000, // Sunday, January 1, 1984 3:55:00 PM + last_ts: 441_820_800_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_origin() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + // origin is 9s after the epoch + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:09Z') minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: 441_820_449_000_000_000, // Sunday, January 1, 1984 3:54:09 PM + last_ts: 441_820_749_000_000_000, // Sunday, January 1, 1984 3:59:09 PM + }; + assert_eq!(expected, actual); + Ok(()) + } +} diff --git a/iox_query/src/logical_optimizer/handle_gapfill.rs b/iox_query/src/logical_optimizer/handle_gapfill.rs index dc488d37e8..9d21ce8e34 100644 --- a/iox_query/src/logical_optimizer/handle_gapfill.rs +++ b/iox_query/src/logical_optimizer/handle_gapfill.rs @@ -9,6 +9,7 @@ use datafusion::{ logical_expr::{ expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}, expr_visitor::{ExprVisitable, ExpressionVisitor, 
Recursion}, + utils::expr_to_columns, Aggregate, BuiltinScalarFunction, Extension, LogicalPlan, }, optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule}, @@ -16,6 +17,7 @@ use datafusion::{ }; use query_functions::gapfill::DATE_BIN_GAPFILL_UDF_NAME; use std::{ + collections::HashSet, ops::{Bound, Range}, sync::Arc, }; @@ -112,24 +114,6 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { return Ok(None); }; - if date_bin_gapfill_args.len() != 3 { - return Err(DataFusionError::Plan(format!( - "DATE_BIN_GAPFILL expects 3 arguments, got {}", - date_bin_gapfill_args.len() - ))); - } - - let time_col = match &date_bin_gapfill_args[1] { - Expr::Column(c) => c, - _ => { - return Err(DataFusionError::Plan( - "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), - )) - } - }; - let time_range = range_predicate::find_time_range(input, time_col)?; - validate_time_range(&time_range)?; - let new_aggr_plan = { // Create the aggregate node with the same output schema as the orignal // one. This means that there will be an output column called `date_bin_gapfill(...)` @@ -146,49 +130,105 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { new_aggr_plan }; - let new_gap_fill_plan = { - let mut new_group_expr: Vec<_> = new_aggr_plan - .schema() - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect(); - let aggr_expr = new_group_expr.split_off(group_expr.len()); - let time_column = - col(new_aggr_plan.schema().fields()[date_bin_gapfill_index].qualified_column()); - let stride = date_bin_gapfill_args - .into_iter() - .next() - .expect("there are three args"); - LogicalPlan::Extension(Extension { - node: Arc::new(GapFill::try_new( - Arc::new(new_aggr_plan), - new_group_expr, - aggr_expr, - GapFillParams { - stride, - time_column, - time_range, - }, - )?), - }) - }; + let new_gap_fill_plan = + build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args)?; Ok(Some(new_gap_fill_plan)) } +fn build_gapfill_node( + new_aggr_plan: LogicalPlan, + date_bin_gapfill_index: usize, + date_bin_gapfill_args: Vec, +) -> Result { + if date_bin_gapfill_args.len() != 3 { + return Err(DataFusionError::Plan(format!( + "DATE_BIN_GAPFILL expects 3 arguments, got {}", + date_bin_gapfill_args.len() + ))); + } + + let mut args_iter = date_bin_gapfill_args.into_iter(); + + // Ensure that stride argument is a scalar + let stride = args_iter.next().unwrap(); + validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride)?; + + // Ensure that the source argument is a column + let time_col = args_iter.next().unwrap().try_into_col().map_err(|_| { + DataFusionError::Plan( + "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), + ) + })?; + + // Ensure that a time range was specified and is valid for gap filling + let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col)?; + validate_time_range(&time_range)?; + + // Ensure that origin argument is a scalar + let origin = args_iter.next().unwrap(); + validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", &origin)?; + + // Make sure the time output to the gapfill node matches what the + // aggregate output was. 
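The `validate_scalar_expr` checks invoked above (the function itself is defined a little further down in this hunk) reduce to asking DataFusion's `expr_to_columns` whether the expression mentions any column. In isolation, and assuming the same `datafusion` crate this module already depends on:

```rust
use std::collections::HashSet;

use datafusion::{
    error::Result,
    logical_expr::utils::expr_to_columns,
    prelude::{col, lit},
};

fn main() -> Result<()> {
    // A literal stride/origin references no columns, so it is accepted.
    let mut cols = HashSet::new();
    expr_to_columns(&lit(60_000_i64), &mut cols)?;
    assert!(cols.is_empty());

    // A column-valued origin (e.g. `time2`) leaves column references behind
    // and is rejected with a plan error.
    expr_to_columns(&col("time2"), &mut cols)?;
    assert!(!cols.is_empty());
    Ok(())
}
```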
+ let time_column = + col(new_aggr_plan.schema().fields()[date_bin_gapfill_index].qualified_column()); + + let aggr = Aggregate::try_from_plan(&new_aggr_plan)?; + let mut new_group_expr: Vec<_> = aggr + .schema + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(GapFill::try_new( + Arc::new(new_aggr_plan), + new_group_expr, + aggr_expr, + GapFillParams { + stride, + time_column, + origin, + time_range, + }, + )?), + })) +} + fn validate_time_range(range: &Range>) -> Result<()> { let Range { ref start, ref end } = range; - match (start, end) { - (Bound::Unbounded, Bound::Unbounded) => Err(DataFusionError::Plan( - "no time bounds found for gap fill query".to_string(), - )), + let (start, end) = match (start, end) { + (Bound::Unbounded, Bound::Unbounded) => { + return Err(DataFusionError::Plan( + "no time bounds found for gap fill query".to_string(), + )) + } (Bound::Unbounded, _) => Err(DataFusionError::Plan( "no lower time bound found for gap fill query".to_string(), )), (_, Bound::Unbounded) => Err(DataFusionError::Plan( "no upper time bound found for gap fill query".to_string(), )), - _ => Ok(()), + ( + Bound::Included(start) | Bound::Excluded(start), + Bound::Included(end) | Bound::Excluded(end), + ) => Ok((start, end)), + }?; + validate_scalar_expr("lower time bound", start)?; + validate_scalar_expr("upper time bound", end) +} + +fn validate_scalar_expr(what: &str, e: &Expr) -> Result<()> { + let mut cols = HashSet::new(); + expr_to_columns(e, &mut cols)?; + if !cols.is_empty() { + Err(DataFusionError::Plan(format!( + "{what} for gap fill query must evaluate to a scalar" + ))) + } else { + Ok(()) } } @@ -323,7 +363,7 @@ mod test { use datafusion::logical_expr::{logical_plan, LogicalPlan, LogicalPlanBuilder}; use datafusion::optimizer::optimizer::Optimizer; use datafusion::optimizer::OptimizerContext; - use datafusion::prelude::{avg, col, lit, lit_timestamp_nano, Expr}; + use datafusion::prelude::{avg, case, col, lit, lit_timestamp_nano, Expr}; use datafusion::scalar::ScalarValue; use query_functions::gapfill::DATE_BIN_GAPFILL_UDF_NAME; @@ -334,6 +374,11 @@ mod test { DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), + Field::new( + "time2", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), Field::new("loc", DataType::Utf8, false), Field::new("temp", DataType::Float64, false), ]); @@ -341,9 +386,13 @@ mod test { } fn date_bin_gapfill(interval: Expr, time: Expr) -> Result { + date_bin_gapfill_with_origin(interval, time, lit_timestamp_nano(0)) + } + + fn date_bin_gapfill_with_origin(interval: Expr, time: Expr, origin: Expr) -> Result { Ok(Expr::ScalarUDF { fun: query_functions::registry().udf(DATE_BIN_GAPFILL_UDF_NAME)?, - args: vec![interval, time, lit_timestamp_nano(0)], + args: vec![interval, time, origin], }) } @@ -417,7 +466,59 @@ mod test { } #[test] - fn no_time_range_err() -> Result<()> { + fn nonscalar_origin() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamp_nano(1000)) + .and(col("time").lt(lit_timestamp_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill_with_origin( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + col("time2"), + )?], + vec![avg(col("temp"))], + )? 
+ .build()?; + assert_optimizer_err( + &plan, + "Error during planning: origin argument to DATE_BIN_GAPFILL for gap fill query must evaluate to a scalar", + ); + Ok(()) + } + + #[test] + fn nonscalar_stride() -> Result<()> { + let stride = case(col("loc")) + .when( + lit("kitchen"), + lit(ScalarValue::IntervalDayTime(Some(60_000))), + ) + .otherwise(lit(ScalarValue::IntervalDayTime(Some(30_000)))) + .unwrap(); + + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamp_nano(1000)) + .and(col("time").lt(lit_timestamp_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill(stride, col("time"))?], + vec![avg(col("temp"))], + )? + .build()?; + assert_optimizer_err( + &plan, + "Error during planning: stride argument to DATE_BIN_GAPFILL for gap fill query must evaluate to a scalar", + ); + Ok(()) + } + + #[test] + fn time_range_errs() -> Result<()> { let cases = vec![ ( lit(true), @@ -431,6 +532,16 @@ mod test { col("time").lt(lit_timestamp_nano(2000)), "Error during planning: no lower time bound found for gap fill query", ), + ( + col("time").gt_eq(col("time2")).and( + col("time").lt(lit_timestamp_nano(2000))), + "Error during planning: lower time bound for gap fill query must evaluate to a scalar", + ), + ( + col("time").gt_eq(lit_timestamp_nano(2000)).and( + col("time").lt(col("time2"))), + "Error during planning: upper time bound for gap fill query must evaluate to a scalar", + ) ]; for c in cases { let plan = LogicalPlanBuilder::from(table_scan()?) diff --git a/test_helpers_end_to_end/src/snapshot_comparison.rs b/test_helpers_end_to_end/src/snapshot_comparison.rs index e2bf769db8..118b33db54 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison.rs @@ -3,7 +3,6 @@ mod queries; use crate::snapshot_comparison::queries::TestQueries; use crate::{run_influxql, run_sql, MiniCluster}; -use arrow_util::{display::pretty_format_batches, test_util::sort_record_batch}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Display, Formatter}; use std::{ @@ -11,7 +10,6 @@ use std::{ path::{Path, PathBuf}, }; -use self::normalization::normalize_results; use self::queries::Query; #[derive(Debug, Snafu)] @@ -98,19 +96,7 @@ pub async fn run( for q in queries.iter() { output.push(format!("-- {}: {}", language, q.text())); - if q.sorted_compare() { - output.push("-- Results After Sorting".into()) - } - if q.normalized_uuids() { - output.push("-- Results After Normalizing UUIDs".into()) - } - if q.normalized_metrics() { - output.push("-- Results After Normalizing Metrics".into()) - } - if q.normalized_filters() { - output.push("-- Results After Normalizing Filters".into()) - } - + q.add_description(&mut output); let results = run_query(cluster, q, language).await?; output.extend(results); } @@ -233,7 +219,7 @@ async fn run_query( ) -> Result> { let query_text = query.text(); - let mut results = match language { + let results = match language { Language::Sql => { run_sql( query_text, @@ -252,22 +238,5 @@ async fn run_query( } }; - // compare against sorted results, if requested - if query.sorted_compare() && !results.is_empty() { - let schema = results[0].schema(); - let batch = - arrow::compute::concat_batches(&schema, &results).expect("concatenating batches"); - results = vec![sort_record_batch(batch)]; - } - - let current_results = pretty_format_batches(&results) - .unwrap() - .trim() - .lines() - .map(|s| s.to_string()) - .collect::>(); - - let current_results = normalize_results(query, 
current_results); - - Ok(current_results) + Ok(query.normalize_results(results)) } diff --git a/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs b/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs index 39ac696c58..8a01d777ab 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs @@ -1,9 +1,28 @@ -use crate::snapshot_comparison::queries::Query; +use arrow::record_batch::RecordBatch; +use arrow_util::{display::pretty_format_batches, test_util::sort_record_batch}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; use std::{borrow::Cow, collections::HashMap}; use uuid::Uuid; +/// Match the parquet UUID +/// +/// For example, given +/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet` +/// +/// matches `1d325760-2b20-48de-ab48-2267b034133d` +static REGEX_UUID: Lazy = Lazy::new(|| { + Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex") +}); + +/// Match the parquet directory names +/// For example, given +/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet` +/// +/// matches `32/51/216/13452` +static REGEX_DIRS: Lazy = + Lazy::new(|| Regex::new(r#"[0-9]+/[0-9]+/[0-9]+/[0-9]+"#).expect("directory regex")); + /// Replace table row separators of flexible width with fixed with. This is required /// because the original timing values may differ in "printed width", so the table /// cells have different widths and hence the separators / borders. E.g.: @@ -22,93 +41,159 @@ static REGEX_LINESEP: Lazy = Lazy::new(|| Regex::new(r#"[+-]{6,}"#).expec /// ` |` -> ` |` static REGEX_COL: Lazy = Lazy::new(|| Regex::new(r#"\s+\|"#).expect("col regex")); +/// Matches line like `metrics=[foo=1, bar=2]` +static REGEX_METRICS: Lazy = + Lazy::new(|| Regex::new(r#"metrics=\[([^\]]*)\]"#).expect("metrics regex")); + +/// Matches things like `1s`, `1.2ms` and `10.2μs` +static REGEX_TIMING: Lazy = + Lazy::new(|| Regex::new(r#"[0-9]+(\.[0-9]+)?.s"#).expect("timing regex")); + +/// Matches things like `FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000` +static REGEX_FILTER: Lazy = + Lazy::new(|| Regex::new("FilterExec: .*").expect("filter regex")); + fn normalize_for_variable_width(s: Cow) -> String { let s = REGEX_LINESEP.replace_all(&s, "----------"); REGEX_COL.replace_all(&s, " |").to_string() } -pub(crate) fn normalize_results(query: &Query, mut current_results: Vec) -> Vec { - // normalize UUIDs, if requested - if query.normalized_uuids() { - let regex_uuid = Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}") - .expect("UUID regex"); - let regex_dirs = Regex::new(r#"[0-9]+/[0-9]+/[0-9]+/[0-9]+"#).expect("directory regex"); +/// A query to run with optional annotations +#[derive(Debug, PartialEq, Eq, Default)] +pub struct Normalizer { + /// If true, results are sorted first + pub sorted_compare: bool, - let mut seen: HashMap = HashMap::new(); - current_results = current_results - .into_iter() - .map(|s| { - let s = regex_uuid.replace_all(&s, |s: &Captures| { - let next = seen.len() as u128; - Uuid::from_u128( - *seen - .entry(s.get(0).unwrap().as_str().to_owned()) - .or_insert(next), - ) - .to_string() - }); + /// If true, replace UUIDs with static placeholders. 
+ pub normalized_uuids: bool, - let s = normalize_for_variable_width(s); + /// If true, normalize timings in queries by replacing them with + /// static placeholders, for example: + /// + /// `1s` -> `1.234ms` + pub normalized_metrics: bool, - regex_dirs.replace_all(&s, "1/1/1/1").to_string() - }) - .collect(); - } - - // normalize metrics, if requested - if query.normalized_metrics() { - // Parse regex once and apply to all rows. See description around the `replace...` calls on - // why/how the regexes are used. - let regex_metrics = Regex::new(r#"metrics=\[([^\]]*)\]"#).expect("metrics regex"); - let regex_timing = Regex::new(r#"[0-9]+(\.[0-9]+)?.s"#).expect("timing regex"); - - current_results = current_results - .into_iter() - .map(|s| { - // Replace timings with fixed value, e.g.: - // - // `1s` -> `1.234ms` - // `1.2ms` -> `1.234ms` - // `10.2μs` -> `1.234ms` - let s = regex_timing.replace_all(&s, "1.234ms"); - - let s = normalize_for_variable_width(s); - - // Metrics are currently ordered by value (not by key), so different timings may - // reorder them. We "parse" the list and normalize the sorting. E.g.: - // - // `metrics=[]` => `metrics=[]` - // `metrics=[foo=1, bar=2]` => `metrics=[bar=2, foo=1]` - // `metrics=[foo=2, bar=1]` => `metrics=[bar=1, foo=2]` - regex_metrics - .replace_all(&s, |c: &Captures| { - let mut metrics: Vec<_> = c[1].split(", ").collect(); - metrics.sort(); - format!("metrics=[{}]", metrics.join(", ")) - }) - .to_string() - }) - .collect(); - } - - // normalize Filters, if requested - // - // Converts: - // FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000 - // - // to - // FilterExec: - if query.normalized_filters() { - let filter_regex = Regex::new("FilterExec: .*").expect("filter regex"); - current_results = current_results - .into_iter() - .map(|s| { - filter_regex - .replace_all(&s, |_: &Captures| "FilterExec: ") - .to_string() - }) - .collect(); - } - - current_results + /// if true, normalize filter predicates for explain plans + /// `FilterExec: ` + pub normalized_filters: bool, +} + +impl Normalizer { + #[cfg(test)] + pub fn new() -> Self { + Default::default() + } + + /// Take the output of running the query and apply the specified normalizations to them + pub fn normalize_results(&self, mut results: Vec) -> Vec { + // compare against sorted results, if requested + if self.sorted_compare && !results.is_empty() { + let schema = results[0].schema(); + let batch = + arrow::compute::concat_batches(&schema, &results).expect("concatenating batches"); + results = vec![sort_record_batch(batch)]; + } + + let mut current_results = pretty_format_batches(&results) + .unwrap() + .trim() + .lines() + .map(|s| s.to_string()) + .collect::>(); + + // normalize UUIDs, if requested + if self.normalized_uuids { + let mut seen: HashMap = HashMap::new(); + current_results = current_results + .into_iter() + .map(|s| { + // Rewrite parquet directory names like + // `51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet` + // + // to: + // 1/1/1/1/00000000-0000-0000-0000-000000000000.parquet + + let s = REGEX_UUID.replace_all(&s, |s: &Captures| { + let next = seen.len() as u128; + Uuid::from_u128( + *seen + .entry(s.get(0).unwrap().as_str().to_owned()) + .or_insert(next), + ) + .to_string() + }); + + let s = normalize_for_variable_width(s); + REGEX_DIRS.replace_all(&s, "1/1/1/1").to_string() + }) + .collect(); + } + + // normalize metrics, if requested + if self.normalized_metrics { + current_results = current_results + .into_iter() + .map(|s| { 
+ // Replace timings with fixed value, e.g.: + // + // `1s` -> `1.234ms` + // `1.2ms` -> `1.234ms` + // `10.2μs` -> `1.234ms` + let s = REGEX_TIMING.replace_all(&s, "1.234ms"); + + let s = normalize_for_variable_width(s); + + // Metrics are currently ordered by value (not by key), so different timings may + // reorder them. We "parse" the list and normalize the sorting. E.g.: + // + // `metrics=[]` => `metrics=[]` + // `metrics=[foo=1, bar=2]` => `metrics=[bar=2, foo=1]` + // `metrics=[foo=2, bar=1]` => `metrics=[bar=1, foo=2]` + REGEX_METRICS + .replace_all(&s, |c: &Captures| { + let mut metrics: Vec<_> = c[1].split(", ").collect(); + metrics.sort(); + format!("metrics=[{}]", metrics.join(", ")) + }) + .to_string() + }) + .collect(); + } + + // normalize Filters, if requested + // + // Converts: + // FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000 + // + // to + // FilterExec: + if self.normalized_filters { + current_results = current_results + .into_iter() + .map(|s| { + REGEX_FILTER + .replace_all(&s, |_: &Captures| "FilterExec: ") + .to_string() + }) + .collect(); + } + + current_results + } + + /// Adds information on what normalizations were applied to the input + pub fn add_description(&self, output: &mut Vec) { + if self.sorted_compare { + output.push("-- Results After Sorting".into()) + } + if self.normalized_uuids { + output.push("-- Results After Normalizing UUIDs".into()) + } + if self.normalized_metrics { + output.push("-- Results After Normalizing Metrics".into()) + } + if self.normalized_filters { + output.push("-- Results After Normalizing Filters".into()) + } + } } diff --git a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs index 3c76195aba..70a5f8da4c 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs @@ -1,22 +1,12 @@ +use arrow::record_batch::RecordBatch; + +use super::normalization::Normalizer; + /// A query to run with optional annotations #[derive(Debug, PartialEq, Eq, Default)] pub struct Query { - /// If true, results are sorted first prior to comparison, meaning that differences in the - /// output order compared with expected order do not cause a diff - sorted_compare: bool, - - /// If true, replace UUIDs with static placeholders. - normalized_uuids: bool, - - /// If true, normalize timings in queries by replacing them with - /// static placeholders, for example: - /// - /// `1s` -> `1.234ms` - normalized_metrics: bool, - - /// if true, normalize filter predicates for explain plans - /// `FilterExec: ` - normalized_filters: bool, + /// Describes how query text should be normalized + normalizer: Normalizer, /// The query string text: String, @@ -27,49 +17,49 @@ impl Query { fn new(text: impl Into) -> Self { let text = text.into(); Self { - sorted_compare: false, - normalized_uuids: false, - normalized_metrics: false, - normalized_filters: false, + normalizer: Normalizer::new(), text, } } - #[cfg(test)] - fn with_sorted_compare(mut self) -> Self { - self.sorted_compare = true; + pub fn text(&self) -> &str { + &self.text + } + + pub fn with_sorted_compare(mut self) -> Self { + self.normalizer.sorted_compare = true; self } - /// Get a reference to the query text. - pub fn text(&self) -> &str { - self.text.as_ref() + pub fn with_normalized_uuids(mut self) -> Self { + self.normalizer.normalized_uuids = true; + self } - /// Get the query's sorted compare. 
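Back in `normalization.rs`: the consolidated `REGEX_TIMING`/`REGEX_METRICS` rewrites above can be sanity-checked in isolation with nothing but the `regex` crate. The input line below is hypothetical, but the patterns are the same:

```rust
use regex::{Captures, Regex};

fn main() {
    // Same patterns as REGEX_TIMING and REGEX_METRICS above.
    let timing = Regex::new(r#"[0-9]+(\.[0-9]+)?.s"#).unwrap();
    let metrics = Regex::new(r#"metrics=\[([^\]]*)\]"#).unwrap();

    let line = "operator metrics=[foo=2, bar=1] elapsed_compute=10.2μs";

    // Timings collapse to a fixed placeholder...
    let line = timing.replace_all(line, "1.234ms");

    // ...and metrics are re-sorted by key so value-dependent ordering cannot
    // cause spurious snapshot diffs.
    let line = metrics.replace_all(&line, |c: &Captures| {
        let mut parts: Vec<_> = c[1].split(", ").collect();
        parts.sort();
        format!("metrics=[{}]", parts.join(", "))
    });

    assert_eq!(line, "operator metrics=[bar=1, foo=2] elapsed_compute=1.234ms");
}
```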
- pub fn sorted_compare(&self) -> bool { - self.sorted_compare + pub fn with_normalize_metrics(mut self) -> Self { + self.normalizer.normalized_metrics = true; + self } - /// Get queries normalized UUID - pub fn normalized_uuids(&self) -> bool { - self.normalized_uuids + pub fn with_normalize_filters(mut self) -> Self { + self.normalizer.normalized_filters = true; + self } - /// Use normalized timing values - pub fn normalized_metrics(&self) -> bool { - self.normalized_metrics + /// Take the output of running the query and apply the specified normalizations to them + pub fn normalize_results(&self, results: Vec) -> Vec { + self.normalizer.normalize_results(results) } - /// Use normalized filter plans - pub fn normalized_filters(&self) -> bool { - self.normalized_filters + /// Adds information on what normalizations were applied to the input + pub fn add_description(&self, output: &mut Vec) { + self.normalizer.add_description(output) } } #[derive(Debug, Default)] struct QueryBuilder { - query: Query, + pub query: Query, } impl QueryBuilder { @@ -85,22 +75,6 @@ impl QueryBuilder { self.query.text.push(c) } - fn sorted_compare(&mut self) { - self.query.sorted_compare = true; - } - - fn normalized_uuids(&mut self) { - self.query.normalized_uuids = true; - } - - fn normalize_metrics(&mut self) { - self.query.normalized_metrics = true; - } - - fn normalize_filters(&mut self) { - self.query.normalized_filters = true; - } - fn is_empty(&self) -> bool { self.query.text.is_empty() } @@ -125,54 +99,57 @@ impl TestQueries { S: AsRef, { let mut queries = vec![]; - let mut builder = QueryBuilder::new(); - lines.into_iter().for_each(|line| { - let line = line.as_ref().trim(); - const COMPARE_STR: &str = "-- IOX_COMPARE: "; - if line.starts_with(COMPARE_STR) { - let (_, options) = line.split_at(COMPARE_STR.len()); - for option in options.split(',') { - let option = option.trim(); - match option { - "sorted" => { - builder.sorted_compare(); + let mut builder = lines + .into_iter() + .fold(QueryBuilder::new(), |mut builder, line| { + let line = line.as_ref().trim(); + const COMPARE_STR: &str = "-- IOX_COMPARE: "; + if line.starts_with(COMPARE_STR) { + let (_, options) = line.split_at(COMPARE_STR.len()); + for option in options.split(',') { + let option = option.trim(); + match option { + "sorted" => { + builder.query = builder.query.with_sorted_compare(); + } + "uuid" => { + builder.query = builder.query.with_normalized_uuids(); + } + "metrics" => { + builder.query = builder.query.with_normalize_metrics(); + } + "filters" => { + builder.query = builder.query.with_normalize_filters(); + } + _ => {} } - "uuid" => { - builder.normalized_uuids(); - } - "metrics" => { - builder.normalize_metrics(); - } - "filters" => { - builder.normalize_filters(); - } - _ => {} } } - } - if line.starts_with("--") { - return; - } - if line.is_empty() { - return; - } - - // replace newlines - if !builder.is_empty() { - builder.push(' '); - } - builder.push_str(line); - - // declare queries when we see a semicolon at the end of the line - if line.ends_with(';') { - if let Some(q) = builder.build_and_reset() { - queries.push(q); + if line.starts_with("--") { + return builder; + } + if line.is_empty() { + return builder; } - } - }); + // replace newlines + if !builder.is_empty() { + builder.push(' '); + } + builder.push_str(line); + + // declare queries when we see a semicolon at the end of the line + if line.ends_with(';') { + if let Some(q) = builder.build_and_reset() { + queries.push(q); + } + } + builder + }); + + // get last 
one, if any if let Some(q) = builder.build_and_reset() { queries.push(q); } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index ad6f645019..20c32270f1 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -37,6 +37,7 @@ flatbuffers = { version = "23", features = ["std"] } flate2 = { version = "1", features = ["miniz_oxide", "rust_backend"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } futures-core = { version = "0.3", features = ["alloc", "std"] } +futures-executor = { version = "0.3", features = ["std"] } futures-io = { version = "0.3", features = ["std"] } futures-sink = { version = "0.3", features = ["alloc", "std"] } futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } @@ -74,8 +75,8 @@ serde_json = { version = "1", features = ["raw_value", "std"] } sha2 = { version = "0.10", features = ["std"] } similar = { version = "2", features = ["inline", "text"] } smallvec = { version = "1", default-features = false, features = ["union"] } -sqlx = { version = "0.6", features = ["_rt-tokio", "json", "macros", "migrate", "postgres", "runtime-tokio-rustls", "sqlx-macros", "tls", "uuid"] } -sqlx-core = { version = "0.6", default-features = false, features = ["_rt-tokio", "_tls-rustls", "any", "base64", "crc", "dirs", "hkdf", "hmac", "json", "md-5", "migrate", "postgres", "rand", "runtime-tokio-rustls", "rustls", "rustls-pemfile", "serde", "serde_json", "sha1", "sha2", "tokio-stream", "uuid", "webpki-roots", "whoami"] } +sqlx = { version = "0.6", features = ["_rt-tokio", "json", "macros", "migrate", "postgres", "runtime-tokio-rustls", "sqlite", "sqlx-macros", "tls", "uuid"] } +sqlx-core = { version = "0.6", default-features = false, features = ["_rt-tokio", "_tls-rustls", "any", "base64", "crc", "dirs", "flume", "futures-executor", "hkdf", "hmac", "json", "libsqlite3-sys", "md-5", "migrate", "postgres", "rand", "runtime-tokio-rustls", "rustls", "rustls-pemfile", "serde", "serde_json", "sha1", "sha2", "sqlite", "tokio-stream", "uuid", "webpki-roots", "whoami"] } thrift = { version = "0.17", features = ["log", "server", "threadpool"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "parking_lot", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net", "time"] } @@ -107,6 +108,7 @@ either = { version = "1", features = ["use_std"] } fixedbitset = { version = "0.4", features = ["std"] } futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } futures-core = { version = "0.3", features = ["alloc", "std"] } +futures-executor = { version = "0.3", features = ["std"] } futures-io = { version = "0.3", features = ["std"] } futures-sink = { version = "0.3", features = ["alloc", "std"] } futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } @@ -137,8 +139,8 @@ serde = { version = "1", features = ["derive", "rc", "serde_derive", "std"] } serde_json = { version = "1", features = ["raw_value", "std"] } sha2 = { version = "0.10", features = ["std"] } smallvec = { version = "1", default-features = false, features = ["union"] } -sqlx-core = { version = "0.6", default-features = false, features = ["_rt-tokio", "_tls-rustls", "any", "base64", "crc", "dirs", "hkdf", "hmac", "json", "md-5", "migrate", "postgres", "rand", 
"runtime-tokio-rustls", "rustls", "rustls-pemfile", "serde", "serde_json", "sha1", "sha2", "tokio-stream", "uuid", "webpki-roots", "whoami"] } -sqlx-macros = { version = "0.6", default-features = false, features = ["_rt-tokio", "json", "migrate", "postgres", "runtime-tokio-rustls", "serde_json", "sha2", "uuid"] } +sqlx-core = { version = "0.6", default-features = false, features = ["_rt-tokio", "_tls-rustls", "any", "base64", "crc", "dirs", "flume", "futures-executor", "hkdf", "hmac", "json", "libsqlite3-sys", "md-5", "migrate", "postgres", "rand", "runtime-tokio-rustls", "rustls", "rustls-pemfile", "serde", "serde_json", "sha1", "sha2", "sqlite", "tokio-stream", "uuid", "webpki-roots", "whoami"] } +sqlx-macros = { version = "0.6", default-features = false, features = ["_rt-tokio", "json", "migrate", "postgres", "runtime-tokio-rustls", "serde_json", "sha2", "sqlite", "uuid"] } syn = { version = "1", features = ["clone-impls", "derive", "extra-traits", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "parking_lot", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net", "time"] }