diff --git a/Cargo.lock b/Cargo.lock index 8474d61947..ec6ccbe445 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,9 +769,9 @@ dependencies = [ [[package]] name = "crypto-mac" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" +checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" dependencies = [ "generic-array", "subtle", @@ -826,6 +826,7 @@ dependencies = [ "influxdb_line_protocol", "num_cpus", "observability_deps", + "once_cell", "percent-encoding", "regex", "serde", @@ -843,7 +844,7 @@ dependencies = [ [[package]] name = "datafusion" version = "4.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3" dependencies = [ "ahash 0.7.4", "arrow", @@ -4330,9 +4331,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" +checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0" dependencies = [ "tinyvec_macros", ] @@ -4345,9 +4346,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985" +checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207" dependencies = [ "autocfg", "bytes", @@ -4984,9 +4985,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" [[package]] name = "zeroize" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18" +checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd" [[package]] name = "zstd" diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index 197174f14e..fd145052f9 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -15,6 +15,7 @@ regex = "1.4" serde = { version = "1.0", features = ["rc", "derive"] } snafu = "0.6" observability_deps = { path = "../observability_deps" } +once_cell = { version = "1.4.0", features = ["parking_lot"] } [dev-dependencies] # In alphabetical order test_helpers = { path = "../test_helpers" } diff --git a/data_types/src/database_rules.rs b/data_types/src/database_rules.rs index 6cff31a510..86a71be778 100644 --- a/data_types/src/database_rules.rs +++ b/data_types/src/database_rules.rs @@ -166,6 +166,10 @@ pub struct LifecycleRules { /// Maximum number of rows to buffer in a MUB chunk before compacting it pub mub_row_threshold: NonZeroUsize, + + /// Use up to this amount of space in bytes for caching Parquet files. None + /// will disable Parquet file caching. 
+ pub parquet_cache_limit: Option<NonZeroU64>, } impl LifecycleRules { @@ -195,6 +199,7 @@ impl Default for LifecycleRules { persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS) .unwrap(), mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(), + parquet_cache_limit: None, } } } diff --git a/data_types/src/instant.rs b/data_types/src/instant.rs new file mode 100644 index 0000000000..807bcbba49 --- /dev/null +++ b/data_types/src/instant.rs @@ -0,0 +1,53 @@ +use chrono::{DateTime, Utc}; +use once_cell::sync::OnceCell; +use std::time::Instant; + +/// Stores an Instant and DateTime captured as close as possible together +static INSTANCE: OnceCell<(DateTime<Utc>, Instant)> = OnceCell::new(); + +/// Provides a conversion from Instant to DateTime for display purposes +/// +/// It is an approximation: if the system clock changes, the returned DateTime will not be +/// the same as the DateTime that would have been recorded at the time the Instant was created. +/// +/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger +/// Instant will have a larger returned DateTime. +/// +/// This should ONLY be used for display purposes; the results should not be used to +/// drive logic, nor persisted +pub fn to_approximate_datetime(instant: Instant) -> DateTime<Utc> { + let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now())); + + if ref_instant > instant { + ref_date + - chrono::Duration::from_std(ref_instant.duration_since(instant)) + .expect("date overflow") + } else { + ref_date + + chrono::Duration::from_std(instant.duration_since(ref_instant)) + .expect("date overflow") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_to_datetime() { + // Seed global state + to_approximate_datetime(Instant::now()); + + let (ref_date, ref_instant) = *INSTANCE.get().unwrap(); + + assert_eq!( + to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)), + ref_date + chrono::Duration::nanoseconds(78) + ); + + assert_eq!( + to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)), + ref_date - chrono::Duration::nanoseconds(23) + ); + } +} diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index bea9629bc3..f222aad0ff 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -13,12 +13,14 @@ pub mod chunk_metadata; pub mod consistent_hasher; mod database_name; -pub use database_name::*; pub mod database_rules; pub mod database_state; pub mod error; +pub mod instant; pub mod job; pub mod names; pub mod partition_metadata; pub mod server_id; pub mod timestamp; +pub mod write_summary; +pub use database_name::*; diff --git a/data_types/src/write_summary.rs b/data_types/src/write_summary.rs new file mode 100644 index 0000000000..9574910262 --- /dev/null +++ b/data_types/src/write_summary.rs @@ -0,0 +1,20 @@ +use chrono::{DateTime, Utc}; + +/// A description of a set of writes +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct WriteSummary { + /// The wall clock timestamp of the first write in this summary + pub time_of_first_write: DateTime<Utc>, + + /// The wall clock timestamp of the last write in this summary + pub time_of_last_write: DateTime<Utc>, + + /// The minimum row timestamp for data in this summary + pub min_timestamp: DateTime<Utc>, + + /// The maximum row timestamp value for data in this summary + pub max_timestamp: DateTime<Utc>, + + /// The number of rows in this summary + pub row_count: usize, +}
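The new `instant` and `write_summary` modules are intended to be used together: an `Instant` captured when data was received is converted to an approximate wall clock time purely for display. A minimal usage sketch, assuming only the items added above; the enclosing helper and its parameters are hypothetical and for illustration:

use std::time::Instant;

use chrono::{DateTime, Utc};
use data_types::{instant::to_approximate_datetime, write_summary::WriteSummary};

// Hypothetical helper: describe a batch of rows whose first/last writes were
// recorded as `Instant`s (for example by a persistence window), for display only.
fn summarize_for_display(
    first_write: Instant,
    last_write: Instant,
    min_timestamp: DateTime<Utc>,
    max_timestamp: DateTime<Utc>,
    row_count: usize,
) -> WriteSummary {
    WriteSummary {
        // Approximate wall clock times; monotonic with respect to the Instants,
        // but not suitable for driving logic or for persistence
        time_of_first_write: to_approximate_datetime(first_write),
        time_of_last_write: to_approximate_datetime(last_write),
        min_timestamp,
        max_timestamp,
        row_count,
    }
}

Because the conversion pins a single (DateTime, Instant) reference pair, summaries produced this way stay ordered by their Instants even if the system clock is adjusted.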
diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 64a9e97e69..f969251e6b 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version" # Rename to workaround doctest bug # Turn off optional datafusion features (function packages) -upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" } +upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" } diff --git a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto index 530c95bca6..b1ad761dbe 100644 --- a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto +++ b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto @@ -82,6 +82,10 @@ message LifecycleRules { // If 0, compactions are limited to the default number. // See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS uint32 max_active_compactions = 16; + + // Use up to this amount of space in bytes for caching Parquet files. + // A value of 0 disables Parquet caching + uint64 parquet_cache_limit = 17; } message DatabaseRules { diff --git a/generated_types/src/database_rules/lifecycle.rs b/generated_types/src/database_rules/lifecycle.rs index b9612bf1b6..ab71e38de5 100644 --- a/generated_types/src/database_rules/lifecycle.rs +++ b/generated_types/src/database_rules/lifecycle.rs @@ -35,6 +35,10 @@ impl From<LifecycleRules> for management::LifecycleRules { persist_row_threshold: config.persist_row_threshold.get() as u64, persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(), mub_row_threshold: config.mub_row_threshold.get() as u64, + parquet_cache_limit: config .parquet_cache_limit .map(|v| v.get()) .unwrap_or_default(), } } } @@ -69,6 +73,7 @@ impl TryFrom<management::LifecycleRules> for LifecycleRules { .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()), mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize) .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()), + parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit), }) } } @@ -93,6 +98,7 @@ mod tests { persist_row_threshold: 57, persist_age_threshold_seconds: 23, mub_row_threshold: 3454, + parquet_cache_limit: 10, }; let config: LifecycleRules = protobuf.clone().try_into().unwrap(); @@ -125,6 +131,11 @@ protobuf.persist_age_threshold_seconds ); assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold); + assert_eq!( + config.parquet_cache_limit.unwrap().get(), + protobuf.parquet_cache_limit + ); + assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit); } #[test] diff --git a/internal_types/src/schema.rs b/internal_types/src/schema.rs index 2afb0cede3..9ac26a8731 100644 --- a/internal_types/src/schema.rs +++ b/internal_types/src/schema.rs @@ -11,7 +11,7 @@ use arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use crate::{ schema::sort::{ColumnSort, SortKey}, @@ -395,11 +395,9 @@ impl Schema { pub fn compute_select_indicies(&self, columns: &[&str]) -> Result<Vec<usize>> { columns .iter() - .map(|column_name| { + .map(|&column_name| { self.find_index_of(column_name) - .ok_or_else(|| Error::ColumnNotFound { - column_name: column_name.to_string(), - })
+ .context(ColumnNotFound { column_name }) }) .collect() } @@ -788,12 +786,12 @@ macro_rules! assert_column_eq { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use InfluxColumnType::*; use InfluxFieldType::*; use super::{builder::SchemaBuilder, *}; use crate::schema::merge::SchemaMerger; - use crate::schema::sort::SortOptions; fn make_field( name: &str, diff --git a/internal_types/src/schema/sort.rs b/internal_types/src/schema/sort.rs index a56fd0a495..0612b11dbc 100644 --- a/internal_types/src/schema/sort.rs +++ b/internal_types/src/schema/sort.rs @@ -1,5 +1,6 @@ use std::{fmt::Display, str::FromStr}; +use arrow::compute::SortOptions; use indexmap::{map::Iter, IndexMap}; use itertools::Itertools; use snafu::Snafu; @@ -23,24 +24,6 @@ pub enum Error { pub type Result = std::result::Result; -/// Temporary - -#[derive(Debug, Clone, Copy, Eq, PartialEq)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - nulls_first: true, - } - } -} - #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct ColumnSort { /// Position of this column in the sort key diff --git a/lifecycle/src/policy.rs b/lifecycle/src/policy.rs index dfe713fa2c..beb35464ea 100644 --- a/lifecycle/src/policy.rs +++ b/lifecycle/src/policy.rs @@ -1399,6 +1399,7 @@ mod tests { let rules = LifecycleRules { late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; @@ -1538,6 +1539,7 @@ mod tests { persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; let now = Instant::now(); diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 8dd287f5ba..8cb97d0694 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -7,12 +7,13 @@ use std::{ use chrono::{DateTime, TimeZone, Utc}; -use data_types::partition_metadata::PartitionAddr; +use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary}; use entry::Sequence; use internal_types::guard::{ReadGuard, ReadLock}; use crate::checkpoint::PartitionCheckpoint; use crate::min_max_sequence::MinMaxSequence; +use data_types::instant::to_approximate_datetime; const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30); @@ -45,6 +46,9 @@ pub struct PersistenceWindows { late_arrival_period: Duration, closed_window_period: Duration, + /// The instant this PersistenceWindows was created + created_at: Instant, + /// The last instant passed to PersistenceWindows::add_range last_instant: Instant, @@ -106,6 +110,8 @@ impl PersistenceWindows { let closed_window_count = late_arrival_seconds / closed_window_seconds; + let created_at_instant = Instant::now(); + Self { persistable: ReadLock::new(None), closed: VecDeque::with_capacity(closed_window_count as usize), @@ -113,11 +119,18 @@ impl PersistenceWindows { addr, late_arrival_period, closed_window_period, - last_instant: Instant::now(), + created_at: created_at_instant, + last_instant: created_at_instant, max_sequence_numbers: Default::default(), } } + /// Updates the late 
arrival period of this `PersistenceWindows` instance + pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) { + self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD); + self.late_arrival_period = late_arrival_period; + } + /// Updates the windows with the information from a batch of rows from a single sequencer /// to the same partition. The min and max times are the times on the row data. The `received_at` /// Instant is when the data was received. Taking it in this function is really just about @@ -165,7 +178,7 @@ impl PersistenceWindows { self.rotate(received_at); match self.open.as_mut() { - Some(w) => w.add_range(sequence, row_count, min_time, max_time), + Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at), None => { self.open = Some(Window::new( received_at, @@ -335,6 +348,21 @@ impl PersistenceWindows { self.windows().next() } + /// Returns approximate summaries of the unpersisted writes + /// recorded by this PersistenceWindows instance + /// + /// These are approximate because persistence may partially flush a window, which will + /// update the min row timestamp but not the row count + pub fn summaries(&self) -> impl Iterator<Item = WriteSummary> + '_ { + self.windows().map(move |window| WriteSummary { + time_of_first_write: to_approximate_datetime(window.created_at), + time_of_last_write: to_approximate_datetime(window.last_instant), + min_timestamp: window.min_time, + max_timestamp: window.max_time, + row_count: window.row_count, + }) + } + /// Returns true if this PersistenceWindows instance is empty pub fn is_empty(&self) -> bool { self.minimum_window().is_none() } @@ -374,9 +402,14 @@ struct Window { /// The server time when this window was created. Used to determine how long data in this /// window has been sitting in memory. created_at: Instant, + /// The server time of the last write to this window + last_instant: Instant, + /// The number of rows in the window row_count: usize, - min_time: DateTime<Utc>, // min time value for data in the window - max_time: DateTime<Utc>, // max time value for data in the window + /// min time value for data in the window + min_time: DateTime<Utc>, + /// max time value for data in the window + max_time: DateTime<Utc>, /// maps sequencer_id to the minimum and maximum sequence numbers seen sequencer_numbers: BTreeMap<u32, MinMaxSequence>, } @@ -399,6 +432,7 @@ impl Window { Self { created_at, + last_instant: created_at, row_count, min_time, max_time, @@ -414,7 +448,11 @@ impl Window { row_count: usize, min_time: DateTime<Utc>, max_time: DateTime<Utc>, + instant: Instant, ) { + assert!(self.created_at <= instant); + self.last_instant = instant; + + self.row_count += row_count; if self.min_time > min_time { self.min_time = min_time; @@ -440,6 +478,10 @@ impl Window { /// Add one window to another. Used to collapse closed windows into persisted.
fn add_window(&mut self, other: Self) { + assert!(self.last_instant <= other.created_at); + assert!(self.last_instant <= other.last_instant); + + self.last_instant = other.last_instant; self.row_count += other.row_count; if self.min_time > other.min_time { self.min_time = other.min_time; @@ -1265,4 +1307,119 @@ mod tests { assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2)); assert_eq!(w.closed[1].row_count, 11); } + + #[test] + fn test_summaries() { + let late_arrival_period = Duration::from_secs(100); + let mut w = make_windows(late_arrival_period); + let instant = w.created_at; + let created_at_time = to_approximate_datetime(w.created_at); + + // Window 1 + w.add_range( + Some(&Sequence { id: 1, number: 1 }), + 11, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(11), + instant + Duration::from_millis(1), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 2 }), + 4, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(340), + instant + Duration::from_millis(30), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 3 }), + 6, + Utc.timestamp_nanos(1), + Utc.timestamp_nanos(5), + instant + Duration::from_millis(50), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2 + w.add_range( + Some(&Sequence { id: 1, number: 4 }), + 3, + Utc.timestamp_nanos(89), + Utc.timestamp_nanos(90), + instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3 + w.add_range( + Some(&Sequence { id: 1, number: 5 }), + 8, + Utc.timestamp_nanos(3), + Utc.timestamp_nanos(4), + instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3, + ); + + let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap(); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 3); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + chrono::Duration::milliseconds(50), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 21 + }, + WriteSummary { + time_of_first_write: created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(89), + max_timestamp: Utc.timestamp_nanos(90), + row_count: 3 + }, + WriteSummary { + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ); + + // Rotate first and second windows into persistable + w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 2); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 24 + }, + WriteSummary { + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ); + } } diff 
--git a/query/src/exec.rs b/query/src/exec.rs index ce5e085581..e5b7034c75 100644 --- a/query/src/exec.rs +++ b/query/src/exec.rs @@ -39,6 +39,7 @@ use crate::plan::{ }; use self::{ + context::IOxExecutionConfig, split::StreamSplitNode, task::{DedicatedExecutor, Error as ExecutorError}, }; @@ -111,6 +112,9 @@ pub struct Executor { /// Executor for running system/reorganization tasks such as /// compact reorg_exec: DedicatedExecutor, + + /// The default configuration options with which to create contexts + config: IOxExecutionConfig, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -128,12 +132,25 @@ impl Executor { let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads); let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads); + let config = IOxExecutionConfig::new(); + Self { query_exec, reorg_exec, + config, } } + /// returns the config of this executor + pub fn config(&self) -> &IOxExecutionConfig { + &self.config + } + + /// returns a mutable reference to this executor's config + pub fn config_mut(&mut self) -> &mut IOxExecutionConfig { + &mut self.config + } + /// Executes this plan on the query pool, and returns the /// resulting set of strings pub async fn to_string_set(&self, plan: StringSetPlan) -> Result { @@ -289,7 +306,7 @@ impl Executor { pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext { let executor = self.executor(executor_type).clone(); - IOxExecutionContext::new(executor) + IOxExecutionContext::new(executor, self.config.clone()) } /// Return the execution pool of the specified type diff --git a/query/src/exec/context.rs b/query/src/exec/context.rs index 702e032bac..ffad4541c5 100644 --- a/query/src/exec/context.rs +++ b/query/src/exec/context.rs @@ -5,6 +5,7 @@ use std::{fmt, sync::Arc}; use arrow::record_batch::RecordBatch; use datafusion::{ + catalog::catalog::CatalogProvider, execution::context::{ExecutionContextState, QueryPlanner}, logical_plan::{LogicalPlan, UserDefinedLogicalNode}, physical_plan::{ @@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner { } } +// Configuration for an IOx execution context +#[derive(Clone)] +pub struct IOxExecutionConfig { + /// Configuration options to pass to DataFusion + inner: ExecutionConfig, +} + +impl Default for IOxExecutionConfig { + fn default() -> Self { + const BATCH_SIZE: usize = 1000; + + // Setup default configuration + let inner = ExecutionConfig::new() + .with_batch_size(BATCH_SIZE) + .create_default_catalog_and_schema(true) + .with_information_schema(true) + .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + .with_query_planner(Arc::new(IOxQueryPlanner {})); + + Self { inner } + } +} + +impl fmt::Debug for IOxExecutionConfig { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "IOxExecutionConfig ...") + } +} + +impl IOxExecutionConfig { + pub fn new() -> Self { + Default::default() + } + + /// Set execution concurrency + pub fn set_concurrency(&mut self, concurrency: usize) { + self.inner.concurrency = concurrency; + } +} + /// This is an execution context for planning in IOx. It wraps a /// DataFusion execution context with the information needed for planning. 
/// @@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext { impl IOxExecutionContext { /// Create an ExecutionContext suitable for executing DataFusion plans - /// - /// The config is created with a default catalog and schema, but this - /// can be overridden at a later date - pub fn new(exec: DedicatedExecutor) -> Self { - const BATCH_SIZE: usize = 1000; - - // TBD: Should we be reusing an execution context across all executions? - let config = ExecutionConfig::new() - .with_batch_size(BATCH_SIZE) - .create_default_catalog_and_schema(true) - .with_information_schema(true) - .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) - .with_query_planner(Arc::new(IOxQueryPlanner {})); - - let inner = ExecutionContext::with_config(config); + pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self { + let inner = ExecutionContext::with_config(config.inner); Self { inner, exec } } @@ -160,11 +188,13 @@ impl IOxExecutionContext { &self.inner } - /// returns a mutable reference to the inner datafusion execution context - pub fn inner_mut(&mut self) -> &mut ExecutionContext { - &mut self.inner + /// registers a catalog with the inner context + pub fn register_catalog(&mut self, name: impl Into<String>, catalog: Arc<dyn CatalogProvider>) { + self.inner.register_catalog(name, catalog); } + /// + /// Prepare a SQL statement for execution. This assumes that any /// tables referenced in the SQL have been registered with this context pub fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> { diff --git a/query/src/frontend/reorg.rs b/query/src/frontend/reorg.rs index a1c2df8599..43e1c824e7 100644 --- a/query/src/frontend/reorg.rs +++ b/query/src/frontend/reorg.rs @@ -268,8 +268,9 @@ struct ScanPlan { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use arrow_util::assert_batches_eq; - use internal_types::schema::{merge::SchemaMerger, sort::SortOptions}; + use internal_types::schema::merge::SchemaMerger; use crate::{ exec::{Executor, ExecutorType}, diff --git a/query/src/frontend/sql.rs b/query/src/frontend/sql.rs index be737fd4d6..c08ad7c671 100644 --- a/query/src/frontend/sql.rs +++ b/query/src/frontend/sql.rs @@ -87,7 +87,7 @@ impl SqlQueryPlanner { executor: &Executor, ) -> Result<Arc<dyn ExecutionPlan>> { let mut ctx = executor.new_context(ExecutorType::Query); - ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database); + ctx.register_catalog(DEFAULT_CATALOG, database); ctx.prepare_sql(query).context(Preparing) } } diff --git a/query/src/provider/deduplicate/algo.rs b/query/src/provider/deduplicate/algo.rs index d3267b5760..db1165ff3a 100644 --- a/query/src/provider/deduplicate/algo.rs +++ b/query/src/provider/deduplicate/algo.rs @@ -366,21 +366,12 @@ impl RecordBatchDeduplicator { } /// Create a new record batch from offset --> len - /// - /// for adding this upstream fn slice_record_batch( batch: &RecordBatch, offset: usize, len: usize, ) -> ArrowResult<RecordBatch> { - let schema = batch.schema(); - let new_columns: Vec<_> = batch - .columns() - .iter() - .map(|old_column| old_column.slice(offset, len)) - .collect(); - - let batch = RecordBatch::try_new(schema, new_columns)?; + let batch = batch.slice(offset, len); // At time of writing, `concat_batches` concatenates the // contents of dictionaries as well; Do a post pass to remove the diff --git a/query_tests/cases/in/all_chunks_dropped.expected b/query_tests/cases/in/all_chunks_dropped.expected index 5febb4d2e9..65e17df50a 100644 --- a/query_tests/cases/in/all_chunks_dropped.expected +++ b/query_tests/cases/in/all_chunks_dropped.expected @@ -1,25 +1,27 @@ -- Test
Setup: OneMeasurementAllChunksDropped -- SQL: SELECT * from information_schema.tables; -+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ -- SQL: SHOW TABLES; -+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ diff --git a/query_tests/cases/in/duplicates.expected b/query_tests/cases/in/duplicates.expected index 97ba6e6001..d83e04b0dc 100644 --- a/query_tests/cases/in/duplicates.expected +++ b/query_tests/cases/in/duplicates.expected @@ -1,86 +1,87 @@ -- Test Setup: OneMeasurementThreeChunksWithDuplicates --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, 
#h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | -| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; 
-+-----------------------------------------+-------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------+ -| logical_plan | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=None | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| logical_plan after simplify_expressions | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| physical_plan | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@1 as state] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | ProjectionExec: expr=[city@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[city@0 as city] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------+ +-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | +| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | +| | CoalescePartitionsExec | +| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: 
partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; ++---------------+-----------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------+ +| logical_plan | Union | +| | Projection: #h2o.state AS name | +| | TableScan: h2o projection=Some([4]) | +| | Projection: #h2o.city AS name | +| | TableScan: h2o projection=Some([1]) | +| physical_plan | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@1 as state] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 
predicate=Predicate | +| | ProjectionExec: expr=[city@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[city@0 as city] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/duplicates.sql b/query_tests/cases/in/duplicates.sql index c9e0159199..5ef261c75e 100644 --- a/query_tests/cases/in/duplicates.sql +++ b/query_tests/cases/in/duplicates.sql @@ -2,11 +2,11 @@ -- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates -- Plan with order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; +explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -- plan without order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o; +EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; -- Union plan -EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; +EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; diff --git a/query_tests/cases/in/pushdown.expected b/query_tests/cases/in/pushdown.expected index 7173930313..e4e84b8b9e 100644 --- a/query_tests/cases/in/pushdown.expected +++ b/query_tests/cases/in/pushdown.expected @@ -1,218 +1,167 @@ -- Test Setup: TwoMeasurementsPredicatePushDown --- SQL: EXPLAIN VERBOSE SELECT * from restaurant; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type 
| plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Float64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, 
#restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And 
#restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| 
logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt 
Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or 
#restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq 
Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200; ++---------------+--------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] | ++---------------+--------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0; ++---------------+----------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Float64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 
as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Float64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] | ++---------------+----------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0; ++---------------+---------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] | ++---------------+---------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, 
#restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] | 
++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] | 
++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/pushdown.sql b/query_tests/cases/in/pushdown.sql index 6ccf718c32..e97c34ef85 100644 --- a/query_tests/cases/in/pushdown.sql +++ b/query_tests/cases/in/pushdown.sql @@ -2,44 +2,44 @@ -- IOX_SETUP: TwoMeasurementsPredicatePushDown -- Test 1: Select everything -EXPLAIN VERBOSE SELECT * from restaurant; +EXPLAIN SELECT * from restaurant; -- Test 2: One push-down expression: count > 200 -- TODO: Make push-down predicates shown in explain verbose. 
Ticket #1538 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200; +EXPLAIN SELECT * from restaurant where count > 200; -- Test 2.2: One push-down expression: count > 200.0 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; +EXPLAIN SELECT * from restaurant where count > 200.0; -- Test 2.3: One push-down expression: system > 4.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; +EXPLAIN SELECT * from restaurant where system > 4.0; -- Test 3: Two push-down expression: count > 200 and town != 'tewsbury' -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; -- Test 4: Still two push-down expression: count > 200 and town != 'tewsbury' -- even though the results are different -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -- Test 5: three push-down expression: count > 200 and town != 'tewsbury' and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -- Test 6: two push-down expression: count > 200 and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; -- Test 7: two push-down expression on float: system > 4.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; -- Test 8: two push-down expression on float: system > 5.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; -- Test 9: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; +EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -- Test 10: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); +EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -- Test 11: four push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 and -- time > to_timestamp('1970-01-01T00:00:00.000000120+00:00') rewritten to time GT INT(130) -EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); +EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); diff --git a/query_tests/src/runner.rs b/query_tests/src/runner.rs index 6edf05ce56..7abbf40403 100644 --- a/query_tests/src/runner.rs +++ b/query_tests/src/runner.rs @@ -4,12 +4,16 @@ mod parse; mod setup; use 
arrow::record_batch::RecordBatch; -use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner}; +use query::{ + exec::{Executor, ExecutorType}, + frontend::sql::SqlQueryPlanner, +}; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ io::LineWriter, io::Write, path::{Path, PathBuf}, + sync::Arc, }; use self::{parse::TestQueries, setup::TestSetup}; @@ -261,7 +265,13 @@ impl Runner { writeln!(self.log, "Running scenario '{}'", scenario_name)?; writeln!(self.log, "SQL: '{:#?}'", sql)?; let planner = SqlQueryPlanner::default(); - let executor = db.executor(); + let num_threads = 1; + let mut executor = Executor::new(num_threads); + + // hardcode concurrency in tests as by default is is the + // number of cores, which varies across machines + executor.config_mut().set_concurrency(4); + let executor = Arc::new(executor); let physical_plan = planner .query(db, &sql, executor.as_ref()) diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 58072c5a97..6fc2c13550 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() { // validate we have access to information schema for listing table // names let expected = vec![ - "+---------------+--------------------+---------------+------------+", - "| table_catalog | table_schema | table_name | table_type |", - "+---------------+--------------------+---------------+------------+", - "| public | information_schema | columns | VIEW |", - "| public | information_schema | tables | VIEW |", - "| public | iox | h2o | BASE TABLE |", - "| public | iox | o2 | BASE TABLE |", - "| public | system | chunk_columns | BASE TABLE |", - "| public | system | chunks | BASE TABLE |", - "| public | system | columns | BASE TABLE |", - "| public | system | operations | BASE TABLE |", - "+---------------+--------------------+---------------+------------+", + "+---------------+--------------------+---------------------+------------+", + "| table_catalog | table_schema | table_name | table_type |", + "+---------------+--------------------+---------------------+------------+", + "| public | information_schema | columns | VIEW |", + "| public | information_schema | tables | VIEW |", + "| public | iox | h2o | BASE TABLE |", + "| public | iox | o2 | BASE TABLE |", + "| public | system | chunk_columns | BASE TABLE |", + "| public | system | chunks | BASE TABLE |", + "| public | system | columns | BASE TABLE |", + "| public | system | operations | BASE TABLE |", + "| public | system | persistence_windows | BASE TABLE |", + "+---------------+--------------------+---------------------+------------+", ]; run_sql_test_case!( TwoMeasurementsManyFields {}, diff --git a/server/src/config.rs b/server/src/config.rs index 65dab0501c..f869319a87 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -8,7 +8,7 @@ use data_types::{ DatabaseName, }; use metrics::MetricRegistry; -use object_store::{path::ObjectStorePath, ObjectStore}; +use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi}; use parquet_file::catalog::PreservedCatalog; use query::exec::Executor; use write_buffer::config::WriteBufferConfig; @@ -16,9 +16,13 @@ use write_buffer::config::WriteBufferConfig; /// This module contains code for managing the configuration of the server. 
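Note on the query_tests/src/runner.rs hunk above: the runner no longer borrows the database's executor but builds its own single-threaded Executor and pins query concurrency to 4, so the RepartitionExec: partitioning=RoundRobinBatch(4) lines in the expected EXPLAIN output do not vary with the core count of the machine running the tests. A minimal sketch of that setup, distilled from the hunk (the query crate paths are as they appear in the diff; everything else in the runner is omitted):

    use std::sync::Arc;
    use query::exec::Executor;

    /// Executor used by the test runner: one worker thread, but query
    /// concurrency pinned to 4 so plan partitioning is machine-independent.
    fn test_executor() -> Arc<Executor> {
        let num_threads = 1;
        let mut executor = Executor::new(num_threads);
        executor.config_mut().set_concurrency(4);
        Arc::new(executor)
    }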
use crate::{ db::{catalog::Catalog, DatabaseToCommit, Db}, - Error, JobRegistry, Result, + DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error, + InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch, + ServerShuttingDown, }; +use object_store::path::Path; use observability_deps::tracing::{self, error, info, warn, Instrument}; +use snafu::{ensure, OptionExt}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; @@ -34,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb"; /// run to completion if the tokio runtime is dropped #[derive(Debug)] pub(crate) struct Config { - shutdown: CancellationToken, jobs: Arc, - state: RwLock, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, + + shutdown: CancellationToken, + state: RwLock, } pub(crate) enum UpdateError { @@ -55,14 +63,20 @@ impl Config { /// Create new empty config. pub(crate) fn new( jobs: Arc, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, remote_template: Option, ) -> Self { Self { + jobs, + object_store, + exec, + server_id, + metric_registry, shutdown: Default::default(), state: RwLock::new(ConfigState::new(remote_template)), - jobs, - metric_registry, } } @@ -77,33 +91,20 @@ impl Config { /// This only works if the database is not yet known. To recover a database out of an uninitialized state, see /// [`recover_db`](Self::recover_db). To do maintainance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). - pub(crate) fn create_db( - &self, - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result> { + pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(DatabaseHandle { - state: Some(Arc::new(DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - })), + state: Some(Arc::new(DatabaseState::Known { db_name })), config: &self, }) } @@ -115,32 +116,27 @@ impl Config { /// While the handle is held, no other operations for the given database can be executed. /// /// This only works if the database is known but is uninitialized. To create a new database that is not yet known, - /// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog) + /// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). 
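Note on the server/src/config.rs changes above: Config now owns the object store, executor, and server id itself, so create_db shrinks to create_db(db_name) and callers stop threading those Arcs through every call; the error paths also switch from hand-built Error variants to snafu's ensure!/context selectors (with .fail() for plain early returns). A small self-contained illustration of that snafu 0.6 idiom, using simplified stand-in error variants rather than the server crate's real error type:

    use snafu::{ensure, OptionExt, Snafu};

    #[derive(Debug, Snafu)]
    enum Error {
        #[snafu(display("database ({}) already exists", db_name))]
        DatabaseAlreadyExists { db_name: String },

        #[snafu(display("database ({}) not found", db_name))]
        DatabaseNotFound { db_name: String },
    }

    type Result<T, E = Error> = std::result::Result<T, E>;

    fn reserve(existing: &[String], db_name: &str) -> Result<()> {
        // `ensure!` replaces the hand-rolled `if ... { return Err(Error::...) }`
        // blocks; the generated selector accepts anything `Into<String>`.
        ensure!(
            !existing.iter().any(|n| n.as_str() == db_name),
            DatabaseAlreadyExists { db_name }
        );
        Ok(())
    }

    fn lookup<'a>(existing: &'a [String], db_name: &str) -> Result<&'a String> {
        // `context` converts an `Option` miss into the named error variant.
        existing
            .iter()
            .find(|n| n.as_str() == db_name)
            .context(DatabaseNotFound { db_name })
    }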
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); - let db_state = - state - .databases - .get(&db_name) - .cloned() - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + let db_state = state + .databases + .get(&db_name) + .cloned() + .context(DatabaseNotFound { db_name: &db_name })?; - if db_state.is_initialized() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !db_state.is_initialized(), + DatabaseAlreadyExists { db_name } + ); - state.reservations.insert(db_name.clone()); + state.reservations.insert(db_name); Ok(DatabaseHandle { state: Some(db_state), config: &self, @@ -159,16 +155,14 @@ impl Config { db_name: DatabaseName<'static>, ) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(BlockDatabaseGuard { @@ -228,11 +222,9 @@ impl Config { // TODO: implement for non-initialized databases let db = self .db_initialized(db_name) - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + .context(DatabaseNotFound { db_name })?; - db.update_db_rules(update).map_err(UpdateError::Closure) + db.update_rules(update).map_err(UpdateError::Closure) } /// Get all registered remote servers. @@ -311,6 +303,24 @@ impl Config { pub fn metrics_registry(&self) -> Arc { Arc::clone(&self.metric_registry) } + + /// Returns the object store of this server + pub fn object_store(&self) -> Arc { + Arc::clone(&self.object_store) + } + + /// Returns the server id of this server + pub fn server_id(&self) -> ServerId { + self.server_id + } + + /// Base location in object store for this server. + pub fn root_path(&self) -> Path { + let id = self.server_id.get(); + let mut path = self.object_store.new_path(); + path.push_dir(format!("{}", id)); + path + } } /// Get object store path for the database config under the given root (= path under with the server with the current ID @@ -373,41 +383,14 @@ impl RemoteTemplate { } /// Internal representation of the different database states. -/// -/// # Shared Data During Transitions -/// The following elements can safely be shared between states because they won't be poisoned by any half-done -/// transition (e.g. 
starting a transition and then failing due to an IO error): -/// - `object_store` -/// - `exec` -/// -/// The following elements can trivially be copied from one state to the next: -/// - `server_id` -/// - `db_name` -/// -/// The following elements MUST be copied from one state to the next because partial modifications are not allowed: -/// - `rules` -/// -/// Exceptions to the above rules are the following states: -/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm -/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else #[derive(Debug)] #[allow(clippy::large_enum_variant)] enum DatabaseState { /// Database is known but nothing is loaded. - Known { - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - }, + Known { db_name: DatabaseName<'static> }, /// Rules are loaded - RulesLoaded { - object_store: Arc, - exec: Arc, - server_id: ServerId, - rules: Arc, - }, + RulesLoaded { rules: Arc }, /// Catalog is loaded but data from sequencers / write buffers is not yet replayed. Replay { db: Arc }, @@ -465,24 +448,6 @@ impl DatabaseState { } } - fn object_store(&self) -> Arc { - match self { - DatabaseState::Known { object_store, .. } => Arc::clone(object_store), - DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store), - DatabaseState::Replay { db, .. } => Arc::clone(&db.store), - DatabaseState::Initialized { db, .. } => Arc::clone(&db.store), - } - } - - fn server_id(&self) -> ServerId { - match self { - DatabaseState::Known { server_id, .. } => *server_id, - DatabaseState::RulesLoaded { server_id, .. } => *server_id, - DatabaseState::Replay { db, .. } => db.server_id, - DatabaseState::Initialized { db, .. } => db.server_id, - } - } - fn rules(&self) -> Option> { match self { DatabaseState::Known { .. } => None, @@ -548,12 +513,12 @@ impl<'a> DatabaseHandle<'a> { /// Get object store. pub fn object_store(&self) -> Arc { - self.state().object_store() + Arc::clone(&self.config.object_store) } /// Get server ID. pub fn server_id(&self) -> ServerId { - self.state().server_id() + self.config.server_id } /// Get metrics registry. @@ -592,32 +557,26 @@ impl<'a> DatabaseHandle<'a> { /// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded). 
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> { match self.state().as_ref() { - DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - } => { - if db_name != &rules.name { - return Err(Error::RulesDatabaseNameMismatch { - actual: rules.name.to_string(), - expected: db_name.to_string(), - }); - } + DatabaseState::Known { db_name } => { + ensure!( + db_name == &rules.name, + RulesDatabaseNameMismatch { + actual: rules.name, + expected: db_name, + } + ); self.state = Some(Arc::new(DatabaseState::RulesLoaded { - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), - server_id: *server_id, rules: Arc::new(rules), })); Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Known, - }), + } + .fail(), } } @@ -629,16 +588,11 @@ impl<'a> DatabaseHandle<'a> { write_buffer: Option, ) -> Result<()> { match self.state().as_ref() { - DatabaseState::RulesLoaded { - object_store, - exec, - server_id, - rules, - } => { + DatabaseState::RulesLoaded { rules } => { let database_to_commit = DatabaseToCommit { - server_id: *server_id, - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), + server_id: self.config.server_id, + object_store: Arc::clone(&self.config.object_store), + exec: Arc::clone(&self.config.exec), preserved_catalog, catalog, rules: Arc::clone(&rules), @@ -650,10 +604,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::RulesLoaded, - }), + } + .fail(), } } @@ -663,7 +618,7 @@ impl<'a> DatabaseHandle<'a> { DatabaseState::Replay { db } => { if self.config.shutdown.is_cancelled() { error!("server is shutting down"); - return Err(Error::ServerShuttingDown); + return ServerShuttingDown.fail(); } let shutdown = self.config.shutdown.child_token(); @@ -686,10 +641,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Replay, - }), + } + .fail(), } } } @@ -730,40 +686,32 @@ mod test { use super::*; use std::num::NonZeroU32; + fn make_config(remote_template: Option) -> Config { + let store = Arc::new(ObjectStore::new_in_memory()); + let server_id = ServerId::try_from(1).unwrap(); + let metric_registry = Arc::new(metrics::MetricRegistry::new()); + Config::new( + Arc::new(JobRegistry::new()), + Arc::clone(&store), + Arc::new(Executor::new(1)), + server_id, + Arc::clone(&metric_registry), + remote_template, + ) + } + #[tokio::test] async fn create_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // getting handle while DB is reserved => fails { - let _db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let _db_reservation = config.create_db(name.clone()).unwrap(); - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - 
.unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. })); let err = config.block_db(name.clone()).unwrap_err(); @@ -775,14 +723,7 @@ mod test { // name in rules must match reserved name { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); let err = db_reservation .advance_rules_loaded(rules.clone()) @@ -795,14 +736,7 @@ mod test { // handle.abort just works (aka does not mess up the transaction afterwards) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); db_reservation.abort(); } @@ -812,21 +746,14 @@ mod test { // create DB successfull { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -866,14 +793,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create DB as second time => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block fully initiliazed DB => fail @@ -888,40 +808,18 @@ mod test { async fn recover_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // create DB but don't continue with rules loaded (e.g. because the rules file is broken) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.commit(); } assert!(config.has_uninitialized_database(&name)); // create DB while it is uninitialized => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. 
})); // recover an unknown DB => fail @@ -935,19 +833,19 @@ mod test { let mut db_reservation = config.recover_db(name.clone()).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_none()); db_reservation.advance_rules_loaded(rules).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -958,13 +856,13 @@ mod test { .unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.advance_init().unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.commit(); @@ -978,14 +876,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create recovered DB => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block recovered DB => fail @@ -1000,28 +891,13 @@ mod test { async fn block_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); // block DB let handle = config.block_db(name.clone()).unwrap(); // create while blocked => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. 
})); // recover while blocked => fail @@ -1034,14 +910,7 @@ mod test { // unblock => DB can be created drop(handle); - config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + config.create_db(name.clone()).unwrap(); // cleanup config.drain().await @@ -1051,20 +920,12 @@ mod test { async fn test_db_drop() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -1072,14 +933,7 @@ mod test { .unwrap(); // create DB - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); db_reservation .advance_replay(preserved_catalog, catalog, None) @@ -1126,12 +980,7 @@ mod test { #[test] fn resolve_remote() { - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - Some(RemoteTemplate::new("http://iox-query-{id}:8082")), - ); + let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082"))); let server_id = ServerId::new(NonZeroU32::new(42).unwrap()); let remote = config.resolve_remote(server_id); diff --git a/server/src/db.rs b/server/src/db.rs index c39655f107..6dee5446b1 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -50,7 +50,7 @@ use std::{ time::{Duration, Instant}, }; use write_buffer::config::WriteBufferConfig; -use write_buffer::core::WriteBufferError; +use write_buffer::core::{FetchHighWatermark, WriteBufferError}; pub mod access; pub mod catalog; @@ -144,6 +144,91 @@ pub enum Error { pub type Result = std::result::Result; +/// Metrics for data ingest via write buffer. +#[derive(Debug)] +struct WriteBufferIngestMetrics { + /// Metrics domain + domain: Arc, +} + +impl WriteBufferIngestMetrics { + fn new(domain: Arc) -> Self { + Self { domain } + } + + fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics { + let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())]; + + let red = self + .domain + .register_red_metric_with_labels(Some("ingest"), labels.clone()); + let bytes_read = self.domain.register_counter_metric_with_labels( + "read", + Some("bytes"), + "Bytes read from sequencer", + labels.clone(), + ); + let last_sequence_number = self.domain.register_gauge_metric_with_labels( + "last_sequence_number", + None, + "Last consumed sequence number (e.g. Kafka offset)", + &labels, + ); + let sequence_number_lag = self.domain.register_gauge_metric_with_labels( + "sequence_number_lag", + None, + "The difference between the the last sequence number available (e.g. 
Kafka offset) and (= minus) last consumed sequence number", + &labels, + ); + let last_min_ts = self.domain.register_gauge_metric_with_labels( + "last_min_ts", + None, + "Minimum timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + let last_max_ts = self.domain.register_gauge_metric_with_labels( + "last_max_ts", + None, + "Maximum timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + + SequencerMetrics { + red, + bytes_read, + last_sequence_number, + sequence_number_lag, + last_min_ts, + last_max_ts, + } + } +} + +/// Metrics for a single sequencer. +#[derive(Debug)] +struct SequencerMetrics { + /// Metrics for tracking ingest. + red: metrics::RedMetric, + + /// Bytes read from sequencer. + /// + /// This metrics is independent of the success / error state of the entries. + bytes_read: metrics::Counter, + + /// Last consumed sequence number (e.g. Kafka offset). + last_sequence_number: metrics::Gauge, + + // The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed + // sequence number. + sequence_number_lag: metrics::Gauge, + + /// Minimum timestamp of last write as unix timestamp in nanoseconds. + last_min_ts: metrics::Gauge, + + /// Maximum timestamp of last write as unix timestamp in nanoseconds. + last_max_ts: metrics::Gauge, +} + /// This is the main IOx Database object. It is the root object of any /// specific InfluxDB IOx instance /// @@ -203,10 +288,10 @@ pub type Result = std::result::Result; pub struct Db { rules: RwLock>, - pub server_id: ServerId, // this is also the Query Server ID + server_id: ServerId, // this is also the Query Server ID /// Interface to use for persistence - pub store: Arc, + store: Arc, /// Executor for running queries exec: Arc, @@ -248,8 +333,8 @@ pub struct Db { /// Metric labels metric_labels: Vec, - /// Metrics for tracking the number of errors that occur while ingesting data - ingest_errors: metrics::Counter, + /// Ingest metrics + ingest_metrics: WriteBufferIngestMetrics, /// Optionally connect to a write buffer for either buffering writes or reading buffered writes write_buffer: Option, @@ -285,9 +370,8 @@ impl Db { let metric_labels = database_to_commit.catalog.metric_labels.clone(); let ingest_domain = - metrics_registry.register_domain_with_labels("ingest", metric_labels.clone()); - let ingest_errors = - ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest"); + metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone()); + let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain)); let catalog = Arc::new(database_to_commit.catalog); @@ -316,7 +400,7 @@ impl Db { worker_iterations_lifecycle: AtomicUsize::new(0), worker_iterations_cleanup: AtomicUsize::new(0), metric_labels, - ingest_errors, + ingest_metrics, write_buffer: database_to_commit.write_buffer, cleanup_lock: Default::default(), } @@ -333,13 +417,40 @@ impl Db { } /// Updates the database rules - pub fn update_db_rules(&self, update: F) -> Result, E> + pub fn update_rules(&self, update: F) -> Result, E> where F: FnOnce(DatabaseRules) -> Result, { - let mut rules = self.rules.write(); - let new_rules = Arc::new(update(rules.as_ref().clone())?); - *rules = Arc::clone(&new_rules); + let (late_arrive_window_updated, new_rules) = { + let mut rules = self.rules.write(); + info!(db_name=%rules.name, "updating rules for database"); + let new_rules = Arc::new(update(rules.as_ref().clone())?); + let late_arrive_window_updated 
= rules.lifecycle_rules.late_arrive_window_seconds + != new_rules.lifecycle_rules.late_arrive_window_seconds; + + *rules = Arc::clone(&new_rules); + (late_arrive_window_updated, new_rules) + }; + + if late_arrive_window_updated { + // Hold a read lock to prevent concurrent modification and + // use values from re-acquired read guard + let current = self.rules.read(); + + // Update windows + let partitions = self.catalog.partitions(); + for partition in &partitions { + let mut partition = partition.write(); + let addr = partition.addr().clone(); + if let Some(windows) = partition.persistence_windows_mut() { + info!(partition=%addr, "updating persistence windows"); + windows.set_late_arrival_period(Duration::from_secs( + current.lifecycle_rules.late_arrive_window_seconds.get() as u64, + )) + } + } + } + Ok(new_rules) } @@ -656,9 +767,17 @@ impl Db { // streaming from the write buffer loop async { if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer { + let mut write_buffer = write_buffer + .try_lock() + .expect("no streams should exist at this point"); let mut futures = vec![]; - for (_sequencer_id, stream) in write_buffer.streams() { - let fut = self.stream_in_sequenced_entries(stream); + for (sequencer_id, stream) in write_buffer.streams() { + let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id); + let fut = self.stream_in_sequenced_entries( + stream.stream, + stream.fetch_high_watermark, + metrics, + ); futures.push(fut); } @@ -675,32 +794,122 @@ impl Db { /// This is used to take entries from a `Stream` and put them in the mutable buffer, such as /// streaming entries from a write buffer. - async fn stream_in_sequenced_entries( - &self, - stream: BoxStream<'_, Result>, + async fn stream_in_sequenced_entries<'a>( + &'a self, + mut stream: BoxStream<'a, Result>, + f_mark: FetchHighWatermark<'a>, + mut metrics: SequencerMetrics, ) { - stream - .for_each(|sequenced_entry_result| async { - let sequenced_entry = match sequenced_entry_result { - Ok(sequenced_entry) => sequenced_entry, - Err(e) => { - debug!(?e, "Error converting write buffer data to SequencedEntry"); - self.ingest_errors.add(1); - return; - } - }; + let mut watermark_last_updated: Option = None; + let mut watermark = 0; - let sequenced_entry = Arc::new(sequenced_entry); + while let Some(sequenced_entry_result) = stream.next().await { + let red_observation = metrics.red.observation(); - if let Err(e) = self.store_sequenced_entry(sequenced_entry) { + // get entry from sequencer + let sequenced_entry = match sequenced_entry_result { + Ok(sequenced_entry) => sequenced_entry, + Err(e) => { + debug!(?e, "Error converting write buffer data to SequencedEntry"); + red_observation.client_error(); + continue; + } + }; + let sequenced_entry = Arc::new(sequenced_entry); + + // store entry + match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) { + Ok(_) => { + red_observation.ok(); + } + Err(e) => { debug!( ?e, "Error storing SequencedEntry from write buffer in database" ); - self.ingest_errors.add(1); + red_observation.error(); } - }) - .await + } + + // maybe update sequencer watermark + // We are not updating this watermark every round because asking the sequencer for that watermark can be + // quite expensive. 
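// The high watermark is refreshed from the sequencer at most once every 10 seconds;
// in between, the cached value is reused, so the lag gauge computed below may briefly
// trail the sequencer's true state.
//
// A hypothetical standalone sketch of that lag computation (the function name and the
// assert are illustrative only, not part of this change):
//
//     fn sequence_number_lag(high_watermark: u64, last_consumed: u64) -> u64 {
//         high_watermark
//             .saturating_sub(last_consumed)
//             .saturating_sub(1)
//     }
//
//     // Mirrors the metrics test further below: after consuming sequence number 7
//     // the asserted lag is 0, which implies a mock high watermark of 8 (one past
//     // the last written sequence number).
//     assert_eq!(sequence_number_lag(8, 7), 0);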
+ if watermark_last_updated + .map(|ts| ts.elapsed() > Duration::from_secs(10)) + .unwrap_or(true) + { + match f_mark().await { + Ok(w) => { + watermark = w; + } + Err(e) => { + debug!(%e, "Error while reading sequencer watermark") + } + } + watermark_last_updated = Some(Instant::now()); + } + + // update: + // - bytes read + // - last sequence number + // - lag + // - min ts + // - max ts + let sequence = sequenced_entry + .sequence() + .expect("entry from write buffer must be sequenced"); + let entry = sequenced_entry.entry(); + metrics.bytes_read.add(entry.data().len() as u64); + metrics + .last_sequence_number + .set(sequence.number as usize, &[]); + metrics.sequence_number_lag.set( + watermark.saturating_sub(sequence.number).saturating_sub(1) as usize, + &[], + ); + if let Some(min_ts) = entry + .partition_writes() + .map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(min, _max)| min) + .max() + }) + .min() + }) + .flatten() + { + metrics + .last_min_ts + .set(min_ts.timestamp_nanos() as usize, &[]); + } + if let Some(max_ts) = entry + .partition_writes() + .map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(_min, max)| max) + .max() + }) + .max() + }) + .flatten() + { + metrics + .last_max_ts + .set(max_ts.timestamp_nanos() as usize, &[]); + } + } } async fn cleanup_unreferenced_parquet_files( @@ -1208,17 +1417,27 @@ mod tests { #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { - let entry = lp_to_entry("cpu bar=1 10"); let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1); - write_buffer_state - .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10")) + .unwrap(), + ); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence( + Sequence::new(0, 7), + lp_to_entry("cpu bar=2 20\ncpu bar=3 30"), + ) + .unwrap(), + ); + let write_buffer = MockBufferForReading::new(write_buffer_state); - let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + let test_db = TestDb::builder() + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() - .await - .db; + .await; + let db = test_db.db; // do: start background task loop let shutdown: CancellationToken = Default::default(); @@ -1247,18 +1466,84 @@ mod tests { tokio::time::sleep(Duration::from_millis(100)).await; } + // check: metrics + // We need to do that BEFORE shutting down the background loop because gauges would be dropped and resetted otherwise + let metrics = test_db.metric_registry; + metrics + .has_metric_family("write_buffer_ingest_requests_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "ok"), + ]) + .counter() + .eq(2.0) + .unwrap(); + metrics + .has_metric_family("write_buffer_read_bytes_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .counter() + .eq(528.0) + .unwrap(); + metrics + 
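// 528 is the combined encoded size, in bytes, of the two entries consumed from the
// mock write buffer (`bytes_read` adds `entry.data().len()` for every entry read).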
.has_metric_family("write_buffer_last_sequence_number") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(7.0) + .unwrap(); + metrics + .has_metric_family("write_buffer_sequence_number_lag") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(0.0) + .unwrap(); + metrics + .has_metric_family("write_buffer_last_min_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(20.0) + .unwrap(); + metrics + .has_metric_family("write_buffer_last_max_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(30.0) + .unwrap(); + // do: stop background task loop shutdown.cancel(); join_handle.await.unwrap(); // check: the expected results should be there - let batches = run_query(db, "select * from cpu").await; + let batches = run_query(db, "select * from cpu order by time").await; let expected = vec![ "+-----+-------------------------------+", "| bar | time |", "+-----+-------------------------------+", - "| 1 | 1970-01-01 00:00:00.000000010 |", + "| 2 | 1970-01-01 00:00:00.000000020 |", + "| 3 | 1970-01-01 00:00:00.000000030 |", "+-----+-------------------------------+", ]; assert_batches_eq!(expected, &batches); @@ -1271,10 +1556,12 @@ mod tests { String::from("Something bad happened on the way to creating a SequencedEntry").into(), 0, ); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let test_db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await; @@ -1291,11 +1578,16 @@ mod tests { // check: after a while the error should be reported in the database's metrics let t_0 = Instant::now(); loop { - let family = metrics.try_has_metric_family("ingest_errors_total"); + let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total"); if let Ok(metric) = family { if metric - .with_labels(&[("db_name", "placeholder"), ("svr_id", "1")]) + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "client_error"), + ]) .counter() .eq(1.0) .is_ok() @@ -2259,10 +2551,12 @@ mod tests { ); write_buffer_state .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await .db; diff --git a/server/src/db/catalog.rs b/server/src/db/catalog.rs index 6a06b06731..dff3c37b6b 100644 --- a/server/src/db/catalog.rs +++ b/server/src/db/catalog.rs @@ -6,15 +6,16 @@ use hashbrown::{HashMap, HashSet}; use data_types::chunk_metadata::ChunkSummary; use data_types::chunk_metadata::DetailedChunkSummary; -use data_types::partition_metadata::{PartitionSummary, TableSummary}; +use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary}; use internal_types::schema::Schema; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use 
tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; use self::chunk::CatalogChunk; use self::metrics::CatalogMetrics; use self::partition::Partition; use self::table::Table; +use data_types::write_summary::WriteSummary; pub mod chunk; mod metrics; @@ -135,11 +136,8 @@ impl Catalog { /// Get a specific table by name, returning `None` if there is no such table pub fn table(&self, table_name: impl AsRef) -> Result> { let table_name = table_name.as_ref(); - RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err( - |_| Error::TableNotFound { - table: table_name.to_string(), - }, - ) + RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)) + .map_err(|_| TableNotFound { table: table_name }.build()) } /// Get a specific partition by name, returning an error if it can't be found @@ -154,9 +152,9 @@ impl Catalog { self.table(table_name)? .partition(partition_key) .cloned() - .ok_or_else(|| Error::PartitionNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(PartitionNotFound { + partition: partition_key, + table: table_name, }) } @@ -174,9 +172,9 @@ impl Catalog { .read() .chunk(chunk_id) .cloned() - .ok_or_else(|| Error::ChunkNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(ChunkNotFound { + partition: partition_key, + table: table_name, chunk_id, }) } @@ -228,6 +226,23 @@ impl Catalog { .collect() } + /// Returns a list of persistence window summaries for each partition + pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> { + let mut summaries = Vec::new(); + let tables = self.tables.read(); + for table in tables.values() { + for partition in table.partitions() { + let partition = partition.read(); + if let Some(w) = partition.persistence_windows() { + for summary in w.summaries() { + summaries.push((partition.addr().clone(), summary)) + } + } + } + } + summaries + } + pub fn chunk_summaries(&self) -> Vec { let partition_key = None; let table_names = TableNameFilter::AllTables; diff --git a/server/src/db/chunk.rs b/server/src/db/chunk.rs index 997030cb08..e4ba6870cd 100644 --- a/server/src/db/chunk.rs +++ b/server/src/db/chunk.rs @@ -5,7 +5,7 @@ use std::{ use data_types::partition_metadata; use partition_metadata::TableSummary; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; @@ -417,7 +417,7 @@ impl QueryChunk for DbChunk { // column out to get the set of values. let values = values .remove(column_name) - .ok_or_else(|| Error::ReadBufferError { + .with_context(|| ReadBufferError { chunk_id: self.id(), msg: format!( "failed to find column_name {:?} in results of tag_values", diff --git a/server/src/db/system_tables.rs b/server/src/db/system_tables.rs index f80f06b9bc..bcc474e230 100644 --- a/server/src/db/system_tables.rs +++ b/server/src/db/system_tables.rs @@ -7,38 +7,31 @@ //! //! 
For example `SELECT * FROM system.chunks` -use std::convert::AsRef; +use std::any::Any; use std::sync::Arc; -use std::{any::Any, collections::HashMap}; - -use chrono::{DateTime, Utc}; use arrow::{ - array::{ - ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray, - UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, - }, - datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, + datatypes::{Field, Schema, SchemaRef}, error::Result, record_batch::RecordBatch, }; -use data_types::{ - chunk_metadata::{ChunkSummary, DetailedChunkSummary}, - error::ErrorLogger, - job::Job, - partition_metadata::PartitionSummary, -}; +use chrono::{DateTime, Utc}; + use datafusion::{ catalog::schema::SchemaProvider, datasource::{datasource::Statistics, TableProvider}, error::{DataFusionError, Result as DataFusionResult}, physical_plan::{memory::MemoryExec, ExecutionPlan}, }; -use tracker::TaskTracker; + +use crate::JobRegistry; use super::catalog::Catalog; -use crate::JobRegistry; -use data_types::partition_metadata::TableSummary; + +mod chunks; +mod columns; +mod operations; +mod persistence; // The IOx system schema pub const SYSTEM_SCHEMA: &str = "system"; @@ -47,12 +40,14 @@ const CHUNKS: &str = "chunks"; const COLUMNS: &str = "columns"; const CHUNK_COLUMNS: &str = "chunk_columns"; const OPERATIONS: &str = "operations"; +const PERSISTENCE_WINDOWS: &str = "persistence_windows"; pub struct SystemSchemaProvider { chunks: Arc, columns: Arc, chunk_columns: Arc, operations: Arc, + persistence_windows: Arc, } impl std::fmt::Debug for SystemSchemaProvider { @@ -67,22 +62,26 @@ impl SystemSchemaProvider { pub fn new(db_name: impl Into, catalog: Arc, jobs: Arc) -> Self { let db_name = db_name.into(); let chunks = Arc::new(SystemTableProvider { - inner: ChunksTable::new(Arc::clone(&catalog)), + inner: chunks::ChunksTable::new(Arc::clone(&catalog)), }); let columns = Arc::new(SystemTableProvider { - inner: ColumnsTable::new(Arc::clone(&catalog)), + inner: columns::ColumnsTable::new(Arc::clone(&catalog)), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: ChunkColumnsTable::new(catalog), + inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)), }); let operations = Arc::new(SystemTableProvider { - inner: OperationsTable::new(db_name, jobs), + inner: operations::OperationsTable::new(db_name, jobs), + }); + let persistence_windows = Arc::new(SystemTableProvider { + inner: persistence::PersistenceWindowsTable::new(catalog), }); Self { chunks, columns, chunk_columns, operations, + persistence_windows, } } } @@ -98,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS.to_string(), CHUNK_COLUMNS.to_string(), OPERATIONS.to_string(), + PERSISTENCE_WINDOWS.to_string(), ] } @@ -107,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS => Some(Arc::clone(&self.columns)), CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)), OPERATIONS => Some(Arc::clone(&self.operations)), + PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)), _ => None, } } @@ -162,407 +163,6 @@ fn time_to_ts(time: Option>) -> Option { time.map(|ts| ts.timestamp_nanos()) } -/// Implementation of system.chunks table -#[derive(Debug)] -struct ChunksTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunksTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunksTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - 
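// The table implementations removed below are not dropped: they reappear, largely
// unchanged apart from visibility, in the new `chunks`, `columns`, `operations` and
// `persistence` submodules added later in this diff. The newly registered
// `system.persistence_windows` table is queried the same way as the existing system
// tables, e.g. `SELECT * FROM system.persistence_windows`.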
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) - .log_if_error("system.chunks table") - } -} - -fn chunk_summaries_schema() -> SchemaRef { - let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::UInt32, false), - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("lifecycle_action", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, false), - Field::new("object_store_bytes", DataType::UInt64, false), - Field::new("row_count", DataType::UInt64, false), - Field::new("time_of_first_write", ts.clone(), true), - Field::new("time_of_last_write", ts.clone(), true), - Field::new("time_closed", ts, true), - ])) -} - -fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { - let id = chunks.iter().map(|c| Some(c.id)).collect::(); - let partition_key = chunks - .iter() - .map(|c| Some(c.partition_key.as_ref())) - .collect::(); - let table_name = chunks - .iter() - .map(|c| Some(c.table_name.as_ref())) - .collect::(); - let storage = chunks - .iter() - .map(|c| Some(c.storage.as_str())) - .collect::(); - let lifecycle_action = chunks - .iter() - .map(|c| c.lifecycle_action.map(|a| a.name())) - .collect::(); - let memory_bytes = chunks - .iter() - .map(|c| Some(c.memory_bytes as u64)) - .collect::(); - let object_store_bytes = chunks - .iter() - .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0)) - .collect::(); - let row_counts = chunks - .iter() - .map(|c| Some(c.row_count as u64)) - .collect::(); - let time_of_first_write = chunks - .iter() - .map(|c| c.time_of_first_write) - .map(time_to_ts) - .collect::(); - let time_of_last_write = chunks - .iter() - .map(|c| c.time_of_last_write) - .map(time_to_ts) - .collect::(); - let time_closed = chunks - .iter() - .map(|c| c.time_closed) - .map(time_to_ts) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(id), - Arc::new(partition_key), - Arc::new(table_name), - Arc::new(storage), - Arc::new(lifecycle_action), - Arc::new(memory_bytes), - Arc::new(object_store_bytes), - Arc::new(row_counts), - Arc::new(time_of_first_write), - Arc::new(time_of_last_write), - Arc::new(time_closed), - ], - ) -} - -/// Implementation of `system.columns` system table -#[derive(Debug)] -struct ColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: partition_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - fn batch(&self) -> Result { - from_partition_summaries(self.schema(), self.catalog.partition_summaries()) - .log_if_error("system.columns table") - } -} - -fn partition_summaries_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("column_type", DataType::Utf8, false), - Field::new("influxdb_type", DataType::Utf8, true), - ])) -} - -fn from_partition_summaries( - schema: SchemaRef, - partitions: Vec, -) -> Result { - // Assume each partition has roughly 5 tables with 5 columns - let row_estimate = partitions.len() * 25; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = 
StringBuilder::new(row_estimate); - let mut column_type = StringBuilder::new(row_estimate); - let mut influxdb_type = StringBuilder::new(row_estimate); - - // Note no rows are produced for partitions with no tabes, or - // tables with no columns: There are other tables to list tables - // and columns - for partition in partitions { - let table = partition.table; - for column in table.columns { - partition_key.append_value(&partition.key)?; - table_name.append_value(&table.name)?; - column_name.append_value(&column.name)?; - column_type.append_value(column.type_name())?; - if let Some(t) = &column.influxdb_type { - influxdb_type.append_value(t.as_str())?; - } else { - influxdb_type.append_null()?; - } - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(column_type.finish()), - Arc::new(influxdb_type.finish()), - ], - ) -} - -/// Implementation of system.column_chunks table -#[derive(Debug)] -struct ChunkColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunkColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_columns_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunkColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) - .log_if_error("system.column_chunks table") - } -} - -fn chunk_columns_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("chunk_id", DataType::UInt32, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("row_count", DataType::UInt64, true), - Field::new("min_value", DataType::Utf8, true), - Field::new("max_value", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, true), - ])) -} - -fn assemble_chunk_columns( - schema: SchemaRef, - chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, -) -> Result { - /// Builds an index from column_name -> size - fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { - summary - .columns - .iter() - .map(|column_summary| { - ( - column_summary.name.as_ref(), - column_summary.memory_bytes as u64, - ) - }) - .collect() - } - - // Assume each chunk has roughly 5 columns - let row_estimate = chunk_summaries.len() * 5; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut chunk_id = UInt32Builder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = StringBuilder::new(row_estimate); - let mut storage = StringBuilder::new(row_estimate); - let mut row_count = UInt64Builder::new(row_estimate); - let mut min_values = StringBuilder::new(row_estimate); - let mut max_values = StringBuilder::new(row_estimate); - let mut memory_bytes = UInt64Builder::new(row_estimate); - - // Note no rows are produced for partitions with no chunks, or - // tables with no partitions: There are other tables to list tables - // and columns - for (table_summary, chunk_summary) in chunk_summaries { - let mut column_index = make_column_index(&chunk_summary); - let storage_value = chunk_summary.inner.storage.as_str(); - - for column in &table_summary.columns { - partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; - chunk_id.append_value(chunk_summary.inner.id)?; 
- table_name.append_value(&chunk_summary.inner.table_name)?; - column_name.append_value(&column.name)?; - storage.append_value(storage_value)?; - row_count.append_value(column.count())?; - if let Some(v) = column.stats.min_as_str() { - min_values.append_value(v)?; - } else { - min_values.append(false)?; - } - if let Some(v) = column.stats.max_as_str() { - max_values.append_value(v)?; - } else { - max_values.append(false)?; - } - - let size = column_index.remove(column.name.as_str()); - - memory_bytes.append_option(size)?; - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(chunk_id.finish()), - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(storage.finish()), - Arc::new(row_count.finish()), - Arc::new(min_values.finish()), - Arc::new(max_values.finish()), - Arc::new(memory_bytes.finish()), - ], - ) -} - -/// Implementation of system.operations table -#[derive(Debug)] -struct OperationsTable { - schema: SchemaRef, - db_name: String, - jobs: Arc, -} - -impl OperationsTable { - fn new(db_name: String, jobs: Arc) -> Self { - Self { - schema: operations_schema(), - db_name, - jobs, - } - } -} - -impl IoxSystemTable for OperationsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) - .log_if_error("system.operations table") - } -} - -fn operations_schema() -> SchemaRef { - let ts = DataType::Time64(TimeUnit::Nanosecond); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Utf8, false), - Field::new("status", DataType::Utf8, true), - Field::new("cpu_time_used", ts.clone(), true), - Field::new("wall_time_used", ts, true), - Field::new("partition_key", DataType::Utf8, true), - Field::new("chunk_id", DataType::UInt32, true), - Field::new("description", DataType::Utf8, true), - ])) -} - -fn from_task_trackers( - schema: SchemaRef, - db_name: &str, - jobs: Vec>, -) -> Result { - let jobs = jobs - .into_iter() - .filter(|job| job.metadata().db_name() == Some(db_name)) - .collect::>(); - - let ids = jobs - .iter() - .map(|job| Some(job.id().to_string())) - .collect::(); - let statuses = jobs - .iter() - .map(|job| Some(job.get_status().name())) - .collect::(); - let cpu_time_used = jobs - .iter() - .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) - .collect::(); - let wall_time_used = jobs - .iter() - .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) - .collect::(); - let partition_keys = jobs - .iter() - .map(|job| job.metadata().partition_key()) - .collect::(); - let chunk_ids = jobs - .iter() - .map(|job| job.metadata().chunk_id()) - .collect::(); - let descriptions = jobs - .iter() - .map(|job| Some(job.metadata().description())) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(ids) as ArrayRef, - Arc::new(statuses), - Arc::new(cpu_time_used), - Arc::new(wall_time_used), - Arc::new(partition_keys), - Arc::new(chunk_ids), - Arc::new(descriptions), - ], - ) -} - /// Creates a DataFusion ExecutionPlan node that scans a single batch /// of records. 
fn scan_batch( @@ -605,141 +205,10 @@ fn scan_batch( #[cfg(test)] mod tests { - use super::*; + use arrow::array::{ArrayRef, UInt64Array}; use arrow_util::assert_batches_eq; - use chrono::NaiveDateTime; - use data_types::{ - chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage}, - partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary}, - }; - #[test] - fn test_from_chunk_summaries() { - let chunks = vec![ - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 0, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: None, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(10, 0), - Utc, - )), - time_of_last_write: None, - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 1, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: Some(ChunkLifecycleAction::Persisting), - memory_bytes: 23455, - object_store_bytes: 0, - row_count: 22, - time_of_first_write: None, - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(80, 0), - Utc, - )), - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 2, - storage: ChunkStorage::ObjectStoreOnly, - lifecycle_action: None, - memory_bytes: 1234, - object_store_bytes: 5678, - row_count: 33, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(100, 0), - Utc, - )), - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(200, 0), - Utc, - )), - time_closed: None, - }, - ]; - - let expected = vec![ - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", - "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", - "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - ]; - - let schema = chunk_summaries_schema(); - let batch = from_chunk_summaries(schema, chunks).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } - - #[test] - fn test_from_partition_summaries() { - let partitions = vec![ - PartitionSummary { - key: "p1".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Tag), - stats: Statistics::I64(StatValues::new_with_value(23)), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ColumnSummary { - name: "c3".to_string(), - influxdb_type: None, - 
stats: Statistics::String(StatValues::new_with_value( - "foo".to_string(), - )), - }, - ColumnSummary { - name: "time".to_string(), - influxdb_type: Some(InfluxDbType::Timestamp), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ], - }, - }, - PartitionSummary { - key: "p3".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![], - }, - }, - ]; - - let expected = vec![ - "+---------------+------------+-------------+-------------+---------------+", - "| partition_key | table_name | column_name | column_type | influxdb_type |", - "+---------------+------------+-------------+-------------+---------------+", - "| p1 | t1 | c1 | I64 | Tag |", - "| p1 | t1 | c2 | I64 | Field |", - "| p1 | t1 | c3 | String | |", - "| p1 | t1 | time | I64 | Timestamp |", - "+---------------+------------+-------------+-------------+---------------+", - ]; - - let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } + use super::*; fn seq_array(start: u64, end: u64) -> ArrayRef { Arc::new(UInt64Array::from_iter_values(start..end)) @@ -820,130 +289,4 @@ mod tests { err_string ); } - - #[test] - fn test_assemble_chunk_columns() { - let lifecycle_action = None; - - let summaries = vec![ - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::String(StatValues::new( - Some("bar".to_string()), - Some("foo".to_string()), - 55, - )), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), - }, - ], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p1".into(), - table_name: "t1".into(), - id: 42, - storage: ChunkStorage::ReadBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ - ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 11, - }, - ChunkColumnSummary { - name: "c2".into(), - memory_bytes: 12, - }, - ], - }, - ), - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t1".into(), - id: 43, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 100, - }], - }, - ), - ( - Arc::new(TableSummary { - name: "t2".to_string(), - columns: vec![ColumnSummary { - name: "c3".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t2".into(), - id: 44, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c3".into(), - memory_bytes: 200, - 
}], - }, - ), - ]; - - let expected = vec![ - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", - "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", - "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", - "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - ]; - - let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } } diff --git a/server/src/db/system_tables/chunks.rs b/server/src/db/system_tables/chunks.rs new file mode 100644 index 0000000000..90acda0629 --- /dev/null +++ b/server/src/db/system_tables/chunks.rs @@ -0,0 +1,201 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::ChunkSummary; +use data_types::error::ErrorLogger; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::{time_to_ts, IoxSystemTable}; + +/// Implementation of system.chunks table +#[derive(Debug)] +pub(super) struct ChunksTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunksTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunksTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) + .log_if_error("system.chunks table") + } +} + +fn chunk_summaries_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt32, false), + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("lifecycle_action", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, false), + Field::new("object_store_bytes", DataType::UInt64, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), true), + Field::new("time_of_last_write", ts.clone(), true), + Field::new("time_closed", ts, true), + ])) +} + +fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { + let id = chunks.iter().map(|c| Some(c.id)).collect::(); + let partition_key = chunks + .iter() + .map(|c| Some(c.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|c| Some(c.table_name.as_ref())) + .collect::(); + let storage = chunks + .iter() + .map(|c| Some(c.storage.as_str())) + .collect::(); + let lifecycle_action = chunks + .iter() + .map(|c| c.lifecycle_action.map(|a| a.name())) + .collect::(); + let memory_bytes = chunks + .iter() + .map(|c| Some(c.memory_bytes as u64)) + .collect::(); + let object_store_bytes = chunks + .iter() + .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 
0)) + .collect::(); + let row_counts = chunks + .iter() + .map(|c| Some(c.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|c| c.time_of_first_write) + .map(time_to_ts) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|c| c.time_of_last_write) + .map(time_to_ts) + .collect::(); + let time_closed = chunks + .iter() + .map(|c| c.time_closed) + .map(time_to_ts) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(id), + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(storage), + Arc::new(lifecycle_action), + Arc::new(memory_bytes), + Arc::new(object_store_bytes), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(time_closed), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{DateTime, NaiveDateTime, Utc}; + + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage}; + + use super::*; + + #[test] + fn test_from_chunk_summaries() { + let chunks = vec![ + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 0, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: None, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(10, 0), + Utc, + )), + time_of_last_write: None, + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 1, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: Some(ChunkLifecycleAction::Persisting), + memory_bytes: 23455, + object_store_bytes: 0, + row_count: 22, + time_of_first_write: None, + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(80, 0), + Utc, + )), + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 2, + storage: ChunkStorage::ObjectStoreOnly, + lifecycle_action: None, + memory_bytes: 1234, + object_store_bytes: 5678, + row_count: 33, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(100, 0), + Utc, + )), + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(200, 0), + Utc, + )), + time_closed: None, + }, + ]; + + let expected = vec![ + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", + "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", + "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + ]; + + let schema = chunk_summaries_schema(); + let batch = from_chunk_summaries(schema, chunks).unwrap(); + 
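// The blank `object_store_bytes` cells in the expected output above are intentional:
// `from_chunk_summaries` maps a value of 0 to NULL via `.filter(|&v| v > 0)`.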
assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/columns.rs b/server/src/db/system_tables/columns.rs new file mode 100644 index 0000000000..5f0b8f6fdd --- /dev/null +++ b/server/src/db/system_tables/columns.rs @@ -0,0 +1,404 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::DetailedChunkSummary; +use data_types::error::ErrorLogger; +use data_types::partition_metadata::{PartitionSummary, TableSummary}; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of `system.columns` system table +#[derive(Debug)] +pub(super) struct ColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: partition_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + fn batch(&self) -> Result { + from_partition_summaries(self.schema(), self.catalog.partition_summaries()) + .log_if_error("system.columns table") + } +} + +fn partition_summaries_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("column_type", DataType::Utf8, false), + Field::new("influxdb_type", DataType::Utf8, true), + ])) +} + +fn from_partition_summaries( + schema: SchemaRef, + partitions: Vec, +) -> Result { + // Assume each partition has roughly 5 tables with 5 columns + let row_estimate = partitions.len() * 25; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut column_type = StringBuilder::new(row_estimate); + let mut influxdb_type = StringBuilder::new(row_estimate); + + // Note no rows are produced for partitions with no tabes, or + // tables with no columns: There are other tables to list tables + // and columns + for partition in partitions { + let table = partition.table; + for column in table.columns { + partition_key.append_value(&partition.key)?; + table_name.append_value(&table.name)?; + column_name.append_value(&column.name)?; + column_type.append_value(column.type_name())?; + if let Some(t) = &column.influxdb_type { + influxdb_type.append_value(t.as_str())?; + } else { + influxdb_type.append_null()?; + } + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(column_type.finish()), + Arc::new(influxdb_type.finish()), + ], + ) +} + +/// Implementation of system.column_chunks table +#[derive(Debug)] +pub(super) struct ChunkColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunkColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_columns_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunkColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) + .log_if_error("system.column_chunks table") + } +} + +fn 
chunk_columns_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("chunk_id", DataType::UInt32, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, true), + Field::new("min_value", DataType::Utf8, true), + Field::new("max_value", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, true), + ])) +} + +fn assemble_chunk_columns( + schema: SchemaRef, + chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, +) -> Result { + /// Builds an index from column_name -> size + fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { + summary + .columns + .iter() + .map(|column_summary| { + ( + column_summary.name.as_ref(), + column_summary.memory_bytes as u64, + ) + }) + .collect() + } + + // Assume each chunk has roughly 5 columns + let row_estimate = chunk_summaries.len() * 5; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut chunk_id = UInt32Builder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut storage = StringBuilder::new(row_estimate); + let mut row_count = UInt64Builder::new(row_estimate); + let mut min_values = StringBuilder::new(row_estimate); + let mut max_values = StringBuilder::new(row_estimate); + let mut memory_bytes = UInt64Builder::new(row_estimate); + + // Note no rows are produced for partitions with no chunks, or + // tables with no partitions: There are other tables to list tables + // and columns + for (table_summary, chunk_summary) in chunk_summaries { + let mut column_index = make_column_index(&chunk_summary); + let storage_value = chunk_summary.inner.storage.as_str(); + + for column in &table_summary.columns { + partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; + chunk_id.append_value(chunk_summary.inner.id)?; + table_name.append_value(&chunk_summary.inner.table_name)?; + column_name.append_value(&column.name)?; + storage.append_value(storage_value)?; + row_count.append_value(column.count())?; + if let Some(v) = column.stats.min_as_str() { + min_values.append_value(v)?; + } else { + min_values.append(false)?; + } + if let Some(v) = column.stats.max_as_str() { + max_values.append_value(v)?; + } else { + max_values.append(false)?; + } + + let size = column_index.remove(column.name.as_str()); + + memory_bytes.append_option(size)?; + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(chunk_id.finish()), + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(storage.finish()), + Arc::new(row_count.finish()), + Arc::new(min_values.finish()), + Arc::new(max_values.finish()), + Arc::new(memory_bytes.finish()), + ], + ) +} + +#[cfg(test)] +mod tests { + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary}; + use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics}; + + use super::*; + + #[test] + fn test_from_partition_summaries() { + let partitions = vec![ + PartitionSummary { + key: "p1".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Tag), + stats: Statistics::I64(StatValues::new_with_value(23)), + }, + 
ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ColumnSummary { + name: "c3".to_string(), + influxdb_type: None, + stats: Statistics::String(StatValues::new_with_value( + "foo".to_string(), + )), + }, + ColumnSummary { + name: "time".to_string(), + influxdb_type: Some(InfluxDbType::Timestamp), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ], + }, + }, + PartitionSummary { + key: "p3".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![], + }, + }, + ]; + + let expected = vec![ + "+---------------+------------+-------------+-------------+---------------+", + "| partition_key | table_name | column_name | column_type | influxdb_type |", + "+---------------+------------+-------------+-------------+---------------+", + "| p1 | t1 | c1 | I64 | Tag |", + "| p1 | t1 | c2 | I64 | Field |", + "| p1 | t1 | c3 | String | |", + "| p1 | t1 | time | I64 | Timestamp |", + "+---------------+------------+-------------+-------------+---------------+", + ]; + + let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } + + #[test] + fn test_assemble_chunk_columns() { + let lifecycle_action = None; + + let summaries = vec![ + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::String(StatValues::new( + Some("bar".to_string()), + Some("foo".to_string()), + 55, + )), + }, + ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), + }, + ], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p1".into(), + table_name: "t1".into(), + id: 42, + storage: ChunkStorage::ReadBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ + ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 11, + }, + ChunkColumnSummary { + name: "c2".into(), + memory_bytes: 12, + }, + ], + }, + ), + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t1".into(), + id: 43, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 100, + }], + }, + ), + ( + Arc::new(TableSummary { + name: "t2".to_string(), + columns: vec![ColumnSummary { + name: "c3".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t2".into(), + id: 44, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: 
vec![ChunkColumnSummary { + name: "c3".into(), + memory_bytes: 200, + }], + }, + ), + ]; + + let expected = vec![ + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", + "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", + "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", + "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + ]; + + let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/operations.rs b/server/src/db/system_tables/operations.rs new file mode 100644 index 0000000000..d8b2af0ac2 --- /dev/null +++ b/server/src/db/system_tables/operations.rs @@ -0,0 +1,108 @@ +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::job::Job; +use tracker::TaskTracker; + +use crate::db::system_tables::IoxSystemTable; +use crate::JobRegistry; + +/// Implementation of system.operations table +#[derive(Debug)] +pub(super) struct OperationsTable { + schema: SchemaRef, + db_name: String, + jobs: Arc, +} + +impl OperationsTable { + pub(super) fn new(db_name: String, jobs: Arc) -> Self { + Self { + schema: operations_schema(), + db_name, + jobs, + } + } +} + +impl IoxSystemTable for OperationsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) + .log_if_error("system.operations table") + } +} + +fn operations_schema() -> SchemaRef { + let ts = DataType::Time64(TimeUnit::Nanosecond); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("cpu_time_used", ts.clone(), true), + Field::new("wall_time_used", ts, true), + Field::new("partition_key", DataType::Utf8, true), + Field::new("chunk_id", DataType::UInt32, true), + Field::new("description", DataType::Utf8, true), + ])) +} + +fn from_task_trackers( + schema: SchemaRef, + db_name: &str, + jobs: Vec>, +) -> Result { + let jobs = jobs + .into_iter() + .filter(|job| job.metadata().db_name() == Some(db_name)) + .collect::>(); + + let ids = jobs + .iter() + .map(|job| Some(job.id().to_string())) + .collect::(); + let statuses = jobs + .iter() + .map(|job| Some(job.get_status().name())) + .collect::(); + let cpu_time_used = jobs + .iter() + .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) + .collect::(); + let wall_time_used = jobs + .iter() + .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) + .collect::(); + let partition_keys = jobs + .iter() + .map(|job| job.metadata().partition_key()) + .collect::(); + let chunk_ids = jobs + .iter() + .map(|job| job.metadata().chunk_id()) + .collect::(); + let descriptions = jobs + .iter() + .map(|job| 
Some(job.metadata().description())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(statuses), + Arc::new(cpu_time_used), + Arc::new(wall_time_used), + Arc::new(partition_keys), + Arc::new(chunk_ids), + Arc::new(descriptions), + ], + ) +} diff --git a/server/src/db/system_tables/persistence.rs b/server/src/db/system_tables/persistence.rs new file mode 100644 index 0000000000..3392ff5032 --- /dev/null +++ b/server/src/db/system_tables/persistence.rs @@ -0,0 +1,154 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::partition_metadata::PartitionAddr; +use data_types::write_summary::WriteSummary; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of system.persistence_windows table +#[derive(Debug)] +pub(super) struct PersistenceWindowsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl PersistenceWindowsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: persistence_windows_schema(), + catalog, + } + } +} + +impl IoxSystemTable for PersistenceWindowsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_write_summaries(self.schema(), self.catalog.persistence_summaries()) + .log_if_error("system.persistence_windows table") + } +} + +fn persistence_windows_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), false), + Field::new("time_of_last_write", ts.clone(), false), + Field::new("min_timestamp", ts.clone(), false), + Field::new("max_timestamp", ts, false), + ])) +} + +fn from_write_summaries( + schema: SchemaRef, + chunks: Vec<(PartitionAddr, WriteSummary)>, +) -> Result { + let partition_key = chunks + .iter() + .map(|(addr, _)| Some(addr.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|(addr, _)| Some(addr.table_name.as_ref())) + .collect::(); + let row_counts = chunks + .iter() + .map(|(_, w)| Some(w.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos())) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos())) + .collect::(); + let min_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.min_timestamp.timestamp_nanos())) + .collect::(); + let max_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.max_timestamp.timestamp_nanos())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(min_timestamp), + Arc::new(max_timestamp), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{TimeZone, Utc}; + + use arrow_util::assert_batches_eq; + + use super::*; + + #[test] + fn test_from_write_summaries() { + let addr = PartitionAddr { + db_name: Arc::from("db"), + table_name: Arc::from("table"), + partition_key: Arc::from("partition"), + }; + + let summaries = vec![ + ( + 
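// Two write summaries deliberately share the same partition address below:
// `system.persistence_windows` emits one row per summary, so both rows show up in
// the expected output.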
addr.clone(), + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(0), + time_of_last_write: Utc.timestamp_nanos(20), + min_timestamp: Utc.timestamp_nanos(50), + max_timestamp: Utc.timestamp_nanos(60), + row_count: 320, + }, + ), + ( + addr, + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(6), + time_of_last_write: Utc.timestamp_nanos(21), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(2), + row_count: 2, + }, + ), + ]; + + let expected = vec![ + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |", + "| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + ]; + + let schema = persistence_windows_schema(); + let batch = from_write_summaries(schema, summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/init.rs b/server/src/init.rs index c06cbc61b6..2351821f9c 100644 --- a/server/src/init.rs +++ b/server/src/init.rs @@ -2,29 +2,19 @@ use data_types::{ database_rules::{DatabaseRules, WriteBufferConnection}, database_state::DatabaseStateCode, - server_id::ServerId, + error::ErrorLogger, DatabaseName, }; use futures::TryStreamExt; use generated_types::database_rules::decode_database_rules; -use internal_types::once::OnceNonZeroU32; use object_store::{ path::{parsed::DirsAndFileName, ObjectStorePath, Path}, ObjectStore, ObjectStoreApi, }; -use observability_deps::tracing::{debug, error, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{error, info, warn}; use parquet_file::catalog::PreservedCatalog; -use query::exec::Executor; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::{ - collections::HashMap, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; -use tokio::sync::Semaphore; +use snafu::{ResultExt, Snafu}; +use std::sync::Arc; use write_buffer::config::WriteBufferConfig; use crate::{ @@ -45,9 +35,6 @@ pub enum Error { source: generated_types::database_rules::DecodeError, }, - #[snafu(display("id already set"))] - IdAlreadySet { id: ServerId }, - #[snafu(display("unable to use server until id is set"))] IdNotSet, @@ -97,472 +84,254 @@ pub enum Error { pub type Result = std::result::Result; -#[derive(Debug, Default)] -pub struct CurrentServerId(OnceNonZeroU32); +/// Loads the database configurations based on the databases in the +/// object store. Any databases in the config already won't be +/// replaced. 
+/// +/// Returns a Vec containing the results of loading the contained databases +pub(crate) async fn initialize_server( + config: Arc, + wipe_on_error: bool, +) -> Result, Result<()>)>> { + let root = config.root_path(); -impl CurrentServerId { - pub fn set(&self, id: ServerId) -> Result<()> { - let id = id.get(); + // get the database names from the object store prefixes + // TODO: update object store to pull back all common prefixes by + // following the next tokens. + let list_result = config + .object_store() + .list_with_delimiter(&root) + .await + .context(StoreError)?; - match self.0.set(id) { - Ok(()) => { - info!(server_id = id, "server ID set"); - Ok(()) - } - Err(id) => Err(Error::IdAlreadySet { - id: ServerId::new(id), - }), - } - } + let handles: Vec<_> = list_result + .common_prefixes + .into_iter() + .filter_map(|mut path| { + let config = Arc::clone(&config); + let root = root.clone(); + path.set_file_name(DB_RULES_FILE_NAME); + let db_name = db_name_from_rules_path(&path) + .log_if_error("invalid database path") + .ok()?; - pub fn get(&self) -> Result { - self.0.get().map(ServerId::new).context(IdNotSet) - } -} - -#[derive(Debug)] -pub struct InitStatus { - pub server_id: CurrentServerId, - - /// Flags that databases are loaded and server is ready to read/write data. - initialized: AtomicBool, - - /// Semaphore that limits the number of jobs that load DBs when the serverID is set. - /// - /// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex, - /// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We - /// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical - /// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore - /// cannot be configured. - initialize_semaphore: Semaphore, - - /// Error occurred during generic server init (e.g. listing store content). - error_generic: Mutex>>, - - /// Errors that occurred during some DB init. - errors_databases: Arc>>>, - - /// Automatic wipe-on-error recovery - /// - /// See - pub(crate) wipe_on_error: AtomicBool, -} - -impl InitStatus { - /// Create new "not initialized" status. - pub fn new() -> Self { - Self { - server_id: Default::default(), - initialized: AtomicBool::new(false), - // Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`. - initialize_semaphore: Semaphore::new(1), - error_generic: Default::default(), - errors_databases: Default::default(), - wipe_on_error: AtomicBool::new(true), - } - } - - /// Base location in object store for this writer. - pub fn root_path(&self, store: &ObjectStore) -> Result { - let id = self.server_id.get()?; - - let mut path = store.new_path(); - path.push_dir(format!("{}", id)); - Ok(path) - } - - /// Check if server is loaded. Databases are loaded and server is ready to read/write. - pub fn initialized(&self) -> bool { - // Need `Acquire` ordering because IF we a `true` here, this thread will likely also read data that - // `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` flag here ensures that - // every data acccess AFTER the following line will also stay AFTER this line. - self.initialized.load(Ordering::Acquire) - } - - /// Error occurred during generic server init (e.g. listing store content). 
- pub fn error_generic(&self) -> Option> { - let guard = self.error_generic.lock(); - guard.clone() - } - - /// List all databases with errors in sorted order. - pub fn databases_with_errors(&self) -> Vec { - let guard = self.errors_databases.lock(); - let mut names: Vec<_> = guard.keys().cloned().collect(); - names.sort(); - names - } - - /// Error that occurred during initialization of a specific database. - pub fn error_database(&self, db_name: &str) -> Option> { - let guard = self.errors_databases.lock(); - guard.get(db_name).cloned() - } - - /// Loads the database configurations based on the databases in the - /// object store. Any databases in the config already won't be - /// replaced. - /// - /// This requires the serverID to be set (will be a no-op otherwise). - /// - /// It will be a no-op if the configs are already loaded and the server is ready. - pub(crate) async fn maybe_initialize_server( - &self, - store: Arc, - config: Arc, - exec: Arc, - ) { - let server_id = match self.server_id.get() { - Ok(id) => id, - Err(e) => { - debug!(%e, "cannot initialize server because cannot get serverID"); - return; - } - }; - - let _guard = self - .initialize_semaphore - .acquire() - .await - .expect("semaphore should not be closed"); - - // Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread - // that enters this semaphore after we've left actually sees the correct `is_ready` flag. - if self.initialized.load(Ordering::Acquire) { - // already loaded, so do nothing - return; - } - - // Check if there was a previous failed attempt - if self.error_generic().is_some() { - return; - } - - match self - .maybe_initialize_server_inner(store, config, exec, server_id) - .await - { - Ok(_) => { - // mark as ready (use correct ordering for Acquire-Release) - self.initialized.store(true, Ordering::Release); - info!("loaded databases, server is initalized"); - } - Err(e) => { - error!(%e, "error during server init"); - let mut guard = self.error_generic.lock(); - *guard = Some(Arc::new(e)); - } - } - } - - async fn maybe_initialize_server_inner( - &self, - store: Arc, - config: Arc, - exec: Arc, - server_id: ServerId, - ) -> Result<()> { - let root = self.root_path(&store)?; - - // get the database names from the object store prefixes - // TODO: update object store to pull back all common prefixes by - // following the next tokens. 
- let list_result = store.list_with_delimiter(&root).await.context(StoreError)?; - - let handles: Vec<_> = list_result - .common_prefixes - .into_iter() - .filter_map(|mut path| { - let store = Arc::clone(&store); - let config = Arc::clone(&config); - let exec = Arc::clone(&exec); - let errors_databases = Arc::clone(&self.errors_databases); - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - let root = root.clone(); - - path.set_file_name(DB_RULES_FILE_NAME); - - match db_name_from_rules_path(&path) { - Ok(db_name) => { - let handle = tokio::task::spawn(async move { - match Self::initialize_database( - server_id, - store, - config, - exec, - root, - db_name.clone(), - wipe_on_error, - ) - .await - { - Ok(()) => { - info!(%db_name, "database initialized"); - } - Err(e) => { - error!(%e, %db_name, "cannot load database"); - let mut guard = errors_databases.lock(); - guard.insert(db_name.to_string(), Arc::new(e)); - } - } - }); - Some(handle) - } - Err(e) => { - error!(%e, "invalid database path"); - None - } - } + Some(async move { + let result = + initialize_database(config, root, db_name.clone(), wipe_on_error).await; + (db_name, result) }) - .collect(); + }) + .collect(); - futures::future::join_all(handles).await; + Ok(futures::future::join_all(handles).await) +} +async fn initialize_database( + config: Arc, + root: Path, + db_name: DatabaseName<'static>, + wipe_on_error: bool, +) -> Result<()> { + // Reserve name before expensive IO (e.g. loading the preserved catalog) + let mut handle = config + .create_db(db_name) + .map_err(Box::new) + .context(InitDbError)?; + + match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await + { + Ok(true) => { + // finished init and keep DB + handle.commit(); + Ok(()) + } + Ok(false) => { + // finished but do not keep DB + handle.abort(); + Ok(()) + } + Err(e) => { + // encountered some error, still commit intermediate result + handle.commit(); + Err(e) + } + } +} + +async fn load_database_rules(store: Arc, path: Path) -> Result> { + let serialized_rules = loop { + match get_database_config_bytes(&path, &store).await { + Ok(data) => break data, + Err(e) => { + if let Error::NoDatabaseConfigError { location } = &e { + warn!(?location, "{}", e); + return Ok(None); + } + error!( + "error getting database config {:?} from object store: {}", + path, e + ); + tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) + .await; + } + } + }; + let rules = decode_database_rules(serialized_rules.freeze()) + .context(ErrorDeserializingRulesProtobuf)?; + + Ok(Some(rules)) +} + +pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( + config: Arc, + db_name: &DatabaseName<'static>, +) -> Result<()> { + let store = config.object_store(); + + if config.has_uninitialized_database(db_name) { + let mut handle = config + .recover_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + if !((handle.state_code() == DatabaseStateCode::Known) + || (handle.state_code() == DatabaseStateCode::RulesLoaded)) + { + // cannot wipe because init state is already too far + return Err(Error::DbPartiallyInitialized { + db_name: db_name.to_string(), + }); + } + + // wipe while holding handle so no other init/wipe process can interact with the catalog + PreservedCatalog::wipe(&store, handle.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + let root = config.root_path(); + + let result = + 
try_advance_database_init_process_until_complete(&mut handle, &root, true).await; + + // Commit changes even if failed + handle.commit(); + result.map(|_| ()) + } else { + let handle = config + .block_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + PreservedCatalog::wipe(&store, config.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + drop(handle); + + info!(%db_name, "wiped preserved catalog of non-registered database"); Ok(()) } +} - async fn initialize_database( - server_id: ServerId, - store: Arc, - config: Arc, - exec: Arc, - root: Path, - db_name: DatabaseName<'static>, - wipe_on_error: bool, - ) -> Result<()> { - // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut handle = config - .create_db(store, exec, server_id, db_name) - .map_err(Box::new) - .context(InitDbError)?; - - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, - wipe_on_error, - ) - .await - { - Ok(true) => { - // finished init and keep DB - handle.commit(); - Ok(()) +/// Try to make as much progress as possible with DB init. +/// +/// Returns an error if there was an error along the way (in which case the handle should still be commit to safe +/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten +/// (e.g. because not rules file is present.) +async fn try_advance_database_init_process_until_complete( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + loop { + match try_advance_database_init_process(handle, root, wipe_on_error).await? { + InitProgress::Unfinished => {} + InitProgress::Done => { + return Ok(true); } - Ok(false) => { - // finished but do not keep DB - handle.abort(); - Ok(()) - } - Err(e) => { - // encountered some error, still commit intermediate result - handle.commit(); - Err(e) + InitProgress::Forget => { + return Ok(false); } } } +} - async fn load_database_rules( - store: Arc, - path: Path, - ) -> Result> { - let serialized_rules = loop { - match get_database_config_bytes(&path, &store).await { - Ok(data) => break data, - Err(e) => { - if let Error::NoDatabaseConfigError { location } = &e { - warn!(?location, "{}", e); - return Ok(None); - } - error!( - "error getting database config {:?} from object store: {}", - path, e - ); - tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) - .await; +/// Try to make some progress in the DB init. +async fn try_advance_database_init_process( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + match handle.state_code() { + DatabaseStateCode::Known => { + // known => load DB rules + let path = object_store_path_for_database_config(root, &handle.db_name()); + match load_database_rules(handle.object_store(), path).await? 
{ + Some(rules) => { + handle + .advance_rules_loaded(rules) + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) + } + None => { + // no rules file present, advice to forget his DB + Ok(InitProgress::Forget) } } - }; - let rules = decode_database_rules(serialized_rules.freeze()) - .context(ErrorDeserializingRulesProtobuf)?; - - Ok(Some(rules)) - } - - pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( - &self, - store: Arc, - config: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result<()> { - if config.has_uninitialized_database(&db_name) { - let mut handle = config - .recover_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - if !((handle.state_code() == DatabaseStateCode::Known) - || (handle.state_code() == DatabaseStateCode::RulesLoaded)) - { - // cannot wipe because init state is already too far - return Err(Error::DbPartiallyInitialized { - db_name: db_name.to_string(), - }); - } - - // wipe while holding handle so no other init/wipe process can interact with the catalog - PreservedCatalog::wipe(&store, handle.server_id(), &db_name) - .await - .map_err(Box::new) - .context(PreservedCatalogWipeError)?; - - let root = self.root_path(&store)?; - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, + } + DatabaseStateCode::RulesLoaded => { + // rules already loaded => continue with loading preserved catalog + let (preserved_catalog, catalog) = load_or_create_preserved_catalog( + &handle.db_name(), + handle.object_store(), + handle.server_id(), + handle.metrics_registry(), wipe_on_error, ) .await - { - Ok(_) => { - // yeah, recovered DB - handle.commit(); + .map_err(|e| Box::new(e) as _) + .context(CatalogLoadError)?; - let mut guard = self.errors_databases.lock(); - guard.remove(&db_name.to_string()); - - info!(%db_name, "wiped preserved catalog of registered database and recovered"); - Ok(()) - } - Err(e) => { - // could not recover, but still keep new result - handle.commit(); - - let mut guard = self.errors_databases.lock(); - let e = Arc::new(e); - guard.insert(db_name.to_string(), Arc::clone(&e)); - - warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); - Err(Error::RecoverDbError { source: e }) - } - } - } else { - let handle = config - .block_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - PreservedCatalog::wipe(&store, server_id, &db_name) + let rules = handle + .rules() + .expect("in this state rules should be loaded"); + let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) .await + .context(CreateWriteBuffer { + config: rules.write_buffer_connection.clone(), + })?; + info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); + + handle + .advance_replay(preserved_catalog, catalog, write_buffer) .map_err(Box::new) - .context(PreservedCatalogWipeError)?; + .context(InitDbError)?; - drop(handle); - - info!(%db_name, "wiped preserved catalog of non-registered database"); - Ok(()) + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } + DatabaseStateCode::Replay => { + let db = handle + .db_any_state() + .expect("DB should be available in this state"); + db.perform_replay().await; - /// Try to make as much progress as possible with DB init. 
- /// - /// Returns an error if there was an error along the way (in which case the handle should still be commit to safe - /// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten - /// (e.g. because not rules file is present.) - async fn try_advance_database_init_process_until_complete( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - loop { - match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? { - InitProgress::Unfinished => {} - InitProgress::Done => { - return Ok(true); - } - InitProgress::Forget => { - return Ok(false); - } - } + handle + .advance_init() + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } - - /// Try to make some progress in the DB init. - async fn try_advance_database_init_process( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - match handle.state_code() { - DatabaseStateCode::Known => { - // known => load DB rules - let path = object_store_path_for_database_config(root, &handle.db_name()); - match Self::load_database_rules(handle.object_store(), path).await? { - Some(rules) => { - handle - .advance_rules_loaded(rules) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - None => { - // no rules file present, advice to forget his DB - Ok(InitProgress::Forget) - } - } - } - DatabaseStateCode::RulesLoaded => { - // rules already loaded => continue with loading preserved catalog - let (preserved_catalog, catalog) = load_or_create_preserved_catalog( - &handle.db_name(), - handle.object_store(), - handle.server_id(), - handle.metrics_registry(), - wipe_on_error, - ) - .await - .map_err(|e| Box::new(e) as _) - .context(CatalogLoadError)?; - - let rules = handle - .rules() - .expect("in this state rules should be loaded"); - let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) - .await - .context(CreateWriteBuffer { - config: rules.write_buffer_connection.clone(), - })?; - info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); - - handle - .advance_replay(preserved_catalog, catalog, write_buffer) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Replay => { - let db = handle - .db_any_state() - .expect("DB should be available in this state"); - db.perform_replay().await; - - handle - .advance_init() - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Initialized => { - // database fully initialized => nothing to do - Ok(InitProgress::Done) - } + DatabaseStateCode::Initialized => { + // database fully initialized => nothing to do + Ok(InitProgress::Done) } } } diff --git a/server/src/lib.rs b/server/src/lib.rs index 2c4d666888..e2dc829e31 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -74,9 +74,8 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::BytesMut; use db::load::create_preserved_catalog; -use init::InitStatus; -use observability_deps::tracing::{debug, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{debug, error, info, warn}; +use parking_lot::{Mutex, RwLockUpgradableReadGuard}; use snafu::{OptionExt, ResultExt, Snafu}; use 
data_types::{ @@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb; use influxdb_line_protocol::ParsedLine; use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry}; use object_store::{ObjectStore, ObjectStoreApi}; +use parking_lot::RwLock; use query::{exec::Executor, DatabaseStore}; use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt}; use write_buffer::config::WriteBufferConfig; @@ -220,11 +220,11 @@ pub enum Error { #[snafu(display("cannot create preserved catalog: {}", source))] CannotCreatePreservedCatalog { source: DatabaseError }, - #[snafu(display("cannot set id: {}", source))] - SetIdError { source: crate::init::Error }, + #[snafu(display("id already set"))] + IdAlreadySet, - #[snafu(display("cannot get id: {}", source))] - GetIdError { source: crate::init::Error }, + #[snafu(display("id not set"))] + IdNotSet, #[snafu(display( "cannot create write buffer with config: {:?}, error: {}", @@ -297,6 +297,8 @@ pub struct ServerConfig { metric_registry: Arc, remote_template: Option, + + wipe_catalog_on_error: bool, } impl ServerConfig { @@ -311,6 +313,7 @@ impl ServerConfig { object_store, metric_registry, remote_template, + wipe_catalog_on_error: true, } } @@ -414,7 +417,6 @@ impl ServerMetrics { /// of these structs, which keeps track of all replication and query rules. #[derive(Debug)] pub struct Server { - config: Arc, connection_manager: Arc, pub store: Arc, exec: Arc, @@ -426,7 +428,50 @@ pub struct Server { /// and populates the endpoint with this data. pub registry: Arc, - init_status: Arc, + /// The state machine for server startup + stage: Arc>, +} + +/// The stage of the server in the startup process +/// +/// The progression is linear Startup -> InitReady -> Initializing -> Initialized +/// with the sole exception that on failure Initializing -> InitReady +/// +/// Errors encountered on server init will be retried, however, errors encountered +/// during database init will require operator intervention +/// +/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively +/// +/// They do not impact the state machine's progression, but instead are exposed to the +/// gRPC management API to allow an operator to assess the state of the system +#[derive(Debug)] +enum ServerStage { + /// Server has started but doesn't have a server id yet + Startup { + remote_template: Option, + wipe_catalog_on_error: bool, + }, + + /// Server can be initialized + InitReady { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has a server id, has started loading + Initializing { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has finish initializing, possibly with errors + Initialized { + config: Arc, + /// Errors that occurred during some DB init. + database_errors: HashMap>, + }, } #[derive(Debug)] @@ -454,22 +499,23 @@ where // to test the metrics provide a different registry to the `ServerConfig`. 
metric_registry, remote_template, + wipe_catalog_on_error, } = config; + let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get); + let exec = Arc::new(Executor::new(num_worker_threads)); Self { - config: Arc::new(Config::new( - Arc::clone(&jobs), - Arc::clone(&metric_registry), - remote_template, - )), store: object_store, connection_manager: Arc::new(connection_manager), - exec: Arc::new(Executor::new(num_worker_threads)), + exec, jobs, metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))), registry: Arc::clone(&metric_registry), - init_status: Arc::new(InitStatus::new()), + stage: Arc::new(RwLock::new(ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + })), } } @@ -478,68 +524,112 @@ where /// /// A valid server ID Must be non-zero. pub fn set_id(&self, id: ServerId) -> Result<()> { - self.init_status.server_id.set(id).context(SetIdError) - } + let mut stage = self.stage.write(); + match &mut *stage { + ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + } => { + let remote_template = remote_template.take(); - /// Returns the current server ID, or an error if not yet set. - pub fn require_id(&self) -> Result { - self.init_status.server_id.get().context(GetIdError) + *stage = ServerStage::InitReady { + wipe_catalog_on_error: *wipe_catalog_on_error, + config: Arc::new(Config::new( + Arc::clone(&self.jobs), + Arc::clone(&self.store), + Arc::clone(&self.exec), + id, + Arc::clone(&self.registry), + remote_template, + )), + last_error: None, + }; + Ok(()) + } + _ => Err(Error::IdAlreadySet), + } } /// Check if server is loaded. Databases are loaded and server is ready to read/write. pub fn initialized(&self) -> bool { - self.init_status.initialized() + matches!(&*self.stage.read(), ServerStage::Initialized { .. }) + } + + /// Require that server is loaded. Databases are loaded and server is ready to read/write. + fn require_initialized(&self) -> Result> { + match &*self.stage.read() { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => { + Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the config for this server if server id has been set + fn config(&self) -> Result> { + let stage = self.stage.read(); + match &*stage { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } + | ServerStage::Initializing { config, .. } + | ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the server id for this server if set + pub fn server_id(&self) -> Option { + self.config().map(|x| x.server_id()).ok() } /// Error occurred during generic server init (e.g. listing store content). pub fn error_generic(&self) -> Option> { - self.init_status.error_generic() + let stage = self.stage.read(); + match &*stage { + ServerStage::InitReady { last_error, .. } => last_error.clone(), + ServerStage::Initializing { last_error, .. } => last_error.clone(), + _ => None, + } } /// List all databases with errors in sorted order. pub fn databases_with_errors(&self) -> Vec { - self.init_status.databases_with_errors() + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.keys().cloned().collect(), + _ => Default::default(), + } } /// Error that occurred during initialization of a specific database. 
pub fn error_database(&self, db_name: &str) -> Option> { - self.init_status.error_database(db_name) + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.get(db_name).cloned(), + _ => None, + } } /// Current database init state. pub fn database_state(&self, name: &str) -> Option { - if let Ok(name) = DatabaseName::new(name) { - self.config.db_state(&name) - } else { - None - } - } - - /// Require that server is loaded. Databases are loaded and server is ready to read/write. - fn require_initialized(&self) -> Result { - // since a server ID is the pre-requirement for init, check this first - let server_id = self.require_id()?; - - // ordering here isn't that important since this method is not used to check-and-modify the flag - if self.initialized() { - Ok(server_id) - } else { - Err(Error::ServerNotInitialized { server_id }) - } + let db_name = DatabaseName::new(name).ok()?; + let config = self.config().ok()?; + config.db_state(&db_name) } /// Tells the server the set of rules for a database. pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> { // Return an error if this server is not yet ready - let server_id = self.require_initialized()?; + let config = self.require_initialized()?; // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut db_reservation = self.config.create_db( - Arc::clone(&self.store), - Arc::clone(&self.exec), - server_id, - rules.name.clone(), - )?; + let mut db_reservation = config.create_db(rules.name.clone())?; // register rules db_reservation.advance_rules_loaded(rules.clone())?; @@ -548,14 +638,14 @@ where let (preserved_catalog, catalog) = create_preserved_catalog( rules.db_name(), Arc::clone(&self.store), - server_id, - self.config.metrics_registry(), + config.server_id(), + config.metrics_registry(), ) .await .map_err(|e| Box::new(e) as _) .context(CannotCreatePreservedCatalog)?; - let write_buffer = WriteBufferConfig::new(server_id, &rules) + let write_buffer = WriteBufferConfig::new(config.server_id(), &rules) .await .map_err(|e| Error::CreatingWriteBuffer { config: rules.write_buffer_connection.clone(), @@ -575,13 +665,8 @@ where } pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> { - let location = object_store_path_for_database_config( - &self - .init_status - .root_path(&self.store) - .context(GetIdError)?, - &rules.name, - ); + let config = self.config()?; + let location = object_store_path_for_database_config(&config.root_path(), &rules.name); let mut data = BytesMut::new(); encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?; @@ -604,15 +689,62 @@ where /// object store. Any databases in the config already won't be /// replaced. /// - /// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready. + /// This requires the serverID to be set. + /// + /// It will be a no-op if the configs are already loaded and the server is ready. 
pub async fn maybe_initialize_server(&self) { - self.init_status - .maybe_initialize_server( - Arc::clone(&self.store), - Arc::clone(&self.config), - Arc::clone(&self.exec), - ) - .await; + // Explicit scope to help async generator + let (wipe_catalog_on_error, config) = { + let state = self.stage.upgradable_read(); + match &*state { + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error, + } => { + let config = Arc::clone(config); + let last_error = last_error.clone(); + let wipe_catalog_on_error = *wipe_catalog_on_error; + + // Mark the server as initializing and drop lock + + let mut state = RwLockUpgradableReadGuard::upgrade(state); + *state = ServerStage::Initializing { + config: Arc::clone(&config), + wipe_catalog_on_error, + last_error, + }; + (wipe_catalog_on_error, config) + } + _ => return, + } + }; + + let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await; + let new_stage = match init_result { + // Success -> move to next stage + Ok(results) => { + info!(server_id=%config.server_id(), "server initialized"); + ServerStage::Initialized { + config, + database_errors: results + .into_iter() + .filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?)))) + .collect(), + } + } + // Error -> return to InitReady + Err(err) => { + error!(%err, "error during server init"); + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error: Some(Arc::new(err)), + } + } + }; + + *self.stage.write() = new_stage; } pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> { @@ -640,11 +772,10 @@ where default_time: i64, ) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -744,9 +875,12 @@ where node_group: &[ServerId], entry: Entry, ) -> Result<()> { + // Return an error if this server is not yet ready + let config = self.config()?; + let addrs: Vec<_> = node_group .iter() - .filter_map(|&node| self.config.resolve_remote(node)) + .filter_map(|&node| config.resolve_remote(node)) .collect(); if addrs.is_empty() { return NoRemoteConfigured { node_group }.fail(); @@ -775,11 +909,10 @@ where pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -825,11 +958,11 @@ where } pub fn db(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name) + self.config().ok()?.db_initialized(name) } pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name).map(|d| d.rules()) + self.db(name).map(|d| d.rules()) } // Update database rules and save on success. 
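The InitReady -> Initializing hand-off in `maybe_initialize_server` above relies on `parking_lot`'s upgradable read locks: the stage is inspected under an upgradable read and only upgraded to a write lock when a transition is actually needed, so the check and the state change cannot race with a second caller. A minimal, self-contained sketch of that pattern (a toy `Stage` stands in for the real `ServerStage`; only the `parking_lot` API is assumed):

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

enum Stage {
    InitReady,
    Initializing,
    Initialized,
}

/// Perform the InitReady -> Initializing transition exactly once.
fn try_begin_init(stage: &RwLock<Stage>) -> bool {
    // An upgradable read admits concurrent plain readers but excludes other
    // upgradable readers and writers, so the match below cannot race with
    // another caller attempting the same transition.
    let guard = stage.upgradable_read();
    match &*guard {
        Stage::InitReady => {
            // Upgrade to a write lock only now that a mutation is needed.
            let mut guard = RwLockUpgradableReadGuard::upgrade(guard);
            *guard = Stage::Initializing;
            true
        }
        // Already initializing or initialized: back off without writing.
        _ => false,
    }
}

fn main() {
    let stage = RwLock::new(Stage::InitReady);
    assert!(try_begin_init(&stage)); // first caller wins the transition
    assert!(!try_begin_init(&stage)); // later callers observe Initializing and return

    // Once init work completes, the real code overwrites the stage wholesale.
    *stage.write() = Stage::Initialized;
    assert!(!try_begin_init(&stage));
}

Plain readers (such as `initialized()` taking `stage.read()`) are not blocked by the check itself, only by the short write section that performs the transition.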
@@ -841,8 +974,8 @@ where where F: FnOnce(DatabaseRules) -> Result + Send, { - let rules = self - .config + let config = self.config()?; + let rules = config .update_db_rules(db_name, update) .map_err(|e| match e { crate::config::UpdateError::Closure(e) => UpdateError::Closure(e), @@ -854,16 +987,23 @@ where Ok(rules) } - pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> { - self.config.remotes_sorted() + pub fn remotes_sorted(&self) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.remotes_sorted()) } - pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) { - self.config.update_remote(id, addr) + pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + config.update_remote(id, addr); + Ok(()) } - pub fn delete_remote(&self, id: ServerId) -> Option { - self.config.delete_remote(id) + pub fn delete_remote(&self, id: ServerId) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.delete_remote(id)) } pub fn spawn_dummy_job(&self, nanos: Vec) -> TaskTracker { @@ -893,14 +1033,15 @@ where partition_key: impl Into, chunk_id: u32, ) -> Result> { + let config = self.require_initialized()?; + let db_name = db_name.to_string(); let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?; let partition_key = partition_key.into(); let table_name = table_name.into(); - let db = self - .config + let db = config .db_initialized(&name) .context(DatabaseNotFound { db_name: &db_name })?; @@ -921,25 +1062,62 @@ where /// DB jobs and this command. pub fn wipe_preserved_catalog( &self, - db_name: DatabaseName<'static>, + db_name: &DatabaseName<'static>, ) -> Result> { - if self.config.db_initialized(&db_name).is_some() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + // Can only wipe catalog of database that failed to initialize + let config = match &*self.stage.read() { + ServerStage::Initialized { + config, + database_errors, + } => { + if config.db_initialized(db_name).is_some() { + return Err(Error::DatabaseAlreadyExists { + db_name: db_name.to_string(), + }); + } + + if !database_errors.contains_key(db_name.as_str()) { + // TODO: Should this be an error? Some end-to-end tests assume it is non-fatal + warn!(%db_name, "wiping database not present at startup"); + } + Arc::clone(config) + } + ServerStage::Startup { .. } => return Err(Error::IdNotSet), + ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => { + return Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + }; let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog { db_name: db_name.to_string(), }); - let object_store = Arc::clone(&self.store); - let config = Arc::clone(&self.config); - let server_id = self.require_id()?; - let init_status = Arc::clone(&self.init_status); + + let state = Arc::clone(&self.stage); + let db_name = db_name.clone(); + let task = async move { - init_status - .wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name) - .await + let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await; + + match &mut *state.write() { + ServerStage::Initialized { + database_errors, .. 
+ } => match result { + Ok(_) => { + info!(%db_name, "wiped preserved catalog of registered database and recovered"); + database_errors.remove(db_name.as_str()); + Ok(()) + } + Err(e) => { + warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); + let e = Arc::new(e); + database_errors.insert(db_name.to_string(), Arc::clone(&e)); + Err(e) + } + }, + _ => unreachable!("server cannot become uninitialized"), + } }; tokio::spawn(task.track(registration)); @@ -973,7 +1151,9 @@ where } info!("shutting down background workers"); - self.config.drain().await; + if let Ok(config) = self.config() { + config.drain().await; + } info!("draining tracker registry"); @@ -999,11 +1179,15 @@ where type Error = Error; fn db_names_sorted(&self) -> Vec { - self.config - .db_names_sorted() - .iter() - .map(|i| i.clone().into()) - .collect() + self.config() + .map(|config| { + config + .db_names_sorted() + .iter() + .map(ToString::to_string) + .collect() + }) + .unwrap_or_default() } fn db(&self, name: &str) -> Option> { @@ -1214,25 +1398,15 @@ mod tests { let manager = TestConnectionManager::new(); let server = Server::new(manager, config()); - let resp = server.require_id().unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + let resp = server.config().unwrap_err(); + assert!(matches!(resp, Error::IdNotSet)); let lines = parsed_lines("cpu foo=1 10"); let resp = server .write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME) .await .unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(resp, Error::IdNotSet)); } #[tokio::test] @@ -1559,7 +1733,7 @@ mod tests { let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2]; let db = server.db(&db_name).unwrap(); - db.update_db_rules(|mut rules| { + db.update_rules(|mut rules| { let shard_config = ShardConfig { hash_ring: Some(HashRing { shards: vec![TEST_SHARD_ID].into(), @@ -1589,7 +1763,9 @@ mod tests { ); // one remote is configured but it's down and we'll get connection error - server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()); + server + .update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()) + .unwrap(); let err = server .write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME) .await @@ -1606,8 +1782,12 @@ mod tests { // We configure the address for the other remote, this time connection will succeed // despite the bad remote failing to connect. - server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()); - server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()); + server + .update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()) + .unwrap(); + server + .update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()) + .unwrap(); // Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable // probability both the remotes will get hit. 
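With `Config` only coming into existence once a server id is set, the remote-management calls exercised in the test above (`update_remote`, `delete_remote`, `remotes_sorted`) become fallible and surface `Error::IdNotSet` instead of silently acting on an empty configuration; the gRPC layer later maps that to a precondition violation. A rough, self-contained sketch of this config-gated accessor shape, using toy types rather than the real `Server`/`Config`:

use std::{collections::BTreeMap, sync::Arc};
use parking_lot::RwLock;

#[derive(Debug)]
enum Error {
    IdNotSet,
}

#[derive(Default)]
struct Config {
    // remote server id -> gRPC connection string
    remotes: RwLock<BTreeMap<u32, String>>,
}

/// Toy stand-in for `Server`: the config appears only after an id is set,
/// so everything that needs it returns a Result.
#[derive(Default)]
struct Server {
    config: RwLock<Option<Arc<Config>>>,
}

impl Server {
    fn set_id(&self, _id: u32) {
        *self.config.write() = Some(Arc::new(Config::default()));
    }

    fn config(&self) -> Result<Arc<Config>, Error> {
        let guard = self.config.read();
        guard.as_ref().map(Arc::clone).ok_or(Error::IdNotSet)
    }

    fn update_remote(&self, id: u32, addr: String) -> Result<(), Error> {
        let config = self.config()?;
        config.remotes.write().insert(id, addr);
        Ok(())
    }

    fn remotes_sorted(&self) -> Result<Vec<(u32, String)>, Error> {
        let config = self.config()?;
        let map = config.remotes.read();
        Ok(map.iter().map(|(id, addr)| (*id, addr.clone())).collect())
    }
}

fn main() {
    let server = Server::default();
    // Before set_id there is no config, so remote management is rejected.
    assert!(matches!(
        server.update_remote(2, "10.0.0.2:8082".into()),
        Err(Error::IdNotSet)
    ));

    server.set_id(1);
    server.update_remote(2, "10.0.0.2:8082".into()).unwrap();
    assert_eq!(server.remotes_sorted().unwrap().len(), 1);
}

This is also why the end-to-end remote tests below now call `update_server_id` / `server set` before listing or updating remotes.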
@@ -1796,7 +1976,7 @@ mod tests { let db_name = DatabaseName::new("foo").unwrap(); let db = server.db(&db_name).unwrap(); let rules = db - .update_db_rules(|mut rules| { + .update_rules(|mut rules| { rules.lifecycle_rules.buffer_size_hard = Some(std::num::NonZeroUsize::new(10).unwrap()); Ok::<_, Infallible>(rules) @@ -1844,12 +2024,7 @@ mod tests { let err = create_simple_database(&server, "bananas") .await .unwrap_err(); - assert!(matches!( - err, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(err, Error::IdNotSet)); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); // do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready @@ -1873,7 +2048,7 @@ mod tests { let t_0 = Instant::now(); loop { - if server.require_initialized().is_ok() { + if server.config().is_ok() { break; } assert!(t_0.elapsed() < Duration::from_secs(10)); @@ -1916,9 +2091,12 @@ mod tests { create_simple_database(&server, "foo") .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap()); @@ -2003,18 +2181,24 @@ mod tests { let server = Server::new(manager, config); server.set_id(server_id).unwrap(); server.maybe_initialize_server().await; + create_simple_database(&server, db_name_existing.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_rules_broken.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_catalog_broken.clone()) .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store to break one database let path = object_store_path_for_database_config(&root, &db_name_rules_broken); @@ -2045,22 +2229,18 @@ mod tests { let store = Arc::try_unwrap(store).unwrap(); store.get(&path).await.unwrap(); let manager = TestConnectionManager::new(); - let config = config_with_store(store); - let server = Server::new(manager, config); - // need to disable auto-wipe for this test - server - .init_status - .wipe_on_error - .store(false, std::sync::atomic::Ordering::Relaxed); + let mut config = config_with_store(store); + config.wipe_catalog_on_error = false; + let server = Server::new(manager, config); // cannot wipe if server ID is not set assert_eq!( server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap_err() .to_string(), - "cannot get id: unable to use server until id is set" + "id not set" ); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); @@ -2069,31 +2249,29 @@ mod tests { // 1. 
cannot wipe if DB exists assert_eq!( server - .wipe_preserved_catalog(db_name_existing.clone()) + .wipe_preserved_catalog(&db_name_existing) .unwrap_err() .to_string(), "database already exists: db_existing" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_existing.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str()) + .await + .unwrap() + ); // 2. wiping a non-existing DB just works, but won't bring DB into existence assert!(server.error_database(&db_name_non_existing).is_none()); PreservedCatalog::new_empty::( Arc::clone(&server.store), - server.require_id().unwrap(), + server_id, db_name_non_existing.to_string(), (), ) .await .unwrap(); let tracker = server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2103,7 +2281,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_non_existing.to_string() ) .await @@ -2114,7 +2292,7 @@ mod tests { // 3. wipe DB with broken rules file, this won't bring DB back to life assert!(server.error_database(&db_name_rules_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_rules_broken.clone()) + .wipe_preserved_catalog(&db_name_rules_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2124,7 +2302,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_rules_broken.to_string() ) .await @@ -2135,7 +2313,7 @@ mod tests { // 4. wipe DB with broken catalog, this will bring the DB back to life assert!(server.error_database(&db_name_catalog_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_catalog_broken.clone()) + .wipe_preserved_catalog(&db_name_catalog_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2145,7 +2323,7 @@ mod tests { tracker.join().await; assert!(PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_catalog_broken.to_string() ) .await @@ -2166,18 +2344,16 @@ mod tests { .unwrap(); assert_eq!( server - .wipe_preserved_catalog(db_name_created.clone()) + .wipe_preserved_catalog(&db_name_created) .unwrap_err() .to_string(), "database already exists: db_created" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_created.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string()) + .await + .unwrap() + ); } #[tokio::test] diff --git a/src/commands/database.rs b/src/commands/database.rs index d7d4ddf81d..3ff2c2bbf9 100644 --- a/src/commands/database.rs +++ b/src/commands/database.rs @@ -119,6 +119,11 @@ struct Create { /// Maximum number of rows to buffer in a MUB chunk before compacting it #[structopt(long, default_value = "100000")] mub_row_threshold: u64, + + /// Use up to this amount of space in bytes for caching Parquet files. A + /// value of zero disables Parquet file caching. 
+ #[structopt(long, default_value = "0")] + parquet_cache_limit: u64, } /// Get list of databases @@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> { persist_row_threshold: command.persist_row_threshold, persist_age_threshold_seconds: command.persist_age_threshold_seconds, mub_row_threshold: command.mub_row_threshold, + parquet_cache_limit: command.parquet_cache_limit, }), // Default to hourly partitions diff --git a/src/commands/run.rs b/src/commands/run.rs index cab9cadb4d..878d98af97 100644 --- a/src/commands/run.rs +++ b/src/commands/run.rs @@ -231,6 +231,14 @@ Possible values (case insensitive): default_value = "serving" )] pub initial_serving_state: ServingReadinessState, + + /// Maximum size of HTTP requests. + #[structopt( + long = "--max-http-request-size", + env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE", + default_value = "10485760" // 10 MiB + )] + pub max_http_request_size: usize, } pub async fn command(config: Config) -> Result<()> { diff --git a/src/influxdb_ioxd.rs b/src/influxdb_ioxd.rs index 696ad257cb..c871bc30c5 100644 --- a/src/influxdb_ioxd.rs +++ b/src/influxdb_ioxd.rs @@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> { let bind_addr = config.http_bind_address; let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?; - let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse(); + let max_http_request_size = config.max_http_request_size; + + let http_server = http::serve( + addr, + Arc::clone(&app_server), + frontend_shutdown.clone(), + max_http_request_size, + ) + .fuse(); info!(bind_address=?bind_addr, "HTTP server listening"); info!(git_hash, "InfluxDB IOx server ready"); diff --git a/src/influxdb_ioxd/http.rs b/src/influxdb_ioxd/http.rs index f8dab0bd8c..818ba09e74 100644 --- a/src/influxdb_ioxd/http.rs +++ b/src/influxdb_ioxd/http.rs @@ -342,12 +342,26 @@ impl ApplicationError { } } -const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB - -fn router(server: Arc>) -> Router +struct Server where M: ConnectionManager + Send + Sync + Debug + 'static, { + app_server: Arc>, + max_request_size: usize, +} + +fn router( + app_server: Arc>, + max_request_size: usize, +) -> Router +where + M: ConnectionManager + Send + Sync + Debug + 'static, +{ + let server = Server { + app_server, + max_request_size, + }; + // Create a router and specify the the handlers. Router::builder() .data(server) @@ -408,7 +422,7 @@ struct WriteInfo { /// Parse the request's body into raw bytes, applying size limits and /// content encoding as needed. -async fn parse_body(req: hyper::Request) -> Result { +async fn parse_body(req: hyper::Request, max_size: usize) -> Result { // clippy says the const needs to be assigned to a local variable: // error: a `const` item with interior mutability should not be borrowed let header_name = CONTENT_ENCODING; @@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request) -> Result MAX_SIZE { + if (body.len() + chunk.len()) > max_size { return Err(ApplicationError::RequestSizeExceeded { - max_body_size: MAX_SIZE, + max_body_size: max_size, }); } body.extend_from_slice(&chunk); @@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request) -> Result>>().expect("server state")); + let Server { + app_server: server, + max_request_size, + } = req.data::>().expect("server state"); + let max_request_size = *max_request_size; + let server = Arc::clone(&server); // TODO(edd): figure out best way of catching all errors in this observation. 
let obs = server.metrics.http_requests.observation(); // instrument request @@ -481,7 +500,7 @@ where let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket) .context(BucketMappingError)?; - let body = parse_body(req).await?; + let body = parse_body(req, max_request_size).await?; let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?; @@ -595,7 +614,7 @@ async fn query( req: Request, ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO(edd): figure out best way of catching all errors in this observation. let obs = server.metrics.http_requests.observation(); // instrument request @@ -661,7 +680,7 @@ async fn query( async fn health( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -677,7 +696,7 @@ async fn health( async fn handle_metrics( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -700,7 +719,7 @@ async fn list_partitions( ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO - catch error conditions let obs = server.metrics.http_requests.observation(); @@ -841,11 +860,12 @@ pub async fn serve( addr: AddrIncoming, server: Arc>, shutdown: CancellationToken, + max_request_size: usize, ) -> Result<(), hyper::Error> where M: ConnectionManager + Send + Sync + Debug + 'static, { - let router = router(server); + let router = router(server, max_request_size); let service = RouterService::new(router).unwrap(); hyper::Server::builder(addr) @@ -1234,6 +1254,8 @@ mod tests { .await; } + const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024; + #[tokio::test] async fn client_hangup_during_parse() { #[derive(Debug, Snafu)] @@ -1253,7 +1275,9 @@ mod tests { .body(body) .unwrap(); - let parse_result = parse_body(request).await.unwrap_err(); + let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE) + .await + .unwrap_err(); assert_eq!( parse_result.to_string(), "Client hung up while sending body: error reading a body from connection: Blarg Error" @@ -1334,7 +1358,12 @@ mod tests { let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server"); let server_url = format!("http://{}", addr.local_addr()); - tokio::task::spawn(serve(addr, server, CancellationToken::new())); + tokio::task::spawn(serve( + addr, + server, + CancellationToken::new(), + TEST_MAX_REQUEST_SIZE, + )); println!("Started server at {}", server_url); server_url } diff --git a/src/influxdb_ioxd/rpc/error.rs b/src/influxdb_ioxd/rpc/error.rs index 4b3b95f314..a6ab258497 100644 --- a/src/influxdb_ioxd/rpc/error.rs +++ b/src/influxdb_ioxd/rpc/error.rs @@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status { use server::Error; match error { - Error::GetIdError { .. 
} => PreconditionViolation { + Error::IdNotSet => PreconditionViolation { category: "Writer ID".to_string(), subject: "influxdata.com/iox".to_string(), description: "Writer ID must be set".to_string(), diff --git a/src/influxdb_ioxd/rpc/management.rs b/src/influxdb_ioxd/rpc/management.rs index 5f81db5e46..2b26040d04 100644 --- a/src/influxdb_ioxd/rpc/management.rs +++ b/src/influxdb_ioxd/rpc/management.rs @@ -56,7 +56,7 @@ where &self, _: Request, ) -> Result, Status> { - match self.server.require_id().ok() { + match self.server.server_id() { Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })), None => return Err(NotFound::default().into()), } @@ -71,7 +71,7 @@ where match self.server.set_id(id) { Ok(_) => Ok(Response::new(UpdateServerIdResponse {})), - Err(e @ Error::SetIdError { .. }) => { + Err(e @ Error::IdAlreadySet) => { return Err(FieldViolation { field: "id".to_string(), description: e.to_string(), @@ -199,15 +199,18 @@ where &self, _: Request, ) -> Result, Status> { - let remotes = self - .server - .remotes_sorted() - .into_iter() - .map(|(id, connection_string)| Remote { - id: id.get_u32(), - connection_string, - }) - .collect(); + let result = self.server.remotes_sorted(); + let remotes = match result { + Ok(remotes) => remotes + .into_iter() + .map(|(id, connection_string)| Remote { + id: id.get_u32(), + connection_string, + }) + .collect(), + Err(e) => return Err(default_server_error_handler(e)), + }; + Ok(Response::new(ListRemotesResponse { remotes })) } @@ -221,8 +224,16 @@ where .ok_or_else(|| FieldViolation::required("remote"))?; let remote_id = ServerId::try_from(remote.id) .map_err(|_| FieldViolation::required("id").scope("remote"))?; - self.server + + let result = self + .server .update_remote(remote_id, remote.connection_string); + + match result { + Ok(_) => {} + Err(e) => return Err(default_server_error_handler(e)), + } + Ok(Response::new(UpdateRemoteResponse {})) } @@ -233,9 +244,12 @@ where let request = request.into_inner(); let remote_id = ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?; - self.server - .delete_remote(remote_id) - .ok_or_else(NotFound::default)?; + + match self.server.delete_remote(remote_id) { + Ok(Some(_)) => {} + Ok(None) => return Err(NotFound::default().into()), + Err(e) => return Err(default_server_error_handler(e)), + } Ok(Response::new(DeleteRemoteResponse {})) } @@ -455,7 +469,7 @@ where let tracker = self .server - .wipe_preserved_catalog(db_name) + .wipe_preserved_catalog(&db_name) .map_err(|e| match e { Error::DatabaseAlreadyExists { db_name } => AlreadyExists { resource_type: "database".to_string(), diff --git a/tests/end_to_end_cases/management_api.rs b/tests/end_to_end_cases/management_api.rs index 2f26969085..d543e7fb68 100644 --- a/tests/end_to_end_cases/management_api.rs +++ b/tests/end_to_end_cases/management_api.rs @@ -65,6 +65,8 @@ async fn test_list_update_remotes() { const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321"; const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321"; + client.update_server_id(123).await.unwrap(); + let res = client.list_remotes().await.expect("list remotes failed"); assert_eq!(res.len(), 0); diff --git a/tests/end_to_end_cases/management_cli.rs b/tests/end_to_end_cases/management_cli.rs index a09285c695..3e0be27290 100644 --- a/tests/end_to_end_cases/management_cli.rs +++ b/tests/end_to_end_cases/management_cli.rs @@ -244,6 +244,18 @@ async fn test_list_chunks_error() { async fn test_remotes() { let server_fixture = 
ServerFixture::create_single_use().await; let addr = server_fixture.grpc_base(); + + Command::cargo_bin("influxdb_iox") + .unwrap() + .arg("server") + .arg("set") + .arg("32") + .arg("--host") + .arg(addr) + .assert() + .success() + .stdout(predicate::str::contains("Ok")); + Command::cargo_bin("influxdb_iox") .unwrap() .arg("server") diff --git a/tests/end_to_end_cases/persistence.rs b/tests/end_to_end_cases/persistence.rs index 634c506856..af4dccfa61 100644 --- a/tests/end_to_end_cases/persistence.rs +++ b/tests/end_to_end_cases/persistence.rs @@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() { assert_eq!(chunks[0].row_count, 1_000); } +async fn write_data( + write_client: &mut influxdb_iox_client::write::Client, + db_name: &str, + num_payloads: u64, + num_duplicates: u64, + payload_size: u64, +) { + let payloads: Vec<_> = (0..num_payloads) + .map(|x| { + (0..payload_size) + .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) + .join("\n") + }) + .collect(); + + for payload in &payloads { + // Writing the same data multiple times should be compacted away + for _ in 0..=num_duplicates { + let num_lines_written = write_client + .write(db_name, payload) + .await + .expect("successful write"); + assert_eq!(num_lines_written, payload_size as usize); + } + } +} + #[tokio::test] async fn test_full_lifecycle() { let fixture = ServerFixture::create_shared().await; let mut write_client = fixture.write_client(); let num_payloads = 10; - let num_duplicates = 2; + let num_duplicates = 1; let payload_size = 1_000; - let total_rows = num_payloads * num_duplicates * payload_size; + let total_rows = num_payloads * (1 + num_duplicates) * payload_size; let db_name = rand_name(); DatabaseBuilder::new(db_name.clone()) @@ -73,24 +100,14 @@ async fn test_full_lifecycle() { .build(fixture.grpc_channel()) .await; - let payloads: Vec<_> = (0..num_payloads) - .map(|x| { - (0..payload_size) - .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) - .join("\n") - }) - .collect(); - - for payload in &payloads { - // Writing the same data multiple times should be compacted away - for _ in 0..num_duplicates { - let num_lines_written = write_client - .write(&db_name, payload) - .await - .expect("successful write"); - assert_eq!(num_lines_written, payload_size as usize); - } - } + write_data( + &mut write_client, + &db_name, + num_payloads, + num_duplicates, + payload_size, + ) + .await; wait_for_exact_chunk_states( &fixture, @@ -123,6 +140,58 @@ async fn test_full_lifecycle() { assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize) } +#[tokio::test] +async fn test_update_late_arrival() { + let fixture = ServerFixture::create_shared().await; + let mut write_client = fixture.write_client(); + + let payload_size = 100; + + let db_name = rand_name(); + DatabaseBuilder::new(db_name.clone()) + .persist(true) + // Don't close MUB automatically + .mub_row_threshold(payload_size * 2) + .persist_row_threshold(payload_size) + .persist_age_threshold_seconds(1000) + // Initially set to be a large value + .late_arrive_window_seconds(1000) + .build(fixture.grpc_channel()) + .await; + + write_data(&mut write_client, &db_name, 1, 0, payload_size).await; + + let mut management = fixture.management_client(); + + let chunks = management.list_chunks(&db_name).await.unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!( + chunks[0].storage, + influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32 + ); + + let mut rules = 
management.get_database(&db_name).await.unwrap(); + rules + .lifecycle_rules + .as_mut() + .unwrap() + .late_arrive_window_seconds = 1; + + fixture + .management_client() + .update_database(rules) + .await + .unwrap(); + + wait_for_exact_chunk_states( + &fixture, + &db_name, + vec![ChunkStorage::ReadBufferAndObjectStore], + std::time::Duration::from_secs(5), + ) + .await; +} + #[tokio::test] async fn test_query_chunk_after_restart() { // fixtures diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index 6aacefd530..d6c69e6341 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -13,7 +13,7 @@ use crate::{ #[derive(Debug)] pub enum WriteBufferConfig { Writing(Arc), - Reading(Arc), + Reading(Arc>>), } impl WriteBufferConfig { @@ -36,7 +36,9 @@ impl WriteBufferConfig { Some(WriteBufferConnection::Reading(conn)) => { let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?; - Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _))) + Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new( + Box::new(kafka_buffer) as _, + ))))) } None => Ok(None), } diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index f747fde37b..f604b80862 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -1,6 +1,8 @@ +use std::fmt::Debug; + use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::stream::BoxStream; +use futures::{future::BoxFuture, stream::BoxStream}; /// Generic boxed error type that is used in this crate. /// @@ -10,7 +12,7 @@ pub type WriteBufferError = Box; /// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading /// entries from the Write Buffer at a later time. #[async_trait] -pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { +pub trait WriteBufferWriting: Sync + Send + Debug + 'static { /// Send an `Entry` to the write buffer using the specified sequencer ID. /// /// Returns information that can be used to restore entries at a later time. @@ -21,16 +23,47 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { ) -> Result; } +pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result>; +pub type FetchHighWatermark<'a> = Box FetchHighWatermarkFut<'a>) + Send + Sync>; + /// Output stream of [`WriteBufferReading`]. -pub type EntryStream<'a> = BoxStream<'a, Result>; +pub struct EntryStream<'a> { + /// Stream that produces entries. + pub stream: BoxStream<'a, Result>, + + /// Get high watermark (= what we believe is the next sequence number to be added). + /// + /// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts + /// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1. + pub fetch_high_watermark: FetchHighWatermark<'a>, +} + +impl<'a> Debug for EntryStream<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("EntryStream").finish_non_exhaustive() + } +} /// Produce streams (one per sequencer) of [`SequencedEntry`]s. -pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static { +#[async_trait] +pub trait WriteBufferReading: Sync + Send + Debug + 'static { /// Returns a stream per sequencer. 
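The lag remark in the `EntryStream` docs above implies a simple calculation. The helper below is not part of this PR (the name and the test are mine); it only spells out, under the documented convention that the watermark is "the next sequence number to be added", how a caller could turn a watermark plus the last consumed sequence number into a lag estimate:

```rust
// Hypothetical helper, not from this diff: estimate how many entries are still
// outstanding for one sequencer, given the watermark semantics documented above
// (empty buffer => watermark 0; after entry 0 is written => watermark 1).
fn estimate_lag(high_watermark: u64, last_seen_sequence_number: Option<u64>) -> u64 {
    match last_seen_sequence_number {
        // Consumed up to `n`, so everything in `n+1..high_watermark` is outstanding.
        Some(n) => high_watermark.saturating_sub(n + 1),
        // Nothing consumed yet: every produced entry is outstanding.
        None => high_watermark,
    }
}

#[test]
fn lag_examples() {
    assert_eq!(estimate_lag(0, None), 0); // empty buffer, nothing consumed
    assert_eq!(estimate_lag(3, None), 3); // entries 0..=2 written, none consumed
    assert_eq!(estimate_lag(3, Some(1)), 1); // entry 2 still outstanding
    assert_eq!(estimate_lag(3, Some(2)), 0); // fully caught up
}
```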
- fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait; + /// + /// Note that due to the mutable borrow, it is not possible to have multiple streams from the same + /// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last + /// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either + /// create a new [`WriteBufferReading`] or use [`seek`](Self::seek). + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>; + + /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least + /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). + /// + /// Note that due to the mutable borrow, it is not possible to seek while streams exists. + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError>; } pub mod test_utils { @@ -65,6 +98,8 @@ pub mod test_utils { test_multi_stream_io(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; + test_seek(&adapter).await; + test_watermark(&adapter).await; } async fn test_single_stream_io(adapter: &T) @@ -78,7 +113,7 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; + let mut reader = context.reading().await; let mut streams = reader.streams(); assert_eq!(streams.len(), 1); @@ -88,67 +123,32 @@ pub mod test_utils { let mut cx = futures::task::Context::from_waker(&waker); // empty stream is pending - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding content allows us to get results writer.store_entry(&entry_1, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding more data unblocks the stream writer.store_entry(&entry_2, sequencer_id).await.unwrap(); writer.store_entry(&entry_3, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); - } - - async fn test_multi_sequencer_io(adapter: &T) - where - T: TestAdapter, - { - let context = adapter.new_context(2).await; - - let entry_1 = lp_to_entry("upc user=1 100"); - let entry_2 = lp_to_entry("upc user=2 200"); - let entry_3 = lp_to_entry("upc user=3 300"); - - let writer = context.writing(); - let reader = context.reading().await; - - let mut streams = reader.streams(); - assert_eq!(streams.len(), 2); - let (sequencer_id_1, mut stream_1) = streams.pop().unwrap(); - let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); - assert_ne!(sequencer_id_1, sequencer_id_2); - - let waker = futures::task::noop_waker(); - let mut cx = futures::task::Context::from_waker(&waker); - - // empty streams are pending - 
assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - // entries arrive at the right target stream - writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - writer.store_entry(&entry_2, sequencer_id_2).await.unwrap(); - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); - - writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); - - // streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_stream_io(adapter: &T) @@ -162,34 +162,104 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; - - let mut streams_1 = reader.streams(); - let mut streams_2 = reader.streams(); - assert_eq!(streams_1.len(), 1); - assert_eq!(streams_2.len(), 1); - let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap(); - let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap(); - assert_eq!(sequencer_id_1, sequencer_id_2); + let mut reader = context.reading().await; let waker = futures::task::noop_waker(); let mut cx = futures::task::Context::from_waker(&waker); - // empty streams is pending - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + writer.store_entry(&entry_1, 0).await.unwrap(); + writer.store_entry(&entry_2, 0).await.unwrap(); + writer.store_entry(&entry_3, 0).await.unwrap(); - // streams poll from same source + // creating stream, drop stream, re-create it => still starts at first entry + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, stream) = streams.pop().unwrap(); + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + + // re-creating stream after reading remembers offset + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); + + // re-creating stream after reading everything makes it pending + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + } + + async fn test_multi_sequencer_io(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let entry_1 = lp_to_entry("upc user=1 100"); + let entry_2 = lp_to_entry("upc user=2 200"); + let entry_3 = lp_to_entry("upc user=3 300"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let mut streams = reader.streams(); + assert_eq!(streams.len(), 2); + let 
(sequencer_id_1, mut stream_1) = streams.pop().unwrap(); + let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); + assert_ne!(sequencer_id_1, sequencer_id_2); + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + // empty streams are pending + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + + // entries arrive at the right target stream writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_2, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); - // both streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + writer.store_entry(&entry_2, sequencer_id_2).await.unwrap(); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_2.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + + writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); + + // streams are pending again + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_writer_multi_reader(adapter: &T) @@ -204,8 +274,8 @@ pub mod test_utils { let writer_1 = context.writing(); let writer_2 = context.writing(); - let reader_1 = context.reading().await; - let reader_2 = context.reading().await; + let mut reader_1 = context.reading().await; + let mut reader_2 = context.reading().await; // TODO: do not hard-code sequencer IDs here but provide a proper interface writer_1.store_entry(&entry_east_1, 0).await.unwrap(); @@ -213,18 +283,119 @@ pub mod test_utils { writer_2.store_entry(&entry_east_2, 0).await.unwrap(); assert_reader_content( - reader_1, + &mut reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; assert_reader_content( - reader_2, + &mut reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; } - async fn assert_reader_content(reader: R, expected: &[(u32, &[&Entry])]) + async fn test_seek(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_east_3 = lp_to_entry("upc,region=east user=3 300"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number; + let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number; + let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number; + + let mut reader_1 = context.reading().await; + let mut reader_2 = 
context.reading().await; + + // forward seek + reader_1.seek(0, sequence_number_east_2).await.unwrap(); + assert_reader_content( + &mut reader_1, + &[(0, &[&entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + assert_reader_content( + &mut reader_2, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + + // backward seek + reader_1.seek(0, 0).await.unwrap(); + assert_reader_content( + &mut reader_1, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])], + ) + .await; + + // seek to far end and then at data + reader_1.seek(0, 1_000_000).await.unwrap(); + let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; + let mut streams = reader_1.streams(); + assert_eq!(streams.len(), 2); + let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); + let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + drop(stream_1); + drop(stream_2); + drop(streams); + + // seeking unknown sequencer is NOT an error + reader_1.seek(0, 42).await.unwrap(); + } + + async fn test_watermark(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let mut streams = reader.streams(); + assert_eq!(streams.len(), 2); + let (sequencer_id_1, stream_1) = streams.pop().unwrap(); + let (sequencer_id_2, stream_2) = streams.pop().unwrap(); + + // start at watermark 0 + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0); + + // high water mark moves + writer + .store_entry(&entry_east_1, sequencer_id_1) + .await + .unwrap(); + let mark_1 = writer + .store_entry(&entry_east_2, sequencer_id_1) + .await + .unwrap() + .number; + let mark_2 = writer + .store_entry(&entry_west_1, sequencer_id_2) + .await + .unwrap() + .number; + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1); + } + + async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, { @@ -239,6 +410,7 @@ pub mod test_utils { // we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever let mut results: Vec<_> = actual_stream + .stream .take(expected_entries.len()) .try_collect() .await diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 15a27a401c..9f971ef9ac 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -1,22 +1,28 @@ use std::{ + collections::BTreeMap, convert::{TryFrom, TryInto}, + sync::Arc, time::Duration, }; use async_trait::async_trait; use data_types::server_id::ServerId; use entry::{Entry, Sequence, SequencedEntry}; -use futures::StreamExt; +use futures::{FutureExt, StreamExt}; use observability_deps::tracing::{debug, info}; use rdkafka::{ consumer::{BaseConsumer, Consumer, StreamConsumer}, error::KafkaError, producer::{FutureProducer, FutureRecord}, + types::RDKafkaErrorCode, util::Timeout, - ClientConfig, Message, TopicPartitionList, + ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::core::{EntryStream, WriteBufferError, 
WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; pub struct KafkaBufferProducer { conn: String, @@ -77,8 +83,8 @@ impl KafkaBufferProducer { let mut cfg = ClientConfig::new(); cfg.set("bootstrap.servers", &conn); cfg.set("message.timeout.ms", "5000"); - cfg.set("message.max.bytes", "10000000"); - cfg.set("queue.buffering.max.kbytes", "10485760"); + cfg.set("message.max.bytes", "31457280"); + cfg.set("queue.buffering.max.kbytes", "31457280"); cfg.set("request.required.acks", "all"); // equivalent to acks=-1 let producer: FutureProducer = cfg.create()?; @@ -94,7 +100,7 @@ impl KafkaBufferProducer { pub struct KafkaBufferConsumer { conn: String, database_name: String, - consumers: Vec<(u32, StreamConsumer)>, + consumers: BTreeMap>, } // Needed because rdkafka's StreamConsumer doesn't impl Debug @@ -107,31 +113,94 @@ impl std::fmt::Debug for KafkaBufferConsumer { } } +#[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { - self.consumers - .iter() - .map(|(sequencer_id, consumer)| { - let stream = consumer - .stream() - .map(|message| { - let message = message?; - let entry = Entry::try_from(message.payload().unwrap().to_vec())?; - let sequence = Sequence { - id: message.partition().try_into()?, - number: message.offset().try_into()?, - }; + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { + let mut streams = vec![]; - Ok(SequencedEntry::new_from_sequence(sequence, entry)?) + for (sequencer_id, consumer) in &self.consumers { + let sequencer_id = *sequencer_id; + let consumer_cloned = Arc::clone(consumer); + let database_name = self.database_name.clone(); + + let stream = consumer + .stream() + .map(move |message| { + let message = message?; + let entry = Entry::try_from(message.payload().unwrap().to_vec())?; + let sequence = Sequence { + id: message.partition().try_into()?, + number: message.offset().try_into()?, + }; + + Ok(SequencedEntry::new_from_sequence(sequence, entry)?) 
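To show how the per-partition streams built in the Kafka `streams()` implementation are meant to be consumed, here is a hedged caller-side sketch written only against the `WriteBufferReading` trait as changed in this PR. The function name, the `write_buffer::core` import path, and the assumption that the reader starts at the beginning of every sequencer are mine, not code from the diff; the `sequence()` usage mirrors the test and mock code elsewhere in this change.

```rust
use futures::StreamExt;
use write_buffer::core::{WriteBufferError, WriteBufferReading};

/// Read everything currently in the buffer, up to the high watermark observed
/// at the start of the call, and return how many entries were seen.
///
/// Assumes a freshly created reader positioned at the start of each sequencer;
/// a reader with remembered offsets would need the current position as well.
async fn drain_to_watermark<R>(reader: &mut R) -> Result<usize, WriteBufferError>
where
    R: WriteBufferReading,
{
    let mut n_entries = 0;

    // One `EntryStream` per sequencer (one per Kafka partition in the Kafka impl).
    for (sequencer_id, mut entry_stream) in reader.streams() {
        // "Next sequence number to be added" for this sequencer.
        let watermark = (entry_stream.fetch_high_watermark)().await?;
        println!("sequencer {}: high watermark {}", sequencer_id, watermark);

        let mut next_expected = 0;
        while next_expected < watermark {
            match entry_stream.stream.next().await {
                Some(Ok(sequenced_entry)) => {
                    if let Some(sequence) = sequenced_entry.sequence() {
                        next_expected = sequence.number + 1;
                    }
                    n_entries += 1;
                }
                Some(Err(e)) => return Err(e),
                None => break,
            }
        }
    }

    Ok(n_entries)
}
```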
+ }) + .boxed(); + + let fetch_high_watermark = move || { + let consumer_cloned = Arc::clone(&consumer_cloned); + let database_name = database_name.clone(); + + let fut = async move { + match tokio::task::spawn_blocking(move || { + consumer_cloned.fetch_watermarks( + &database_name, + sequencer_id as i32, + Duration::from_secs(60), + ) }) - .boxed(); - (*sequencer_id, stream) + .await + .expect("subtask failed") + { + Ok((_low, high)) => Ok(high as u64), + Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0), + Err(e) => Err(Box::new(e) as Box), + } + }; + + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); + } + + streams + } + + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { + if let Some(consumer) = self.consumers.get(&sequencer_id) { + let consumer = Arc::clone(consumer); + let database_name = self.database_name.clone(); + let offset = if sequence_number > 0 { + Offset::Offset(sequence_number as i64) + } else { + Offset::Beginning + }; + + tokio::task::spawn_blocking(move || { + consumer.seek( + &database_name, + sequencer_id as i32, + offset, + Duration::from_secs(60), + ) }) - .collect() + .await + .expect("subtask failed")?; + } + + Ok(()) } } @@ -169,11 +238,21 @@ impl KafkaBufferConsumer { let mut assignment = TopicPartitionList::new(); assignment.add_partition(&database_name, partition as i32); - consumer.assign(&assignment)?; - Ok((partition, consumer)) + // We must set the offset to `Beginning` here to avoid the following error during seek: + // KafkaError (Seek error: Local: Erroneous state) + // + // Also see: + // - https://github.com/Blizzard/node-rdkafka/issues/237 + // - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376 + assignment + .set_partition_offset(&database_name, partition as i32, Offset::Beginning) + .expect("partition was set just before"); + + consumer.assign(&assignment)?; + Ok((partition, Arc::new(consumer))) }) - .collect::, KafkaError>>()?; + .collect::>, KafkaError>>()?; Ok(Self { conn, diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index fc15ca2534..a67000633d 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll}; use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::{stream, StreamExt}; +use futures::{stream, FutureExt, StreamExt}; use parking_lot::Mutex; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; type EntryResVec = Vec>; @@ -153,21 +156,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors { } } +/// Sequencer-specific playback state +struct PlaybackState { + /// Index within the entry vector. + vector_index: usize, + + /// Offset within the sequencer IDs. 
+ offset: u64, +} + pub struct MockBufferForReading { - state: MockBufferSharedState, - positions: Arc>>, + shared_state: MockBufferSharedState, + playback_states: Arc>>, } impl MockBufferForReading { pub fn new(state: MockBufferSharedState) -> Self { let n_sequencers = state.entries.lock().len() as u32; - let positions: BTreeMap<_, _> = (0..n_sequencers) - .map(|sequencer_id| (sequencer_id, 0)) + let playback_states: BTreeMap<_, _> = (0..n_sequencers) + .map(|sequencer_id| { + ( + sequencer_id, + PlaybackState { + vector_index: 0, + offset: 0, + }, + ) + }) .collect(); Self { - state, - positions: Arc::new(Mutex::new(positions)), + shared_state: state, + playback_states: Arc::new(Mutex::new(playback_states)), } } } @@ -178,46 +198,110 @@ impl std::fmt::Debug for MockBufferForReading { } } +#[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { let sequencer_ids: Vec<_> = { - let positions = self.positions.lock(); - positions.keys().copied().collect() + let playback_states = self.playback_states.lock(); + playback_states.keys().copied().collect() }; let mut streams = vec![]; for sequencer_id in sequencer_ids { - let state = self.state.clone(); - let positions = Arc::clone(&self.positions); + let shared_state = self.shared_state.clone(); + let playback_states = Arc::clone(&self.playback_states); let stream = stream::poll_fn(move |_ctx| { - let entries = state.entries.lock(); - let mut positions = positions.lock(); + let entries = shared_state.entries.lock(); + let mut playback_states = playback_states.lock(); let entry_vec = entries.get(&sequencer_id).unwrap(); - let position = positions.get_mut(&sequencer_id).unwrap(); + let playback_state = playback_states.get_mut(&sequencer_id).unwrap(); - if entry_vec.len() > *position { - let entry = match &entry_vec[*position] { - Ok(entry) => Ok(entry.clone()), - Err(e) => Err(e.to_string().into()), - }; - *position += 1; - return Poll::Ready(Some(entry)); + while entry_vec.len() > playback_state.vector_index { + let entry_result = &entry_vec[playback_state.vector_index]; + + // consume entry + playback_state.vector_index += 1; + + match entry_result { + Ok(entry) => { + // found an entry => need to check if it is within the offset + let sequence = entry.sequence().unwrap(); + if sequence.number >= playback_state.offset { + // within offset => return entry to caller + return Poll::Ready(Some(Ok(entry.clone()))); + } else { + // offset is larger then the current entry => ignore entry and try next + continue; + } + } + Err(e) => { + // found an error => return entry to caller + return Poll::Ready(Some(Err(e.to_string().into()))); + } + } } + // we are at the end of the recorded entries => report pending Poll::Pending }) .boxed(); - streams.push((sequencer_id, stream)); + + let shared_state = self.shared_state.clone(); + + let fetch_high_watermark = move || { + let shared_state = shared_state.clone(); + + let fut = async move { + let entries = shared_state.entries.lock(); + let entry_vec = entries.get(&sequencer_id).unwrap(); + let watermark = entry_vec + .iter() + .filter_map(|entry_res| { + entry_res + .as_ref() + .ok() + .map(|entry| entry.sequence().unwrap().number) + }) + .max() + .map(|n| n + 1) + .unwrap_or(0); + + Ok(watermark) + }; + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = 
Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); } streams } + + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { + let mut playback_states = self.playback_states.lock(); + + if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { + playback_state.offset = sequence_number; + + // reset position to start since seeking might go backwards + playback_state.vector_index = 0; + } + + Ok(()) + } } #[cfg(test)]
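As a usage note for the new `seek` method (mirrored by the mock's offset/vector-index reset above and by the Kafka `Offset::Beginning` handling), here is a minimal sketch assuming only the trait introduced in this PR; the helper name and the `write_buffer::core` import path are mine, not code from the diff.

```rust
use write_buffer::core::{WriteBufferError, WriteBufferReading};

/// Rewind every sequencer to the beginning so the full history is replayed.
/// For the mock this clears the offset filter and resets the vector index;
/// for Kafka, sequence number 0 translates to `Offset::Beginning`.
async fn rewind_all<R>(reader: &mut R) -> Result<(), WriteBufferError>
where
    R: WriteBufferReading,
{
    // Collect the sequencer IDs first: `streams` borrows the reader mutably,
    // and `seek` needs that mutable borrow back once the streams are dropped.
    let sequencer_ids: Vec<u32> = reader
        .streams()
        .into_iter()
        .map(|(sequencer_id, _stream)| sequencer_id)
        .collect();

    for sequencer_id in sequencer_ids {
        // Sequence numbers start at 0, so this replays everything.
        reader.seek(sequencer_id, 0).await?;
    }

    Ok(())
}
```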